]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[REFACT] SocialList clean again
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Core.Text.List
15 where
16
17
18 import Control.Lens ((^.), set)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
21 import Data.Map (Map)
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
28
29 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
30 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
31 import Gargantext.API.Ngrams.Types (RepoCmdM)
32 import Gargantext.Core.Text.List.Social (flowSocialList, invertForw)
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
34 import Gargantext.Core.Text.Group
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45
46
47 -- | TODO improve grouping functions of Authors, Sources, Institutes..
48 buildNgramsLists :: ( RepoCmdM env err m
49 , CmdM env err m
50 , HasTreeError err
51 , HasNodeError err
52 )
53 => User
54 -> GroupParams
55 -> UserCorpusId
56 -> MasterCorpusId
57 -> m (Map NgramsType [NgramsElement])
58 buildNgramsLists user gp uCid mCid = do
59 ngTerms <- buildNgramsTermsList user uCid mCid gp
60 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
61 [ (Authors , MapListSize 9)
62 , (Sources , MapListSize 9)
63 , (Institutes, MapListSize 9)
64 ]
65
66 pure $ Map.unions $ [ngTerms] <> othersTerms
67
68
69 data MapListSize = MapListSize Int
70
71 buildNgramsOthersList ::( HasNodeError err
72 , CmdM env err m
73 , RepoCmdM env err m
74 , HasTreeError err
75 )
76 => User
77 -> UserCorpusId
78 -> (Text -> Text)
79 -> (NgramsType, MapListSize)
80 -> m (Map NgramsType [NgramsElement])
81 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
82
83 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
84
85
86 let
87 grouped = toGroupedText groupIt (Set.size . snd) fst snd
88 (Map.toList $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b)) $ ngs)
89
90 socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
91
92 let
93 groupedWithList = map (addListType (invertForw socialLists)) grouped
94 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
95 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
96
97 listSize = mapListSize - (List.length mapTerms)
98 (mapTerms', candiTerms) = List.splitAt listSize $ List.sortOn (Down . _gt_score) $ Map.elems tailTerms'
99
100 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
101 <> (List.concat $ map toNgramsElement mapTerms)
102 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just MapTerm)) mapTerms')
103 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just CandidateTerm)) candiTerms)
104 )]
105
106 -- TODO use ListIds
107 buildNgramsTermsList :: ( HasNodeError err
108 , CmdM env err m
109 , RepoCmdM env err m
110 , HasTreeError err
111 )
112 => User
113 -> UserCorpusId
114 -> MasterCorpusId
115 -> GroupParams
116 -> m (Map NgramsType [NgramsElement])
117 buildNgramsTermsList user uCid mCid groupParams = do
118
119 -- Computing global speGen score
120 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
121
122 -- printDebug "head candidates" (List.take 10 $ allTerms)
123 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
124
125 -- First remove stops terms
126 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
127 -- printDebug "\n * socialLists * \n" socialLists
128
129 printDebug "\n * socialLists * \n" socialLists
130
131 let
132 socialStop = fromMaybe Set.empty $ Map.lookup StopTerm socialLists
133 _socialMap = fromMaybe Set.empty $ Map.lookup MapTerm socialLists
134 _socialCand = fromMaybe Set.empty $ Map.lookup CandidateTerm socialLists
135 -- stopTerms ignored for now (need to be tagged already)
136 -- (stopTerms, candidateTerms) = List.partition ((\t -> Set.member t socialStop) . fst) allTerms
137 -- (mapTerms, candidateTerms) = List.partition ((\t -> Set.member t socialMap ) . fst) allTerms
138
139 -- printDebug "stopTerms" stopTerms
140
141 -- Grouping the ngrams and keeping the maximum score for label
142 let grouped = toGroupedText (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) allTerms
143
144 groupedWithList = map (addListType (invertForw socialLists)) grouped
145
146 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
147 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
148
149 -- printDebug "\n * stopTerms * \n" stopTerms
150 -- splitting monterms and multiterms to take proportional candidates
151 let
152 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
153 monoSize = 0.4 :: Double
154 multSize = 1 - monoSize
155
156 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
157
158 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
159 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
160
161 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
162 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
163 -- printDebug "groupedMultHead" (List.length groupedMultHead)
164 -- printDebug "groupedMultTail" (List.length groupedMultTail)
165
166 let
167 -- Get Local Scores now for selected grouped ngrams
168 selectedTerms = Set.toList $ List.foldl'
169 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
170 $ Set.insert l' g
171 )
172 Set.empty
173 (groupedMonoHead <> groupedMultHead)
174
175 -- TO remove (and remove HasNodeError instance)
176 userListId <- defaultList uCid
177 masterListId <- defaultList mCid
178
179 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
180
181 let
182 mapGroups = Map.fromList
183 $ map (\g -> (g ^. gt_stem, g))
184 $ groupedMonoHead <> groupedMultHead
185
186 -- grouping with Set NodeId
187 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup groupParams k
188 in case Map.lookup k' mapGroups' of
189 Nothing -> mapGroups'
190 Just g -> case Map.lookup k mapTextDocIds of
191 Nothing -> mapGroups'
192 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
193 )
194 mapGroups
195 $ Map.keys mapTextDocIds
196
197 -- compute cooccurrences
198 mapCooc = Map.filter (>2)
199 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
200 | (t1, s1) <- mapStemNodeIds
201 , (t2, s2) <- mapStemNodeIds
202 ]
203 where
204 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
205 -- printDebug "mapCooc" mapCooc
206
207 let
208 -- computing scores
209 mapScores f = Map.fromList
210 $ map (\(Scored t g s') -> (t, f (g,s')))
211 $ normalizeGlobal
212 $ map normalizeLocal
213 $ scored' mapCooc
214
215 groupsWithScores = catMaybes
216 $ map (\(stem, g)
217 -> case Map.lookup stem mapScores' of
218 Nothing -> Nothing
219 Just s' -> Just $ g { _gt_score = s'}
220 ) $ Map.toList contextsAdded
221 where
222 mapScores' = mapScores identity
223 -- adapt2 TOCHECK with DC
224 -- printDebug "groupsWithScores" groupsWithScores
225 let
226 -- sort / partition / split
227 -- filter mono/multi again
228 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
229 -- filter with max score
230 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
231
232 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
233 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
234
235 -- splitAt
236 let
237 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
238 inclSize = 0.4 :: Double
239 exclSize = 1 - inclSize
240 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
241
242 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
243 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
244
245 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
246 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
247
248
249 -- Final Step building the Typed list
250 termListHead = maps <> cands
251 where
252 maps = set gt_listType (Just MapTerm)
253 <$> monoScoredInclHead
254 <> monoScoredExclHead
255 <> multScoredInclHead
256 <> multScoredExclHead
257
258 cands = set gt_listType (Just CandidateTerm)
259 <$> monoScoredInclTail
260 <> monoScoredExclTail
261 <> multScoredInclTail
262 <> multScoredExclTail
263
264 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
265
266 -- printDebug "monoScoredInclHead" monoScoredInclHead
267 -- printDebug "monoScoredExclHead" monoScoredExclTail
268 --
269 -- printDebug "multScoredInclHead" multScoredInclHead
270 -- printDebug "multScoredExclTail" multScoredExclTail
271
272 let result = Map.unionsWith (<>)
273 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
274 <> (List.concat $ map toNgramsElement $ termListTail)
275 <> (List.concat $ map toNgramsElement $ stopTerms)
276 )]
277 ]
278 -- printDebug "\n result \n" r
279 pure result
280
281
282
283 toNgramsElement :: GroupedText a -> [NgramsElement]
284 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
285 [parentElem] <> childrenElems
286 where
287 parent = label
288 children = Set.toList setNgrams
289 parentElem = mkNgramsElement (NgramsTerm parent)
290 (fromMaybe CandidateTerm listType)
291 Nothing
292 (mSetFromList (NgramsTerm <$> children))
293 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
294 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
295 (mSetFromList [])
296 ) (NgramsTerm <$> children)
297
298
299 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
300 toGargList l n = (l,n)
301
302
303 isStopTerm :: StopSize -> Text -> Bool
304 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
305 where
306 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
307
308 ------------------------------------------------------------------------------