]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FEAT] SocialList Map Text ScoresParent to Map Text Children
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Core.Text.List
15 where
16
17
18 import Control.Lens ((^.), set)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import qualified Data.Char as Char
25 import qualified Data.List as List
26 import qualified Data.Map as Map
27 import qualified Data.Set as Set
28 import qualified Data.Text as Text
29
30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
31 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
32 import Gargantext.API.Ngrams.Types (RepoCmdM)
33 import Gargantext.Core.Text (size)
34 import Gargantext.Core.Text.List.Social (flowSocialList, flowSocialList', FlowSocialListPriority(..), invertForw)
35 import Gargantext.Core.Text.List.Social.Scores (FlowListScores)
36 import Gargantext.Core.Text.List.Group
37 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
38 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
39 import Gargantext.Core.Types.Individu (User(..))
40 import Gargantext.Database.Admin.Types.Node (NodeId)
41 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
42 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
43 import Gargantext.Database.Prelude (CmdM)
44 import Gargantext.Database.Query.Table.Node (defaultList)
45 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
46 import Gargantext.Database.Query.Tree.Error (HasTreeError)
47 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
48 import Gargantext.Prelude
49
50
51 -- | TODO improve grouping functions of Authors, Sources, Institutes..
52 buildNgramsLists :: ( RepoCmdM env err m
53 , CmdM env err m
54 , HasTreeError err
55 , HasNodeError err
56 )
57 => User
58 -> GroupParams
59 -> UserCorpusId
60 -> MasterCorpusId
61 -> m (Map NgramsType [NgramsElement])
62 buildNgramsLists user gp uCid mCid = do
63 ngTerms <- buildNgramsTermsList user uCid mCid gp
64 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
65 [ (Authors , MapListSize 9)
66 , (Sources , MapListSize 9)
67 , (Institutes, MapListSize 9)
68 ]
69
70 pure $ Map.unions $ [ngTerms] <> othersTerms
71
72
73 data MapListSize = MapListSize { unMapListSize :: !Int }
74
75 buildNgramsOthersList ::( HasNodeError err
76 , CmdM env err m
77 , RepoCmdM env err m
78 , HasTreeError err
79 )
80 => User
81 -> UserCorpusId
82 -> (Text -> Text)
83 -> (NgramsType, MapListSize)
84 -> m (Map NgramsType [NgramsElement])
85 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
86 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
87
88 socialLists' :: Map Text FlowListScores
89 <- flowSocialList' MySelfFirst user nt (Set.fromList $ Map.keys ngs')
90 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
91
92 let
93 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
94 grouped' = toGroupedText groupParams socialLists' ngs'
95
96 -- 8< 8< 8< 8< 8< 8< 8<
97 let
98 ngs :: Map Text (Set Text, Set NodeId) = groupNodesByNgramsWith groupIt ngs'
99 socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
100 -- >8 >8 >8 >8 >8 >8 >8
101
102 let
103 grouped = groupedTextWithStem (GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-} ) -- socialLists'
104 $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b))
105 $ ngs
106
107
108 let
109 groupedWithList = map (addListType (invertForw socialLists)) grouped
110 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm)
111 groupedWithList
112 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm)
113 tailTerms
114
115 listSize = mapListSize - (List.length mapTerms)
116 (mapTerms', candiTerms) = List.splitAt listSize
117 $ List.sortOn (Down . _gt_score)
118 $ Map.elems tailTerms'
119
120 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
121 <> (List.concat $ map toNgramsElement mapTerms )
122 <> (List.concat $ map toNgramsElement
123 $ map (set gt_listType (Just MapTerm )) mapTerms' )
124 <> (List.concat $ map toNgramsElement
125 $ map (set gt_listType (Just CandidateTerm)) candiTerms)
126 )]
127
128 -- TODO use ListIds
129 buildNgramsTermsList :: ( HasNodeError err
130 , CmdM env err m
131 , RepoCmdM env err m
132 , HasTreeError err
133 )
134 => User
135 -> UserCorpusId
136 -> MasterCorpusId
137 -> GroupParams
138 -> m (Map NgramsType [NgramsElement])
139 buildNgramsTermsList user uCid mCid groupParams = do
140
141 -- Computing global speGen score
142 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
143
144 -- printDebug "head candidates" (List.take 10 $ allTerms)
145 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
146
147 -- First remove stops terms
148 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
149 -- printDebug "\n * socialLists * \n" socialLists
150
151 -- Grouping the ngrams and keeping the maximum score for label
152 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
153
154 groupedWithList = map (addListType (invertForw socialLists)) grouped
155
156 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
157 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
158
159 -- printDebug "\n * stopTerms * \n" stopTerms
160 -- splitting monterms and multiterms to take proportional candidates
161 let
162 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
163 monoSize = 0.4 :: Double
164 multSize = 1 - monoSize
165
166 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
167
168 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
169 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
170
171 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
172 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
173 -- printDebug "groupedMultHead" (List.length groupedMultHead)
174 -- printDebug "groupedMultTail" (List.length groupedMultTail)
175
176 let
177 -- Get Local Scores now for selected grouped ngrams
178 selectedTerms = Set.toList $ List.foldl'
179 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
180 $ Set.insert l' g
181 )
182 Set.empty
183 (groupedMonoHead <> groupedMultHead)
184
185 -- TO remove (and remove HasNodeError instance)
186 userListId <- defaultList uCid
187 masterListId <- defaultList mCid
188
189 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
190
191 let
192 mapGroups = Map.fromList
193 $ map (\g -> (g ^. gt_stem, g))
194 $ groupedMonoHead <> groupedMultHead
195
196 -- grouping with Set NodeId
197 contextsAdded = foldl' (\mapGroups' k ->
198 let k' = ngramsGroup groupParams k in
199 case Map.lookup k' mapGroups' of
200 Nothing -> mapGroups'
201 Just g -> case Map.lookup k mapTextDocIds of
202 Nothing -> mapGroups'
203 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
204 )
205 mapGroups
206 $ Map.keys mapTextDocIds
207
208 -- compute cooccurrences
209 mapCooc = Map.filter (>2)
210 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
211 | (t1, s1) <- mapStemNodeIds
212 , (t2, s2) <- mapStemNodeIds
213 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
214 ]
215 where
216 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
217 -- printDebug "mapCooc" mapCooc
218
219 let
220 -- computing scores
221 mapScores f = Map.fromList
222 $ map (\(Scored t g s') -> (t, f (g,s')))
223 $ normalizeGlobal
224 $ map normalizeLocal
225 $ scored' mapCooc
226
227 groupsWithScores = catMaybes
228 $ map (\(stem, g)
229 -> case Map.lookup stem mapScores' of
230 Nothing -> Nothing
231 Just s' -> Just $ g { _gt_score = s'}
232 ) $ Map.toList contextsAdded
233 where
234 mapScores' = mapScores identity
235 -- adapt2 TOCHECK with DC
236 -- printDebug "groupsWithScores" groupsWithScores
237 let
238 -- sort / partition / split
239 -- filter mono/multi again
240 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
241 -- filter with max score
242 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
243
244 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
245 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
246
247 -- splitAt
248 let
249 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
250 inclSize = 0.4 :: Double
251 exclSize = 1 - inclSize
252 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
253
254 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
255 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
256
257 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
258 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
259
260
261 -- Final Step building the Typed list
262 termListHead = maps <> cands
263 where
264 maps = set gt_listType (Just MapTerm)
265 <$> monoScoredInclHead
266 <> monoScoredExclHead
267 <> multScoredInclHead
268 <> multScoredExclHead
269
270 cands = set gt_listType (Just CandidateTerm)
271 <$> monoScoredInclTail
272 <> monoScoredExclTail
273 <> multScoredInclTail
274 <> multScoredExclTail
275
276 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
277
278 -- printDebug "monoScoredInclHead" monoScoredInclHead
279 -- printDebug "monoScoredExclHead" monoScoredExclTail
280 -- printDebug "multScoredInclHead" multScoredInclHead
281 -- printDebug "multScoredExclTail" multScoredExclTail
282
283 let result = Map.unionsWith (<>)
284 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
285 <> (List.concat $ map toNgramsElement $ termListTail)
286 <> (List.concat $ map toNgramsElement $ stopTerms)
287 )]
288 ]
289 -- printDebug "\n result \n" r
290 pure result
291
292
293
294 toNgramsElement :: GroupedText a -> [NgramsElement]
295 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
296 [parentElem] <> childrenElems
297 where
298 parent = label
299 children = Set.toList setNgrams
300 parentElem = mkNgramsElement (NgramsTerm parent)
301 (fromMaybe CandidateTerm listType)
302 Nothing
303 (mSetFromList (NgramsTerm <$> children))
304 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
305 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
306 (mSetFromList [])
307 ) (NgramsTerm <$> children)
308
309
310 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
311 toGargList l n = (l,n)
312
313
314 isStopTerm :: StopSize -> Text -> Bool
315 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
316 where
317 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
318
319 ------------------------------------------------------------------------------