]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[Type] Instance FlowLists
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18
19 import Control.Lens ((^.), set, view, over)
20 import Data.Map (Map)
21 import Data.Maybe (fromMaybe, catMaybes)
22 import Data.Ord (Down(..))
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text.List.Group
28 import Gargantext.Core.Text.List.Group.Prelude
29 import Gargantext.Core.Text.List.Group.WithStem
30 import Gargantext.Core.Text.List.Social
31 import Gargantext.Core.Text.List.Social.Prelude
32 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
33 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
34 import Gargantext.Core.Types.Individu (User(..))
35 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
36 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
37 import Gargantext.Database.Admin.Types.Node (NodeId)
38 import Gargantext.Database.Prelude (CmdM)
39 import Gargantext.Database.Query.Table.Node (defaultList)
40 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
41 import Gargantext.Database.Query.Tree.Error (HasTreeError)
42 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
43 import Gargantext.Prelude
44 import qualified Data.Char as Char
45 import qualified Data.List as List
46 import qualified Data.Map as Map
47 import qualified Data.Set as Set
48 import qualified Data.Text as Text
49
50
51 -- | TODO improve grouping functions of Authors, Sources, Institutes..
52 buildNgramsLists :: ( RepoCmdM env err m
53 , CmdM env err m
54 , HasTreeError err
55 , HasNodeError err
56 )
57 => User
58 -> GroupParams
59 -> UserCorpusId
60 -> MasterCorpusId
61 -> m (Map NgramsType [NgramsElement])
62 buildNgramsLists user gp uCid mCid = do
63 ngTerms <- buildNgramsTermsList user uCid mCid gp
64 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
65 [ (Authors , MapListSize 9)
66 , (Sources , MapListSize 9)
67 , (Institutes, MapListSize 9)
68 ]
69
70 pure $ Map.unions $ [ngTerms] <> othersTerms
71
72
73 data MapListSize = MapListSize { unMapListSize :: !Int }
74
75 buildNgramsOthersList ::( HasNodeError err
76 , CmdM env err m
77 , RepoCmdM env err m
78 , HasTreeError err
79 )
80 => User
81 -> UserCorpusId
82 -> (Text -> Text)
83 -> (NgramsType, MapListSize)
84 -> m (Map NgramsType [NgramsElement])
85 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
86 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
87
88 socialLists' :: FlowCont Text FlowListScores
89 <- flowSocialList' MySelfFirst user nt (FlowCont Map.empty $ Set.fromList $ Map.keys ngs')
90 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
91
92 {-
93 printDebug "flowSocialList'"
94 $ Map.filter (not . ((==) Map.empty) . view fls_parents)
95 $ view flc_scores socialLists'
96 -}
97
98 let
99 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
100 groupedWithList = toGroupedText groupParams (view flc_scores socialLists') ngs'
101
102 {-
103 printDebug "groupedWithList"
104 $ Map.map (\v -> (view gt_label v, view gt_children v))
105 $ Map.filter (\v -> (Set.size $ view gt_children v) > 0)
106 $ groupedWithList
107 -}
108
109 let
110 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) groupedWithList
111 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
112
113 listSize = mapListSize - (List.length mapTerms)
114 (mapTerms', candiTerms) = List.splitAt listSize
115 $ List.sortOn (Down . _gt_score)
116 $ Map.elems tailTerms'
117
118 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
119 <> (List.concat $ map toNgramsElement mapTerms )
120 <> (List.concat $ map toNgramsElement
121 $ map (setListType (Just MapTerm )) mapTerms' )
122 <> (List.concat $ map toNgramsElement
123 $ map (setListType (Just CandidateTerm)) candiTerms)
124 )]
125
126
127 -- TODO use ListIds
128 buildNgramsTermsList :: ( HasNodeError err
129 , CmdM env err m
130 , RepoCmdM env err m
131 , HasTreeError err
132 )
133 => User
134 -> UserCorpusId
135 -> MasterCorpusId
136 -> GroupParams
137 -> m (Map NgramsType [NgramsElement])
138 buildNgramsTermsList user uCid mCid groupParams = do
139
140 -- Computing global speGen score
141 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
142
143 -- printDebug "head candidates" (List.take 10 $ allTerms)
144 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
145
146 -- First remove stops terms
147 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
148 -- printDebug "\n * socialLists * \n" socialLists
149
150 -- Grouping the ngrams and keeping the maximum score for label
151 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
152
153 groupedWithList = map (addListType (invertForw socialLists)) grouped
154
155 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
156 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
157
158 -- printDebug "\n * stopTerms * \n" stopTerms
159 -- splitting monterms and multiterms to take proportional candidates
160 let
161 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
162 monoSize = 0.4 :: Double
163 multSize = 1 - monoSize
164
165 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
166
167 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
168 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
169
170 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
171 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
172 -- printDebug "groupedMultHead" (List.length groupedMultHead)
173 -- printDebug "groupedMultTail" (List.length groupedMultTail)
174
175 let
176 -- Get Local Scores now for selected grouped ngrams
177 selectedTerms = Set.toList $ List.foldl'
178 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
179 $ Set.insert l' g
180 )
181 Set.empty
182 (groupedMonoHead <> groupedMultHead)
183
184 -- TO remove (and remove HasNodeError instance)
185 userListId <- defaultList uCid
186 masterListId <- defaultList mCid
187
188 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
189
190 let
191 mapGroups = Map.fromList
192 $ map (\g -> (g ^. gt_stem, g))
193 $ groupedMonoHead <> groupedMultHead
194
195 -- grouping with Set NodeId
196 contextsAdded = foldl' (\mapGroups' k ->
197 let k' = ngramsGroup groupParams k in
198 case Map.lookup k' mapGroups' of
199 Nothing -> mapGroups'
200 Just g -> case Map.lookup k mapTextDocIds of
201 Nothing -> mapGroups'
202 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
203 )
204 mapGroups
205 $ Map.keys mapTextDocIds
206
207 -- compute cooccurrences
208 mapCooc = Map.filter (>2)
209 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
210 | (t1, s1) <- mapStemNodeIds
211 , (t2, s2) <- mapStemNodeIds
212 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
213 ]
214 where
215 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
216 -- printDebug "mapCooc" mapCooc
217
218 let
219 -- computing scores
220 mapScores f = Map.fromList
221 $ map (\(Scored t g s') -> (t, f (g,s')))
222 $ normalizeGlobal
223 $ map normalizeLocal
224 $ scored' mapCooc
225
226 groupsWithScores = catMaybes
227 $ map (\(stem, g)
228 -> case Map.lookup stem mapScores' of
229 Nothing -> Nothing
230 Just s' -> Just $ g { _gt_score = s'}
231 ) $ Map.toList contextsAdded
232 where
233 mapScores' = mapScores identity
234 -- adapt2 TOCHECK with DC
235 -- printDebug "groupsWithScores" groupsWithScores
236 let
237 -- sort / partition / split
238 -- filter mono/multi again
239 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
240 -- filter with max score
241 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
242
243 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
244 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
245
246 -- splitAt
247 let
248 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
249 inclSize = 0.4 :: Double
250 exclSize = 1 - inclSize
251 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
252
253 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
254 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
255
256 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
257 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
258
259
260 -- Final Step building the Typed list
261 termListHead = maps <> cands
262 where
263 maps = set gt_listType (Just MapTerm)
264 <$> monoScoredInclHead
265 <> monoScoredExclHead
266 <> multScoredInclHead
267 <> multScoredExclHead
268
269 cands = set gt_listType (Just CandidateTerm)
270 <$> monoScoredInclTail
271 <> monoScoredExclTail
272 <> multScoredInclTail
273 <> multScoredExclTail
274
275 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
276
277 -- printDebug "monoScoredInclHead" monoScoredInclHead
278 -- printDebug "monoScoredExclHead" monoScoredExclTail
279 -- printDebug "multScoredInclHead" multScoredInclHead
280 -- printDebug "multScoredExclTail" multScoredExclTail
281
282 let result = Map.unionsWith (<>)
283 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
284 <> (List.concat $ map toNgramsElement $ termListTail)
285 <> (List.concat $ map toNgramsElement $ stopTerms)
286 )]
287 ]
288 -- printDebug "\n result \n" r
289 pure result
290
291
292
293 toNgramsElement :: GroupedText a -> [NgramsElement]
294 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
295 [parentElem] <> childrenElems
296 where
297 parent = label
298 children = Set.toList setNgrams
299 parentElem = mkNgramsElement (NgramsTerm parent)
300 (fromMaybe CandidateTerm listType)
301 Nothing
302 (mSetFromList (NgramsTerm <$> children))
303 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
304 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
305 (mSetFromList [])
306 ) (NgramsTerm <$> children)
307
308
309 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
310 toGargList l n = (l,n)
311
312
313 isStopTerm :: StopSize -> Text -> Bool
314 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
315 where
316 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
317
318 ------------------------------------------------------------------------------