]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[Clean] refact + toGroupedTree WIP
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18
19 import Control.Lens ((^.), set, view, over)
20 import Data.Map (Map)
21 import Data.Maybe (fromMaybe, catMaybes)
22 import Data.Ord (Down(..))
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text.List.Group
28 import Gargantext.Core.Text.List.Group.Prelude
29 import Gargantext.Core.Text.List.Group.WithStem
30 import Gargantext.Core.Text.List.Social
31 import Gargantext.Core.Text.List.Social.Prelude
32 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
33 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
34 import Gargantext.Core.Types.Individu (User(..))
35 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
36 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
37 import Gargantext.Database.Admin.Types.Node (NodeId)
38 import Gargantext.Database.Prelude (CmdM)
39 import Gargantext.Database.Query.Table.Node (defaultList)
40 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
41 import Gargantext.Database.Query.Tree.Error (HasTreeError)
42 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
43 import Gargantext.Prelude
44 import qualified Data.Char as Char
45 import qualified Data.List as List
46 import qualified Data.Map as Map
47 import qualified Data.Set as Set
48 import qualified Data.Text as Text
49
50
51 -- | TODO improve grouping functions of Authors, Sources, Institutes..
52 buildNgramsLists :: ( RepoCmdM env err m
53 , CmdM env err m
54 , HasTreeError err
55 , HasNodeError err
56 )
57 => User
58 -> GroupParams
59 -> UserCorpusId
60 -> MasterCorpusId
61 -> m (Map NgramsType [NgramsElement])
62 buildNgramsLists user gp uCid mCid = do
63 ngTerms <- buildNgramsTermsList user uCid mCid gp
64 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
65 [ (Authors , MapListSize 9)
66 , (Sources , MapListSize 9)
67 , (Institutes, MapListSize 9)
68 ]
69
70 pure $ Map.unions $ [ngTerms] <> othersTerms
71
72
73 data MapListSize = MapListSize { unMapListSize :: !Int }
74
75 buildNgramsOthersList ::( HasNodeError err
76 , CmdM env err m
77 , RepoCmdM env err m
78 , HasTreeError err
79 )
80 => User
81 -> UserCorpusId
82 -> (Text -> Text)
83 -> (NgramsType, MapListSize)
84 -> m (Map NgramsType [NgramsElement])
85 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
86 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
87
88 socialLists' :: FlowCont Text FlowListScores
89 <- flowSocialList' MySelfFirst user nt (FlowCont Map.empty $ Set.fromList $ Map.keys ngs')
90 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
91
92 printDebug "flowSocialList'"
93 $ Map.filter (not . ((==) Map.empty) . view fls_parents)
94 $ view flc_scores socialLists'
95
96 let
97 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
98 groupedWithList = toGroupedText groupParams (view flc_scores socialLists') ngs'
99
100 printDebug "groupedWithList"
101 $ Map.map (\v -> (view gt_label v, view gt_children v))
102 $ Map.filter (\v -> (Set.size $ view gt_children v) > 0)
103 $ groupedWithList
104
105 let
106 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
107 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
108
109 listSize = mapListSize - (List.length mapTerms)
110 (mapTerms', candiTerms) = List.splitAt listSize
111 $ List.sortOn (Down . _gt_score)
112 $ Map.elems tailTerms'
113
114 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
115 <> (List.concat $ map toNgramsElement mapTerms )
116 <> (List.concat $ map toNgramsElement
117 $ map (set gt_listType (Just MapTerm )) mapTerms' )
118 <> (List.concat $ map toNgramsElement
119 $ map (set gt_listType (Just CandidateTerm)) candiTerms)
120 )]
121
122
123 -- TODO use ListIds
124 buildNgramsTermsList :: ( HasNodeError err
125 , CmdM env err m
126 , RepoCmdM env err m
127 , HasTreeError err
128 )
129 => User
130 -> UserCorpusId
131 -> MasterCorpusId
132 -> GroupParams
133 -> m (Map NgramsType [NgramsElement])
134 buildNgramsTermsList user uCid mCid groupParams = do
135
136 -- Computing global speGen score
137 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
138
139 -- printDebug "head candidates" (List.take 10 $ allTerms)
140 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
141
142 -- First remove stops terms
143 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
144 -- printDebug "\n * socialLists * \n" socialLists
145
146 -- Grouping the ngrams and keeping the maximum score for label
147 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
148
149 groupedWithList = map (addListType (invertForw socialLists)) grouped
150
151 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
152 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
153
154 -- printDebug "\n * stopTerms * \n" stopTerms
155 -- splitting monterms and multiterms to take proportional candidates
156 let
157 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
158 monoSize = 0.4 :: Double
159 multSize = 1 - monoSize
160
161 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
162
163 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
164 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
165
166 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
167 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
168 -- printDebug "groupedMultHead" (List.length groupedMultHead)
169 -- printDebug "groupedMultTail" (List.length groupedMultTail)
170
171 let
172 -- Get Local Scores now for selected grouped ngrams
173 selectedTerms = Set.toList $ List.foldl'
174 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
175 $ Set.insert l' g
176 )
177 Set.empty
178 (groupedMonoHead <> groupedMultHead)
179
180 -- TO remove (and remove HasNodeError instance)
181 userListId <- defaultList uCid
182 masterListId <- defaultList mCid
183
184 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
185
186 let
187 mapGroups = Map.fromList
188 $ map (\g -> (g ^. gt_stem, g))
189 $ groupedMonoHead <> groupedMultHead
190
191 -- grouping with Set NodeId
192 contextsAdded = foldl' (\mapGroups' k ->
193 let k' = ngramsGroup groupParams k in
194 case Map.lookup k' mapGroups' of
195 Nothing -> mapGroups'
196 Just g -> case Map.lookup k mapTextDocIds of
197 Nothing -> mapGroups'
198 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
199 )
200 mapGroups
201 $ Map.keys mapTextDocIds
202
203 -- compute cooccurrences
204 mapCooc = Map.filter (>2)
205 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
206 | (t1, s1) <- mapStemNodeIds
207 , (t2, s2) <- mapStemNodeIds
208 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
209 ]
210 where
211 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
212 -- printDebug "mapCooc" mapCooc
213
214 let
215 -- computing scores
216 mapScores f = Map.fromList
217 $ map (\(Scored t g s') -> (t, f (g,s')))
218 $ normalizeGlobal
219 $ map normalizeLocal
220 $ scored' mapCooc
221
222 groupsWithScores = catMaybes
223 $ map (\(stem, g)
224 -> case Map.lookup stem mapScores' of
225 Nothing -> Nothing
226 Just s' -> Just $ g { _gt_score = s'}
227 ) $ Map.toList contextsAdded
228 where
229 mapScores' = mapScores identity
230 -- adapt2 TOCHECK with DC
231 -- printDebug "groupsWithScores" groupsWithScores
232 let
233 -- sort / partition / split
234 -- filter mono/multi again
235 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
236 -- filter with max score
237 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
238
239 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
240 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
241
242 -- splitAt
243 let
244 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
245 inclSize = 0.4 :: Double
246 exclSize = 1 - inclSize
247 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
248
249 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
250 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
251
252 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
253 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
254
255
256 -- Final Step building the Typed list
257 termListHead = maps <> cands
258 where
259 maps = set gt_listType (Just MapTerm)
260 <$> monoScoredInclHead
261 <> monoScoredExclHead
262 <> multScoredInclHead
263 <> multScoredExclHead
264
265 cands = set gt_listType (Just CandidateTerm)
266 <$> monoScoredInclTail
267 <> monoScoredExclTail
268 <> multScoredInclTail
269 <> multScoredExclTail
270
271 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
272
273 -- printDebug "monoScoredInclHead" monoScoredInclHead
274 -- printDebug "monoScoredExclHead" monoScoredExclTail
275 -- printDebug "multScoredInclHead" multScoredInclHead
276 -- printDebug "multScoredExclTail" multScoredExclTail
277
278 let result = Map.unionsWith (<>)
279 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
280 <> (List.concat $ map toNgramsElement $ termListTail)
281 <> (List.concat $ map toNgramsElement $ stopTerms)
282 )]
283 ]
284 -- printDebug "\n result \n" r
285 pure result
286
287
288
289 toNgramsElement :: GroupedText a -> [NgramsElement]
290 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
291 [parentElem] <> childrenElems
292 where
293 parent = label
294 children = Set.toList setNgrams
295 parentElem = mkNgramsElement (NgramsTerm parent)
296 (fromMaybe CandidateTerm listType)
297 Nothing
298 (mSetFromList (NgramsTerm <$> children))
299 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
300 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
301 (mSetFromList [])
302 ) (NgramsTerm <$> children)
303
304
305 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
306 toGargList l n = (l,n)
307
308
309 isStopTerm :: StopSize -> Text -> Bool
310 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
311 where
312 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
313
314 ------------------------------------------------------------------------------