]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[DOC] better variable names + warnings
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18
19 import Control.Lens ((^.), set, view, over)
20 import Data.Maybe (fromMaybe, catMaybes)
21 import Data.Ord (Down(..))
22 import Data.Map (Map)
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import qualified Data.Char as Char
26 import qualified Data.List as List
27 import qualified Data.Map as Map
28 import qualified Data.Set as Set
29 import qualified Data.Text as Text
30
31 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
32 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
33 import Gargantext.API.Ngrams.Types (RepoCmdM)
34 import Gargantext.Core.Text.List.Social
35 import Gargantext.Core.Text.List.Social.Prelude
36 import Gargantext.Core.Text.List.Group
37 import Gargantext.Core.Text.List.Group.WithStem
38 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
39 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
40 import Gargantext.Core.Types.Individu (User(..))
41 import Gargantext.Database.Admin.Types.Node (NodeId)
42 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
43 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
44 import Gargantext.Database.Prelude (CmdM)
45 import Gargantext.Database.Query.Table.Node (defaultList)
46 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
47 import Gargantext.Database.Query.Tree.Error (HasTreeError)
48 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
49 import Gargantext.Prelude
50
51
52 -- | TODO improve grouping functions of Authors, Sources, Institutes..
53 buildNgramsLists :: ( RepoCmdM env err m
54 , CmdM env err m
55 , HasTreeError err
56 , HasNodeError err
57 )
58 => User
59 -> GroupParams
60 -> UserCorpusId
61 -> MasterCorpusId
62 -> m (Map NgramsType [NgramsElement])
63 buildNgramsLists user gp uCid mCid = do
64 ngTerms <- buildNgramsTermsList user uCid mCid gp
65 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
66 [ (Authors , MapListSize 9)
67 , (Sources , MapListSize 9)
68 , (Institutes, MapListSize 9)
69 ]
70
71 pure $ Map.unions $ [ngTerms] <> othersTerms
72
73
74 data MapListSize = MapListSize { unMapListSize :: !Int }
75
76 buildNgramsOthersList ::( HasNodeError err
77 , CmdM env err m
78 , RepoCmdM env err m
79 , HasTreeError err
80 )
81 => User
82 -> UserCorpusId
83 -> (Text -> Text)
84 -> (NgramsType, MapListSize)
85 -> m (Map NgramsType [NgramsElement])
86 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
87 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
88
89 socialLists' :: FlowListCont Text
90 <- flowSocialList' MySelfFirst user nt (FlowListCont Map.empty $ Set.fromList $ Map.keys ngs')
91 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
92
93 printDebug "flowSocialList'"
94 $ Map.filter (not . ((==) Map.empty) . view fls_parents)
95 $ view flc_scores socialLists'
96
97 let
98 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
99 groupedWithList = toGroupedText groupParams (view flc_scores socialLists') ngs'
100
101 printDebug "groupedWithList"
102 $ Map.map (\v -> (view gt_label v, view gt_children v))
103 $ Map.filter (\v -> (Set.size $ view gt_children v) > 0)
104 $ groupedWithList
105
106 let
107 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
108 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
109
110 listSize = mapListSize - (List.length mapTerms)
111 (mapTerms', candiTerms) = List.splitAt listSize
112 $ List.sortOn (Down . _gt_score)
113 $ Map.elems tailTerms'
114
115 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
116 <> (List.concat $ map toNgramsElement mapTerms )
117 <> (List.concat $ map toNgramsElement
118 $ map (set gt_listType (Just MapTerm )) mapTerms' )
119 <> (List.concat $ map toNgramsElement
120 $ map (set gt_listType (Just CandidateTerm)) candiTerms)
121 )]
122
123
124 -- TODO use ListIds
125 buildNgramsTermsList :: ( HasNodeError err
126 , CmdM env err m
127 , RepoCmdM env err m
128 , HasTreeError err
129 )
130 => User
131 -> UserCorpusId
132 -> MasterCorpusId
133 -> GroupParams
134 -> m (Map NgramsType [NgramsElement])
135 buildNgramsTermsList user uCid mCid groupParams = do
136
137 -- Computing global speGen score
138 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
139
140 -- printDebug "head candidates" (List.take 10 $ allTerms)
141 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
142
143 -- First remove stops terms
144 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
145 -- printDebug "\n * socialLists * \n" socialLists
146
147 -- Grouping the ngrams and keeping the maximum score for label
148 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
149
150 groupedWithList = map (addListType (invertForw socialLists)) grouped
151
152 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
153 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
154
155 -- printDebug "\n * stopTerms * \n" stopTerms
156 -- splitting monterms and multiterms to take proportional candidates
157 let
158 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
159 monoSize = 0.4 :: Double
160 multSize = 1 - monoSize
161
162 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
163
164 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
165 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
166
167 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
168 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
169 -- printDebug "groupedMultHead" (List.length groupedMultHead)
170 -- printDebug "groupedMultTail" (List.length groupedMultTail)
171
172 let
173 -- Get Local Scores now for selected grouped ngrams
174 selectedTerms = Set.toList $ List.foldl'
175 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
176 $ Set.insert l' g
177 )
178 Set.empty
179 (groupedMonoHead <> groupedMultHead)
180
181 -- TO remove (and remove HasNodeError instance)
182 userListId <- defaultList uCid
183 masterListId <- defaultList mCid
184
185 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
186
187 let
188 mapGroups = Map.fromList
189 $ map (\g -> (g ^. gt_stem, g))
190 $ groupedMonoHead <> groupedMultHead
191
192 -- grouping with Set NodeId
193 contextsAdded = foldl' (\mapGroups' k ->
194 let k' = ngramsGroup groupParams k in
195 case Map.lookup k' mapGroups' of
196 Nothing -> mapGroups'
197 Just g -> case Map.lookup k mapTextDocIds of
198 Nothing -> mapGroups'
199 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
200 )
201 mapGroups
202 $ Map.keys mapTextDocIds
203
204 -- compute cooccurrences
205 mapCooc = Map.filter (>2)
206 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
207 | (t1, s1) <- mapStemNodeIds
208 , (t2, s2) <- mapStemNodeIds
209 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
210 ]
211 where
212 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
213 -- printDebug "mapCooc" mapCooc
214
215 let
216 -- computing scores
217 mapScores f = Map.fromList
218 $ map (\(Scored t g s') -> (t, f (g,s')))
219 $ normalizeGlobal
220 $ map normalizeLocal
221 $ scored' mapCooc
222
223 groupsWithScores = catMaybes
224 $ map (\(stem, g)
225 -> case Map.lookup stem mapScores' of
226 Nothing -> Nothing
227 Just s' -> Just $ g { _gt_score = s'}
228 ) $ Map.toList contextsAdded
229 where
230 mapScores' = mapScores identity
231 -- adapt2 TOCHECK with DC
232 -- printDebug "groupsWithScores" groupsWithScores
233 let
234 -- sort / partition / split
235 -- filter mono/multi again
236 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
237 -- filter with max score
238 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
239
240 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
241 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
242
243 -- splitAt
244 let
245 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
246 inclSize = 0.4 :: Double
247 exclSize = 1 - inclSize
248 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
249
250 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
251 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
252
253 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
254 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
255
256
257 -- Final Step building the Typed list
258 termListHead = maps <> cands
259 where
260 maps = set gt_listType (Just MapTerm)
261 <$> monoScoredInclHead
262 <> monoScoredExclHead
263 <> multScoredInclHead
264 <> multScoredExclHead
265
266 cands = set gt_listType (Just CandidateTerm)
267 <$> monoScoredInclTail
268 <> monoScoredExclTail
269 <> multScoredInclTail
270 <> multScoredExclTail
271
272 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
273
274 -- printDebug "monoScoredInclHead" monoScoredInclHead
275 -- printDebug "monoScoredExclHead" monoScoredExclTail
276 -- printDebug "multScoredInclHead" multScoredInclHead
277 -- printDebug "multScoredExclTail" multScoredExclTail
278
279 let result = Map.unionsWith (<>)
280 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
281 <> (List.concat $ map toNgramsElement $ termListTail)
282 <> (List.concat $ map toNgramsElement $ stopTerms)
283 )]
284 ]
285 -- printDebug "\n result \n" r
286 pure result
287
288
289
290 toNgramsElement :: GroupedText a -> [NgramsElement]
291 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
292 [parentElem] <> childrenElems
293 where
294 parent = label
295 children = Set.toList setNgrams
296 parentElem = mkNgramsElement (NgramsTerm parent)
297 (fromMaybe CandidateTerm listType)
298 Nothing
299 (mSetFromList (NgramsTerm <$> children))
300 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
301 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
302 (mSetFromList [])
303 ) (NgramsTerm <$> children)
304
305
306 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
307 toGargList l n = (l,n)
308
309
310 isStopTerm :: StopSize -> Text -> Bool
311 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
312 where
313 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
314
315 ------------------------------------------------------------------------------