]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FIX] Social List for all NgramsType implemented (needs groups heritage now).
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Core.Text.List
15 where
16
17
18 import Control.Lens ((^.), set)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
21 import Data.Map (Map)
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
28
29 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
30 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
31 import Gargantext.API.Ngrams.Types (RepoCmdM)
32 import Gargantext.Core.Text.List.Social (flowSocialList, invertForw)
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
34 import Gargantext.Core.Text.Group
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45
46
47 -- | TODO improve grouping functions of Authors, Sources, Institutes..
48 buildNgramsLists :: ( RepoCmdM env err m
49 , CmdM env err m
50 , HasTreeError err
51 , HasNodeError err
52 )
53 => User
54 -> GroupParams
55 -> UserCorpusId
56 -> MasterCorpusId
57 -> m (Map NgramsType [NgramsElement])
58 buildNgramsLists user gp uCid mCid = do
59 ngTerms <- buildNgramsTermsList user uCid mCid gp
60 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
61 [ (Authors, MapListSize 9)
62 , (Sources, MapListSize 9)
63 , (Institutes, MapListSize 9)
64 ]
65
66 pure $ Map.unions $ [ngTerms] <> othersTerms
67
68
69 data MapListSize = MapListSize Int
70
71 buildNgramsOthersList ::( HasNodeError err
72 , CmdM env err m
73 , RepoCmdM env err m
74 , HasTreeError err
75 )
76 => User
77 -> UserCorpusId
78 -> (Text -> Text)
79 -> (NgramsType, MapListSize)
80 -> m (Map NgramsType [NgramsElement])
81 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
82
83 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
84
85 let
86 grouped = toGroupedText groupIt (Set.size . snd) fst snd (Map.toList $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b)) $ ngs)
87
88 socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
89
90 let
91 groupedWithList = map (addListType (invertForw socialLists)) grouped
92 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
93 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
94
95 listSize = mapListSize - (List.length mapTerms)
96 (mapTerms', candiTerms) = List.splitAt listSize $ List.sortOn (Down . _gt_score) $ Map.elems tailTerms'
97
98 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
99 <> (List.concat $ map toNgramsElement mapTerms)
100 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just MapTerm)) mapTerms')
101 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just CandidateTerm)) candiTerms)
102 )]
103
104 -- TODO use ListIds
105 buildNgramsTermsList :: ( HasNodeError err
106 , CmdM env err m
107 , RepoCmdM env err m
108 , HasTreeError err
109 )
110 => User
111 -> UserCorpusId
112 -> MasterCorpusId
113 -> GroupParams
114 -> m (Map NgramsType [NgramsElement])
115 buildNgramsTermsList user uCid mCid groupParams = do
116
117 -- Computing global speGen score
118 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
119
120 -- printDebug "head candidates" (List.take 10 $ allTerms)
121 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
122
123 -- First remove stops terms
124 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
125 -- printDebug "\n * socialLists * \n" socialLists
126
127
128 -- Grouping the ngrams and keeping the maximum score for label
129 let grouped = toGroupedText (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) allTerms
130
131 groupedWithList = map (addListType (invertForw socialLists)) grouped
132
133 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
134 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
135
136 -- printDebug "\n * stopTerms * \n" stopTerms
137 -- splitting monterms and multiterms to take proportional candidates
138 let
139 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
140 monoSize = 0.4 :: Double
141 multSize = 1 - monoSize
142
143 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
144
145 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
146 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
147
148 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
149 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
150 -- printDebug "groupedMultHead" (List.length groupedMultHead)
151 -- printDebug "groupedMultTail" (List.length groupedMultTail)
152
153 let
154 -- Get Local Scores now for selected grouped ngrams
155 selectedTerms = Set.toList $ List.foldl'
156 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
157 $ Set.insert l' g
158 )
159 Set.empty
160 (groupedMonoHead <> groupedMultHead)
161
162 -- TO remove (and remove HasNodeError instance)
163 userListId <- defaultList uCid
164 masterListId <- defaultList mCid
165
166 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
167
168 let
169 mapGroups = Map.fromList
170 $ map (\g -> (g ^. gt_stem, g))
171 $ groupedMonoHead <> groupedMultHead
172
173 -- grouping with Set NodeId
174 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup groupParams k
175 in case Map.lookup k' mapGroups' of
176 Nothing -> mapGroups'
177 Just g -> case Map.lookup k mapTextDocIds of
178 Nothing -> mapGroups'
179 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
180 )
181 mapGroups
182 $ Map.keys mapTextDocIds
183
184 -- compute cooccurrences
185 mapCooc = Map.filter (>2)
186 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
187 | (t1, s1) <- mapStemNodeIds
188 , (t2, s2) <- mapStemNodeIds
189 ]
190 where
191 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
192 -- printDebug "mapCooc" mapCooc
193
194 let
195 -- computing scores
196 mapScores f = Map.fromList
197 $ map (\(Scored t g s') -> (t, f (g,s')))
198 $ normalizeGlobal
199 $ map normalizeLocal
200 $ scored' mapCooc
201
202 groupsWithScores = catMaybes
203 $ map (\(stem, g)
204 -> case Map.lookup stem mapScores' of
205 Nothing -> Nothing
206 Just s' -> Just $ g { _gt_score = s'}
207 ) $ Map.toList contextsAdded
208 where
209 mapScores' = mapScores identity
210 -- adapt2 TOCHECK with DC
211 -- printDebug "groupsWithScores" groupsWithScores
212 let
213 -- sort / partition / split
214 -- filter mono/multi again
215 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
216 -- filter with max score
217 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
218
219 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
220 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
221
222 -- splitAt
223 let
224 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
225 inclSize = 0.4 :: Double
226 exclSize = 1 - inclSize
227 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
228
229 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
230 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
231
232 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
233 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
234
235
236 -- Final Step building the Typed list
237 termListHead = maps <> cands
238 where
239 maps = set gt_listType (Just MapTerm)
240 <$> monoScoredInclHead
241 <> monoScoredExclHead
242 <> multScoredInclHead
243 <> multScoredExclHead
244
245 cands = set gt_listType (Just CandidateTerm)
246 <$> monoScoredInclTail
247 <> monoScoredExclTail
248 <> multScoredInclTail
249 <> multScoredExclTail
250
251 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
252
253 -- printDebug "monoScoredInclHead" monoScoredInclHead
254 -- printDebug "monoScoredExclHead" monoScoredExclTail
255 --
256 -- printDebug "multScoredInclHead" multScoredInclHead
257 -- printDebug "multScoredExclTail" multScoredExclTail
258
259 let result = Map.unionsWith (<>)
260 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
261 <> (List.concat $ map toNgramsElement $ termListTail)
262 <> (List.concat $ map toNgramsElement $ stopTerms)
263 )]
264 ]
265 -- printDebug "\n result \n" r
266 pure result
267
268
269
270 toNgramsElement :: GroupedText a -> [NgramsElement]
271 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
272 [parentElem] <> childrenElems
273 where
274 parent = label
275 children = Set.toList setNgrams
276 parentElem = mkNgramsElement (NgramsTerm parent)
277 (fromMaybe CandidateTerm listType)
278 Nothing
279 (mSetFromList (NgramsTerm <$> children))
280 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
281 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
282 (mSetFromList [])
283 ) (NgramsTerm <$> children)
284
285
286 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
287 toGargList l n = (l,n)
288
289
290 isStopTerm :: StopSize -> Text -> Bool
291 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
292 where
293 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
294
295 ------------------------------------------------------------------------------