]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
parameter change in the sensibility for WeightedLogSim and WeightedLogJaccard similar...
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Core.Text.List
15 where
16
17
18 import Control.Lens ((^.), set)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
21 import Data.Map (Map)
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
28
29 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
30 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
31 import Gargantext.API.Ngrams.Types (RepoCmdM)
32 import Gargantext.Core.Text.List.Social (flowSocialList, invertForw)
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
34 import Gargantext.Core.Text.Group
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45
46
47 -- | TODO improve grouping functions of Authors, Sources, Institutes..
48 buildNgramsLists :: ( RepoCmdM env err m
49 , CmdM env err m
50 , HasTreeError err
51 , HasNodeError err
52 )
53 => User
54 -> GroupParams
55 -> UserCorpusId
56 -> MasterCorpusId
57 -> m (Map NgramsType [NgramsElement])
58 buildNgramsLists user gp uCid mCid = do
59 ngTerms <- buildNgramsTermsList user uCid mCid gp
60 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
61 [ (Authors , MapListSize 9)
62 , (Sources , MapListSize 9)
63 , (Institutes, MapListSize 9)
64 ]
65
66 pure $ Map.unions $ [ngTerms] <> othersTerms
67
68
69 data MapListSize = MapListSize Int
70
71 buildNgramsOthersList ::( HasNodeError err
72 , CmdM env err m
73 , RepoCmdM env err m
74 , HasTreeError err
75 )
76 => User
77 -> UserCorpusId
78 -> (Text -> Text)
79 -> (NgramsType, MapListSize)
80 -> m (Map NgramsType [NgramsElement])
81 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
82
83 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
84
85 let
86 grouped = toGroupedText groupIt (Set.size . snd) fst snd
87 (Map.toList $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b)) $ ngs)
88
89 socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
90
91 let
92 groupedWithList = map (addListType (invertForw socialLists)) grouped
93 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
94 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
95
96 listSize = mapListSize - (List.length mapTerms)
97 (mapTerms', candiTerms) = List.splitAt listSize $ List.sortOn (Down . _gt_score) $ Map.elems tailTerms'
98
99 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
100 <> (List.concat $ map toNgramsElement mapTerms)
101 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just MapTerm)) mapTerms')
102 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just CandidateTerm)) candiTerms)
103 )]
104
105 -- TODO use ListIds
106 buildNgramsTermsList :: ( HasNodeError err
107 , CmdM env err m
108 , RepoCmdM env err m
109 , HasTreeError err
110 )
111 => User
112 -> UserCorpusId
113 -> MasterCorpusId
114 -> GroupParams
115 -> m (Map NgramsType [NgramsElement])
116 buildNgramsTermsList user uCid mCid groupParams = do
117
118 -- Computing global speGen score
119 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
120
121 -- printDebug "head candidates" (List.take 10 $ allTerms)
122 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
123
124 -- First remove stops terms
125 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
126 -- printDebug "\n * socialLists * \n" socialLists
127
128
129 -- Grouping the ngrams and keeping the maximum score for label
130 let grouped = toGroupedText (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) allTerms
131
132 groupedWithList = map (addListType (invertForw socialLists)) grouped
133
134 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
135 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
136
137 -- printDebug "\n * stopTerms * \n" stopTerms
138 -- splitting monterms and multiterms to take proportional candidates
139 let
140 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
141 monoSize = 0.4 :: Double
142 multSize = 1 - monoSize
143
144 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
145
146 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
147 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
148
149 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
150 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
151 -- printDebug "groupedMultHead" (List.length groupedMultHead)
152 -- printDebug "groupedMultTail" (List.length groupedMultTail)
153
154 let
155 -- Get Local Scores now for selected grouped ngrams
156 selectedTerms = Set.toList $ List.foldl'
157 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
158 $ Set.insert l' g
159 )
160 Set.empty
161 (groupedMonoHead <> groupedMultHead)
162
163 -- TO remove (and remove HasNodeError instance)
164 userListId <- defaultList uCid
165 masterListId <- defaultList mCid
166
167 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
168
169 let
170 mapGroups = Map.fromList
171 $ map (\g -> (g ^. gt_stem, g))
172 $ groupedMonoHead <> groupedMultHead
173
174 -- grouping with Set NodeId
175 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup groupParams k
176 in case Map.lookup k' mapGroups' of
177 Nothing -> mapGroups'
178 Just g -> case Map.lookup k mapTextDocIds of
179 Nothing -> mapGroups'
180 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
181 )
182 mapGroups
183 $ Map.keys mapTextDocIds
184
185 -- compute cooccurrences
186 mapCooc = Map.filter (>2)
187 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
188 | (t1, s1) <- mapStemNodeIds
189 , (t2, s2) <- mapStemNodeIds
190 ]
191 where
192 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
193 -- printDebug "mapCooc" mapCooc
194
195 let
196 -- computing scores
197 mapScores f = Map.fromList
198 $ map (\(Scored t g s') -> (t, f (g,s')))
199 $ normalizeGlobal
200 $ map normalizeLocal
201 $ scored' mapCooc
202
203 groupsWithScores = catMaybes
204 $ map (\(stem, g)
205 -> case Map.lookup stem mapScores' of
206 Nothing -> Nothing
207 Just s' -> Just $ g { _gt_score = s'}
208 ) $ Map.toList contextsAdded
209 where
210 mapScores' = mapScores identity
211 -- adapt2 TOCHECK with DC
212 -- printDebug "groupsWithScores" groupsWithScores
213 let
214 -- sort / partition / split
215 -- filter mono/multi again
216 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
217 -- filter with max score
218 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
219
220 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
221 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
222
223 -- splitAt
224 let
225 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
226 inclSize = 0.4 :: Double
227 exclSize = 1 - inclSize
228 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
229
230 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
231 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
232
233 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
234 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
235
236
237 -- Final Step building the Typed list
238 termListHead = maps <> cands
239 where
240 maps = set gt_listType (Just MapTerm)
241 <$> monoScoredInclHead
242 <> monoScoredExclHead
243 <> multScoredInclHead
244 <> multScoredExclHead
245
246 cands = set gt_listType (Just CandidateTerm)
247 <$> monoScoredInclTail
248 <> monoScoredExclTail
249 <> multScoredInclTail
250 <> multScoredExclTail
251
252 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
253
254 -- printDebug "monoScoredInclHead" monoScoredInclHead
255 -- printDebug "monoScoredExclHead" monoScoredExclTail
256 --
257 -- printDebug "multScoredInclHead" multScoredInclHead
258 -- printDebug "multScoredExclTail" multScoredExclTail
259
260 let result = Map.unionsWith (<>)
261 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
262 <> (List.concat $ map toNgramsElement $ termListTail)
263 <> (List.concat $ map toNgramsElement $ stopTerms)
264 )]
265 ]
266 -- printDebug "\n result \n" r
267 pure result
268
269
270
271 toNgramsElement :: GroupedText a -> [NgramsElement]
272 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
273 [parentElem] <> childrenElems
274 where
275 parent = label
276 children = Set.toList setNgrams
277 parentElem = mkNgramsElement (NgramsTerm parent)
278 (fromMaybe CandidateTerm listType)
279 Nothing
280 (mSetFromList (NgramsTerm <$> children))
281 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
282 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
283 (mSetFromList [])
284 ) (NgramsTerm <$> children)
285
286
287 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
288 toGargList l n = (l,n)
289
290
291 isStopTerm :: StopSize -> Text -> Bool
292 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
293 where
294 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
295
296 ------------------------------------------------------------------------------