]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
Merge branch 'dev' into dev-social-list
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Core.Text.List
15 where
16
17
18 import Control.Lens ((^.), set)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
21 import Data.Map (Map)
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
28
29 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
30 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
31 import Gargantext.API.Ngrams.Types (RepoCmdM)
32 import Gargantext.Core.Text.List.Social (flowSocialList, invertForw)
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
34 import Gargantext.Core.Text.Group
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45
46
47 -- | TODO improve grouping functions of Authors, Sources, Institutes..
48 buildNgramsLists :: ( RepoCmdM env err m
49 , CmdM env err m
50 , HasTreeError err
51 , HasNodeError err
52 )
53 => User
54 -> GroupParams
55 -> UserCorpusId
56 -> MasterCorpusId
57 -> m (Map NgramsType [NgramsElement])
58 buildNgramsLists user gp uCid mCid = do
59 ngTerms <- buildNgramsTermsList user uCid mCid gp
60 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
61 [ (Authors , MapListSize 9)
62 , (Sources , MapListSize 9)
63 , (Institutes, MapListSize 9)
64 ]
65
66 pure $ Map.unions $ [ngTerms] <> othersTerms
67
68
69 data MapListSize = MapListSize Int
70
71 buildNgramsOthersList ::( HasNodeError err
72 , CmdM env err m
73 , RepoCmdM env err m
74 , HasTreeError err
75 )
76 => User
77 -> UserCorpusId
78 -> (Text -> Text)
79 -> (NgramsType, MapListSize)
80 -> m (Map NgramsType [NgramsElement])
81 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
82 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
83
84 let
85 grouped = toGroupedText groupIt (Set.size . snd) fst snd
86 (Map.toList $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b)) $ ngs)
87
88 socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
89
90 let
91 groupedWithList = map (addListType (invertForw socialLists)) grouped
92 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
93 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
94
95 listSize = mapListSize - (List.length mapTerms)
96 (mapTerms', candiTerms) = List.splitAt listSize $ List.sortOn (Down . _gt_score) $ Map.elems tailTerms'
97
98 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
99 <> (List.concat $ map toNgramsElement mapTerms)
100 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just MapTerm)) mapTerms')
101 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just CandidateTerm)) candiTerms)
102 )]
103
104 -- TODO use ListIds
105 buildNgramsTermsList :: ( HasNodeError err
106 , CmdM env err m
107 , RepoCmdM env err m
108 , HasTreeError err
109 )
110 => User
111 -> UserCorpusId
112 -> MasterCorpusId
113 -> GroupParams
114 -> m (Map NgramsType [NgramsElement])
115 buildNgramsTermsList user uCid mCid groupParams = do
116
117 -- Computing global speGen score
118 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
119
120 -- printDebug "head candidates" (List.take 10 $ allTerms)
121 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
122
123 -- First remove stops terms
124 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
125 -- printDebug "\n * socialLists * \n" socialLists
126
127 printDebug "\n * socialLists * \n" socialLists
128
129 let
130 _socialStop = fromMaybe Set.empty $ Map.lookup StopTerm socialLists
131 _socialMap = fromMaybe Set.empty $ Map.lookup MapTerm socialLists
132 _socialCand = fromMaybe Set.empty $ Map.lookup CandidateTerm socialLists
133 -- stopTerms ignored for now (need to be tagged already)
134 -- (stopTerms, candidateTerms) = List.partition ((\t -> Set.member t socialStop) . fst) allTerms
135 -- (mapTerms, candidateTerms) = List.partition ((\t -> Set.member t socialMap ) . fst) allTerms
136
137 -- printDebug "stopTerms" stopTerms
138
139 -- Grouping the ngrams and keeping the maximum score for label
140 let grouped = toGroupedText (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) allTerms
141
142 groupedWithList = map (addListType (invertForw socialLists)) grouped
143
144 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
145 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
146
147 -- printDebug "\n * stopTerms * \n" stopTerms
148 -- splitting monterms and multiterms to take proportional candidates
149 let
150 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
151 monoSize = 0.4 :: Double
152 multSize = 1 - monoSize
153
154 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
155
156 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
157 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
158
159 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
160 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
161 -- printDebug "groupedMultHead" (List.length groupedMultHead)
162 -- printDebug "groupedMultTail" (List.length groupedMultTail)
163
164 let
165 -- Get Local Scores now for selected grouped ngrams
166 selectedTerms = Set.toList $ List.foldl'
167 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
168 $ Set.insert l' g
169 )
170 Set.empty
171 (groupedMonoHead <> groupedMultHead)
172
173 -- TO remove (and remove HasNodeError instance)
174 userListId <- defaultList uCid
175 masterListId <- defaultList mCid
176
177 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
178
179 let
180 mapGroups = Map.fromList
181 $ map (\g -> (g ^. gt_stem, g))
182 $ groupedMonoHead <> groupedMultHead
183
184 -- grouping with Set NodeId
185 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup groupParams k
186 in case Map.lookup k' mapGroups' of
187 Nothing -> mapGroups'
188 Just g -> case Map.lookup k mapTextDocIds of
189 Nothing -> mapGroups'
190 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
191 )
192 mapGroups
193 $ Map.keys mapTextDocIds
194
195 -- compute cooccurrences
196 mapCooc = Map.filter (>2)
197 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
198 | (t1, s1) <- mapStemNodeIds
199 , (t2, s2) <- mapStemNodeIds
200 ]
201 where
202 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
203 -- printDebug "mapCooc" mapCooc
204
205 let
206 -- computing scores
207 mapScores f = Map.fromList
208 $ map (\(Scored t g s') -> (t, f (g,s')))
209 $ normalizeGlobal
210 $ map normalizeLocal
211 $ scored' mapCooc
212
213 groupsWithScores = catMaybes
214 $ map (\(stem, g)
215 -> case Map.lookup stem mapScores' of
216 Nothing -> Nothing
217 Just s' -> Just $ g { _gt_score = s'}
218 ) $ Map.toList contextsAdded
219 where
220 mapScores' = mapScores identity
221 -- adapt2 TOCHECK with DC
222 -- printDebug "groupsWithScores" groupsWithScores
223 let
224 -- sort / partition / split
225 -- filter mono/multi again
226 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
227 -- filter with max score
228 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
229
230 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
231 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
232
233 -- splitAt
234 let
235 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
236 inclSize = 0.4 :: Double
237 exclSize = 1 - inclSize
238 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
239
240 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
241 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
242
243 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
244 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
245
246
247 -- Final Step building the Typed list
248 termListHead = maps <> cands
249 where
250 maps = set gt_listType (Just MapTerm)
251 <$> monoScoredInclHead
252 <> monoScoredExclHead
253 <> multScoredInclHead
254 <> multScoredExclHead
255
256 cands = set gt_listType (Just CandidateTerm)
257 <$> monoScoredInclTail
258 <> monoScoredExclTail
259 <> multScoredInclTail
260 <> multScoredExclTail
261
262 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
263
264 -- printDebug "monoScoredInclHead" monoScoredInclHead
265 -- printDebug "monoScoredExclHead" monoScoredExclTail
266 -- printDebug "multScoredInclHead" multScoredInclHead
267 -- printDebug "multScoredExclTail" multScoredExclTail
268
269 let result = Map.unionsWith (<>)
270 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
271 <> (List.concat $ map toNgramsElement $ termListTail)
272 <> (List.concat $ map toNgramsElement $ stopTerms)
273 )]
274 ]
275 -- printDebug "\n result \n" r
276 pure result
277
278
279
280 toNgramsElement :: GroupedText a -> [NgramsElement]
281 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
282 [parentElem] <> childrenElems
283 where
284 parent = label
285 children = Set.toList setNgrams
286 parentElem = mkNgramsElement (NgramsTerm parent)
287 (fromMaybe CandidateTerm listType)
288 Nothing
289 (mSetFromList (NgramsTerm <$> children))
290 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
291 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
292 (mSetFromList [])
293 ) (NgramsTerm <$> children)
294
295
296 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
297 toGargList l n = (l,n)
298
299
300 isStopTerm :: StopSize -> Text -> Bool
301 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
302 where
303 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
304
305 ------------------------------------------------------------------------------