]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[REORG:LIST] Formats.CSV
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Core.Text.List
15 where
16
17
18 import Control.Lens ((^.), set)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import qualified Data.Char as Char
25 import qualified Data.List as List
26 import qualified Data.Map as Map
27 import qualified Data.Set as Set
28 import qualified Data.Text as Text
29
30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
31 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
32 import Gargantext.API.Ngrams.Types (RepoCmdM)
33 import Gargantext.Core.Text (size)
34 import Gargantext.Core.Text.List.Social (flowSocialList, flowSocialList', FlowSocialListPriority(..), invertForw)
35 import Gargantext.Core.Text.List.Social.Scores (FlowListScores)
36 import Gargantext.Core.Text.List.Group
37 import Gargantext.Core.Text.List.Group.WithStem
38 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
39 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
40 import Gargantext.Core.Types.Individu (User(..))
41 import Gargantext.Database.Admin.Types.Node (NodeId)
42 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
43 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
44 import Gargantext.Database.Prelude (CmdM)
45 import Gargantext.Database.Query.Table.Node (defaultList)
46 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
47 import Gargantext.Database.Query.Tree.Error (HasTreeError)
48 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
49 import Gargantext.Prelude
50
51
52 -- | TODO improve grouping functions of Authors, Sources, Institutes..
53 buildNgramsLists :: ( RepoCmdM env err m
54 , CmdM env err m
55 , HasTreeError err
56 , HasNodeError err
57 )
58 => User
59 -> GroupParams
60 -> UserCorpusId
61 -> MasterCorpusId
62 -> m (Map NgramsType [NgramsElement])
63 buildNgramsLists user gp uCid mCid = do
64 ngTerms <- buildNgramsTermsList user uCid mCid gp
65 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
66 [ (Authors , MapListSize 9)
67 , (Sources , MapListSize 9)
68 , (Institutes, MapListSize 9)
69 ]
70
71 pure $ Map.unions $ [ngTerms] <> othersTerms
72
73
74 data MapListSize = MapListSize { unMapListSize :: !Int }
75
76 buildNgramsOthersList ::( HasNodeError err
77 , CmdM env err m
78 , RepoCmdM env err m
79 , HasTreeError err
80 )
81 => User
82 -> UserCorpusId
83 -> (Text -> Text)
84 -> (NgramsType, MapListSize)
85 -> m (Map NgramsType [NgramsElement])
86 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
87 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
88
89 socialLists' :: Map Text FlowListScores
90 <- flowSocialList' MySelfFirst user nt (Set.fromList $ Map.keys ngs')
91 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
92
93 let
94 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
95 grouped' = toGroupedText groupParams socialLists' ngs'
96
97 -- 8< 8< 8< 8< 8< 8< 8<
98 let
99 ngs :: Map Text (Set Text, Set NodeId) = groupNodesByNgramsWith groupIt ngs'
100 socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
101 -- >8 >8 >8 >8 >8 >8 >8
102
103 let
104 grouped = groupedTextWithStem (GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-} ) -- socialLists'
105 $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b))
106 $ ngs
107
108
109 let
110 groupedWithList = map (addListType (invertForw socialLists)) grouped
111 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm)
112 groupedWithList
113 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm)
114 tailTerms
115
116 listSize = mapListSize - (List.length mapTerms)
117 (mapTerms', candiTerms) = List.splitAt listSize
118 $ List.sortOn (Down . _gt_score)
119 $ Map.elems tailTerms'
120
121 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
122 <> (List.concat $ map toNgramsElement mapTerms )
123 <> (List.concat $ map toNgramsElement
124 $ map (set gt_listType (Just MapTerm )) mapTerms' )
125 <> (List.concat $ map toNgramsElement
126 $ map (set gt_listType (Just CandidateTerm)) candiTerms)
127 )]
128
129 -- TODO use ListIds
130 buildNgramsTermsList :: ( HasNodeError err
131 , CmdM env err m
132 , RepoCmdM env err m
133 , HasTreeError err
134 )
135 => User
136 -> UserCorpusId
137 -> MasterCorpusId
138 -> GroupParams
139 -> m (Map NgramsType [NgramsElement])
140 buildNgramsTermsList user uCid mCid groupParams = do
141
142 -- Computing global speGen score
143 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
144
145 -- printDebug "head candidates" (List.take 10 $ allTerms)
146 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
147
148 -- First remove stops terms
149 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
150 -- printDebug "\n * socialLists * \n" socialLists
151
152 -- Grouping the ngrams and keeping the maximum score for label
153 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
154
155 groupedWithList = map (addListType (invertForw socialLists)) grouped
156
157 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
158 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
159
160 -- printDebug "\n * stopTerms * \n" stopTerms
161 -- splitting monterms and multiterms to take proportional candidates
162 let
163 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
164 monoSize = 0.4 :: Double
165 multSize = 1 - monoSize
166
167 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
168
169 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
170 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
171
172 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
173 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
174 -- printDebug "groupedMultHead" (List.length groupedMultHead)
175 -- printDebug "groupedMultTail" (List.length groupedMultTail)
176
177 let
178 -- Get Local Scores now for selected grouped ngrams
179 selectedTerms = Set.toList $ List.foldl'
180 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
181 $ Set.insert l' g
182 )
183 Set.empty
184 (groupedMonoHead <> groupedMultHead)
185
186 -- TO remove (and remove HasNodeError instance)
187 userListId <- defaultList uCid
188 masterListId <- defaultList mCid
189
190 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
191
192 let
193 mapGroups = Map.fromList
194 $ map (\g -> (g ^. gt_stem, g))
195 $ groupedMonoHead <> groupedMultHead
196
197 -- grouping with Set NodeId
198 contextsAdded = foldl' (\mapGroups' k ->
199 let k' = ngramsGroup groupParams k in
200 case Map.lookup k' mapGroups' of
201 Nothing -> mapGroups'
202 Just g -> case Map.lookup k mapTextDocIds of
203 Nothing -> mapGroups'
204 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
205 )
206 mapGroups
207 $ Map.keys mapTextDocIds
208
209 -- compute cooccurrences
210 mapCooc = Map.filter (>2)
211 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
212 | (t1, s1) <- mapStemNodeIds
213 , (t2, s2) <- mapStemNodeIds
214 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
215 ]
216 where
217 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
218 -- printDebug "mapCooc" mapCooc
219
220 let
221 -- computing scores
222 mapScores f = Map.fromList
223 $ map (\(Scored t g s') -> (t, f (g,s')))
224 $ normalizeGlobal
225 $ map normalizeLocal
226 $ scored' mapCooc
227
228 groupsWithScores = catMaybes
229 $ map (\(stem, g)
230 -> case Map.lookup stem mapScores' of
231 Nothing -> Nothing
232 Just s' -> Just $ g { _gt_score = s'}
233 ) $ Map.toList contextsAdded
234 where
235 mapScores' = mapScores identity
236 -- adapt2 TOCHECK with DC
237 -- printDebug "groupsWithScores" groupsWithScores
238 let
239 -- sort / partition / split
240 -- filter mono/multi again
241 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
242 -- filter with max score
243 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
244
245 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
246 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
247
248 -- splitAt
249 let
250 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
251 inclSize = 0.4 :: Double
252 exclSize = 1 - inclSize
253 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
254
255 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
256 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
257
258 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
259 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
260
261
262 -- Final Step building the Typed list
263 termListHead = maps <> cands
264 where
265 maps = set gt_listType (Just MapTerm)
266 <$> monoScoredInclHead
267 <> monoScoredExclHead
268 <> multScoredInclHead
269 <> multScoredExclHead
270
271 cands = set gt_listType (Just CandidateTerm)
272 <$> monoScoredInclTail
273 <> monoScoredExclTail
274 <> multScoredInclTail
275 <> multScoredExclTail
276
277 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
278
279 -- printDebug "monoScoredInclHead" monoScoredInclHead
280 -- printDebug "monoScoredExclHead" monoScoredExclTail
281 -- printDebug "multScoredInclHead" multScoredInclHead
282 -- printDebug "multScoredExclTail" multScoredExclTail
283
284 let result = Map.unionsWith (<>)
285 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
286 <> (List.concat $ map toNgramsElement $ termListTail)
287 <> (List.concat $ map toNgramsElement $ stopTerms)
288 )]
289 ]
290 -- printDebug "\n result \n" r
291 pure result
292
293
294
295 toNgramsElement :: GroupedText a -> [NgramsElement]
296 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
297 [parentElem] <> childrenElems
298 where
299 parent = label
300 children = Set.toList setNgrams
301 parentElem = mkNgramsElement (NgramsTerm parent)
302 (fromMaybe CandidateTerm listType)
303 Nothing
304 (mSetFromList (NgramsTerm <$> children))
305 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
306 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
307 (mSetFromList [])
308 ) (NgramsTerm <$> children)
309
310
311 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
312 toGargList l n = (l,n)
313
314
315 isStopTerm :: StopSize -> Text -> Bool
316 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
317 where
318 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
319
320 ------------------------------------------------------------------------------