{-|
Module      : Gargantext.Core.Text.List
Description : Tools to build lists
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE TemplateHaskell #-}

module Gargantext.Core.Text.List
  where

import Control.Lens ((^.), set)
import Data.Maybe (fromMaybe, catMaybes)
import Data.Ord (Down(..))
import Data.Map (Map)
import Data.Set (Set)
import Data.Text (Text)
import qualified Data.Char as Char
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Text as Text

-- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
import Gargantext.API.Ngrams.Types (RepoCmdM)
import Gargantext.Core.Text.List.Social (flowSocialList, flowSocialList', FlowSocialListPriority(..), invertForw)
import Gargantext.Core.Text.List.Social.Group (FlowListScores)
import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
import Gargantext.Core.Text.Group
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Node (defaultList)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Prelude

-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: ( RepoCmdM env err m
                    , CmdM     env err m
                    , HasTreeError err
                    , HasNodeError err
                    )
                 => User
                 -> GroupParams
                 -> UserCorpusId
                 -> MasterCorpusId
                 -> m (Map NgramsType [NgramsElement])
buildNgramsLists user gp uCid mCid = do
  ngTerms     <- buildNgramsTermsList user uCid mCid gp
  othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
                      [ (Authors   , MapListSize 9)
                      , (Sources   , MapListSize 9)
                      , (Institutes, MapListSize 9)
                      ]

  pure $ Map.unions $ [ngTerms] <> othersTerms

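-- 'buildNgramsLists' returns one entry per 'NgramsType': the
-- Authors/Sources/Institutes lists above are built with the identity grouping
-- and capped at 9 MapTerm entries each (see 'MapListSize'), while NgramsTerms
-- goes through the TFICF and co-occurrence pipeline of 'buildNgramsTermsList'.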

data MapListSize = MapListSize { unMapListSize :: !Int }

buildNgramsOthersList :: ( HasNodeError err
                         , CmdM     env err m
                         , RepoCmdM env err m
                         , HasTreeError err
                         )
                      => User
                      -> UserCorpusId
                      -> (Text -> Text)
                      -> (NgramsType, MapListSize)
                      -> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
  ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt

  socialLists' :: Map Text FlowListScores
    <- flowSocialList' MySelfFirst user nt (Set.fromList $ Map.keys ngs')
  -- PrivateFirst for first developments since Public NodeMode is not implemented yet

  -- 8< 8< 8< 8< 8< 8< 8<
  let
    ngs :: Map Text (Set Text, Set NodeId) = groupNodesByNgramsWith groupIt ngs'
  socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
  -- >8 >8 >8 >8 >8 >8 >8

  let
    grouped = toGroupedText (GroupedTextParams groupIt (Set.size . snd) fst snd)
            $ Map.toList
            $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b))
            $ ngs

  let
    groupedWithList = map (addListType (invertForw socialLists)) grouped

    (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm)
                                           groupedWithList
    (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm)
                                           tailTerms

    listSize = mapListSize - (List.length mapTerms)
    (mapTerms', candiTerms) = List.splitAt listSize
                            $ List.sortOn (Down . _gt_score)
                            $ Map.elems tailTerms'
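    -- Worked example of the sizing above (illustration only, values assumed):
    -- with MapListSize 9, if the social lists already flag 3 groups as
    -- MapTerm, then listSize = 9 - 3 = 6, so the 6 highest-scored remaining
    -- groups are promoted to MapTerm and every other group becomes a
    -- CandidateTerm; StopTerm groups are kept but exported as such.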

  pure $ Map.fromList [( nt,    (List.concat $ map toNgramsElement stopTerms)
                             <> (List.concat $ map toNgramsElement mapTerms )
                             <> (List.concat $ map toNgramsElement
                                             $ map (set gt_listType (Just MapTerm      )) mapTerms' )
                             <> (List.concat $ map toNgramsElement
                                             $ map (set gt_listType (Just CandidateTerm)) candiTerms)
                        )]

-- TODO use ListIds
buildNgramsTermsList :: ( HasNodeError err
                        , CmdM     env err m
                        , RepoCmdM env err m
                        , HasTreeError err
                        )
                     => User
                     -> UserCorpusId
                     -> MasterCorpusId
                     -> GroupParams
                     -> m (Map NgramsType [NgramsElement])
buildNgramsTermsList user uCid mCid groupParams = do

  -- Computing the global speGen score
  allTerms :: [(Text, Double)] <- Map.toList <$> getTficf uCid mCid NgramsTerms

  -- printDebug "head candidates" (List.take 10 $ allTerms)
  -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)

  -- First, remove stop terms
  socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
  -- printDebug "\n * socialLists * \n" socialLists

  printDebug "\n * socialLists * \n" socialLists

  let
    _socialStop = fromMaybe Set.empty $ Map.lookup StopTerm      socialLists
    _socialMap  = fromMaybe Set.empty $ Map.lookup MapTerm       socialLists
    _socialCand = fromMaybe Set.empty $ Map.lookup CandidateTerm socialLists
    -- stopTerms ignored for now (need to be tagged already)
    -- (stopTerms, candidateTerms) = List.partition ((\t -> Set.member t socialStop) . fst) allTerms
    -- (mapTerms, candidateTerms)  = List.partition ((\t -> Set.member t socialMap ) . fst) allTerms

  -- printDebug "stopTerms" stopTerms

  -- Grouping the ngrams and keeping the maximum score for the label
  let grouped = toGroupedText (GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty)) allTerms

      groupedWithList = map (addListType (invertForw socialLists)) grouped

      (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
      (groupedMono, groupedMult)  = Map.partition (\t -> t ^. gt_size < 2) candidateTerms

  -- printDebug "\n * stopTerms * \n" stopTerms
  -- Splitting monoterms and multiterms to take a proportional number of candidates from each
  let
    listSizeGlobal = 2000 :: Double -- use a % of the list if too big, or an Int if too small
    monoSize = 0.4  :: Double
    multSize = 1 - monoSize

    splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns

    (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
    (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
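    -- With the values above (illustration only): round (0.4 * 2000) = 800
    -- monoterm groups and round (0.6 * 2000) = 1200 multiterm groups are kept
    -- in the heads; the tails become CandidateTerm via 'termListTail' below.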

  -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
  -- printDebug "groupedMonoTail" (List.length groupedMonoTail)
  -- printDebug "groupedMultHead" (List.length groupedMultHead)
  -- printDebug "groupedMultTail" (List.length groupedMultTail)

  let
    -- Get the local scores now, for the selected grouped ngrams only
    selectedTerms = Set.toList $ List.foldl'
                      (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
                                                            $ Set.insert l' g
                      )
                      Set.empty
                      (groupedMonoHead <> groupedMultHead)

  -- TODO: remove (and remove the HasNodeError instance)
  userListId   <- defaultList uCid
  masterListId <- defaultList mCid

  mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms

  let
    mapGroups = Map.fromList
              $ map (\g -> (g ^. gt_stem, g))
              $ groupedMonoHead <> groupedMultHead

    -- grouping with Set NodeId
    contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup groupParams k
                                             in case Map.lookup k' mapGroups' of
                                                  Nothing -> mapGroups'
                                                  Just g  -> case Map.lookup k mapTextDocIds of
                                                    Nothing -> mapGroups'
                                                    Just ns -> Map.insert k' (g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
                           )
                           mapGroups
                         $ Map.keys mapTextDocIds
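    -- For each raw term returned by 'getNodesByNgramsOnlyUser', re-derive its
    -- stem with 'ngramsGroup' and, when a selected group exists for that stem,
    -- union the term's document ids into the group's '_gt_nodes'.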

    -- compute the cooccurrences
    mapCooc = Map.filter (>2)
            $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
                           | (t1, s1) <- mapStemNodeIds
                           , (t2, s2) <- mapStemNodeIds
                           ]
      where
        mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
  -- printDebug "mapCooc" mapCooc
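  -- Illustration (made-up document ids): if stem A appears in documents
  -- {1,2,3,4} and stem B in {2,3,4,5}, their intersection has size 3, so the
  -- pair (A,B) passes the 'Map.filter (>2)' threshold; pairs sharing 2 or
  -- fewer documents are dropped.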

  let
    -- computing the scores
    mapScores f = Map.fromList
                $ map (\(Scored t g s') -> (t, f (g,s')))
                $ normalizeGlobal
                $ map normalizeLocal
                $ scored' mapCooc
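    -- Reading the chain bottom-up: 'scored'' turns the cooccurrence map into
    -- one 'Scored' value per term, each value is normalized locally, then the
    -- whole set is normalized globally before being keyed by term.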

    groupsWithScores = catMaybes
                     $ map (\(stem, g)
                             -> case Map.lookup stem mapScores' of
                                  Nothing -> Nothing
                                  Just s' -> Just $ g { _gt_score = s' }
                           ) $ Map.toList contextsAdded
      where
        mapScores' = mapScores identity
        -- adapt2 TOCHECK with DC
  -- printDebug "groupsWithScores" groupsWithScores
  let
    -- sort / partition / split
    -- filter mono/multi terms again
    (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
    -- partition on the dominant score component
    partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )

    (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
    (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
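    -- '_gt_score' now holds the pair built by 'mapScores' above; groups whose
    -- first component exceeds the second (presumably genericity vs.
    -- specificity, cf. the "speGen" comment) land in the *Incl results, the
    -- others in *Excl.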

  -- splitAt
  let
    listSizeLocal = 1000 :: Double -- use a % of the list if too big, or an Int if too small
    inclSize = 0.4  :: Double
    exclSize = 1 - inclSize
    splitAt' n' = List.splitAt (round $ n' * listSizeLocal)

    (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
    (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl

    (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
    (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
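    -- Worked sizes with the constants above (illustration only):
    -- mono heads: round (0.4 * 0.4 / 2 * 1000) = 80 groups each (Incl and Excl),
    -- mult heads: round (0.6 * 0.6 / 2 * 1000) = 180 groups each,
    -- i.e. at most 80 + 80 + 180 + 180 = 520 MapTerm groups from this step.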


    -- Final step: building the typed list
    termListHead = maps <> cands
      where
        maps = set gt_listType (Just MapTerm)
           <$> monoScoredInclHead
            <> monoScoredExclHead
            <> multScoredInclHead
            <> multScoredExclHead

        cands = set gt_listType (Just CandidateTerm)
           <$> monoScoredInclTail
            <> monoScoredExclTail
            <> multScoredInclTail
            <> multScoredExclTail

    termListTail = map (set gt_listType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)

  -- printDebug "monoScoredInclHead" monoScoredInclHead
  -- printDebug "monoScoredExclHead" monoScoredExclTail
  -- printDebug "multScoredInclHead" multScoredInclHead
  -- printDebug "multScoredExclTail" multScoredExclTail

  let result = Map.unionsWith (<>)
        [ Map.fromList [( NgramsTerms,    (List.concat $ map toNgramsElement $ termListHead)
                                       <> (List.concat $ map toNgramsElement $ termListTail)
                                       <> (List.concat $ map toNgramsElement $ stopTerms)
                        )]
        ]
  -- printDebug "\n result \n" result
  pure result



toNgramsElement :: GroupedText a -> [NgramsElement]
toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
  [parentElem] <> childrenElems
    where
      parent   = label
      children = Set.toList setNgrams
      parentElem = mkNgramsElement (NgramsTerm parent)
                                   (fromMaybe CandidateTerm listType)
                                   Nothing
                                   (mSetFromList (NgramsTerm <$> children))
      childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm listType)
                                                 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
                                                 (mSetFromList [])
                          ) (NgramsTerm <$> children)
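-- For instance (hypothetical group): a GroupedText labelled "neural network"
-- whose set of grouped forms is {"neural networks"} yields a parent element
-- "neural network" plus one child "neural networks" whose root and parent both
-- point back to "neural network".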


toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
toGargList l n = (l,n)


isStopTerm :: StopSize -> Text -> Bool
isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  where
    isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
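-- Examples (illustrative): isStopTerm (StopSize 3) "of" is True (too short),
-- isStopTerm (StopSize 3) "covid-19" is True (contains a digit), and
-- isStopTerm (StopSize 3) "machine learning" is False (long enough, and every
-- character is either alphabetic or one of "- /()%").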

------------------------------------------------------------------------------