]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[Types] organization, using more Lenses
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18
19 import Control.Lens ((^.), set, view, over)
20 import Data.Map (Map)
21 import Data.Maybe (fromMaybe, catMaybes)
22 import Data.Ord (Down(..))
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text.List.Group
28 import Gargantext.Core.Text.List.Group.WithStem
29 import Gargantext.Core.Text.List.Social
30 import Gargantext.Core.Text.List.Social.Prelude
31 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
32 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
33 import Gargantext.Core.Types.Individu (User(..))
34 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
35 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
36 import Gargantext.Database.Admin.Types.Node (NodeId)
37 import Gargantext.Database.Prelude (CmdM)
38 import Gargantext.Database.Query.Table.Node (defaultList)
39 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
40 import Gargantext.Database.Query.Tree.Error (HasTreeError)
41 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
42 import Gargantext.Prelude
43 import qualified Data.Char as Char
44 import qualified Data.List as List
45 import qualified Data.Map as Map
46 import qualified Data.Set as Set
47 import qualified Data.Text as Text
48
49
50 -- | TODO improve grouping functions of Authors, Sources, Institutes..
51 buildNgramsLists :: ( RepoCmdM env err m
52 , CmdM env err m
53 , HasTreeError err
54 , HasNodeError err
55 )
56 => User
57 -> GroupParams
58 -> UserCorpusId
59 -> MasterCorpusId
60 -> m (Map NgramsType [NgramsElement])
61 buildNgramsLists user gp uCid mCid = do
62 ngTerms <- buildNgramsTermsList user uCid mCid gp
63 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
64 [ (Authors , MapListSize 9)
65 , (Sources , MapListSize 9)
66 , (Institutes, MapListSize 9)
67 ]
68
69 pure $ Map.unions $ [ngTerms] <> othersTerms
70
71
72 data MapListSize = MapListSize { unMapListSize :: !Int }
73
74 buildNgramsOthersList ::( HasNodeError err
75 , CmdM env err m
76 , RepoCmdM env err m
77 , HasTreeError err
78 )
79 => User
80 -> UserCorpusId
81 -> (Text -> Text)
82 -> (NgramsType, MapListSize)
83 -> m (Map NgramsType [NgramsElement])
84 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
85 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
86
87 socialLists' :: FlowCont Text FlowListScores
88 <- flowSocialList' MySelfFirst user nt (FlowCont Map.empty $ Set.fromList $ Map.keys ngs')
89 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
90
91 printDebug "flowSocialList'"
92 $ Map.filter (not . ((==) Map.empty) . view fls_parents)
93 $ view flc_scores socialLists'
94
95 let
96 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
97 groupedWithList = toGroupedText groupParams (view flc_scores socialLists') ngs'
98
99 printDebug "groupedWithList"
100 $ Map.map (\v -> (view gt_label v, view gt_children v))
101 $ Map.filter (\v -> (Set.size $ view gt_children v) > 0)
102 $ groupedWithList
103
104 let
105 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
106 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
107
108 listSize = mapListSize - (List.length mapTerms)
109 (mapTerms', candiTerms) = List.splitAt listSize
110 $ List.sortOn (Down . _gt_score)
111 $ Map.elems tailTerms'
112
113 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
114 <> (List.concat $ map toNgramsElement mapTerms )
115 <> (List.concat $ map toNgramsElement
116 $ map (set gt_listType (Just MapTerm )) mapTerms' )
117 <> (List.concat $ map toNgramsElement
118 $ map (set gt_listType (Just CandidateTerm)) candiTerms)
119 )]
120
121
122 -- TODO use ListIds
123 buildNgramsTermsList :: ( HasNodeError err
124 , CmdM env err m
125 , RepoCmdM env err m
126 , HasTreeError err
127 )
128 => User
129 -> UserCorpusId
130 -> MasterCorpusId
131 -> GroupParams
132 -> m (Map NgramsType [NgramsElement])
133 buildNgramsTermsList user uCid mCid groupParams = do
134
135 -- Computing global speGen score
136 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
137
138 -- printDebug "head candidates" (List.take 10 $ allTerms)
139 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
140
141 -- First remove stops terms
142 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
143 -- printDebug "\n * socialLists * \n" socialLists
144
145 -- Grouping the ngrams and keeping the maximum score for label
146 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
147
148 groupedWithList = map (addListType (invertForw socialLists)) grouped
149
150 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
151 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
152
153 -- printDebug "\n * stopTerms * \n" stopTerms
154 -- splitting monterms and multiterms to take proportional candidates
155 let
156 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
157 monoSize = 0.4 :: Double
158 multSize = 1 - monoSize
159
160 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
161
162 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
163 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
164
165 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
166 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
167 -- printDebug "groupedMultHead" (List.length groupedMultHead)
168 -- printDebug "groupedMultTail" (List.length groupedMultTail)
169
170 let
171 -- Get Local Scores now for selected grouped ngrams
172 selectedTerms = Set.toList $ List.foldl'
173 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
174 $ Set.insert l' g
175 )
176 Set.empty
177 (groupedMonoHead <> groupedMultHead)
178
179 -- TO remove (and remove HasNodeError instance)
180 userListId <- defaultList uCid
181 masterListId <- defaultList mCid
182
183 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
184
185 let
186 mapGroups = Map.fromList
187 $ map (\g -> (g ^. gt_stem, g))
188 $ groupedMonoHead <> groupedMultHead
189
190 -- grouping with Set NodeId
191 contextsAdded = foldl' (\mapGroups' k ->
192 let k' = ngramsGroup groupParams k in
193 case Map.lookup k' mapGroups' of
194 Nothing -> mapGroups'
195 Just g -> case Map.lookup k mapTextDocIds of
196 Nothing -> mapGroups'
197 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
198 )
199 mapGroups
200 $ Map.keys mapTextDocIds
201
202 -- compute cooccurrences
203 mapCooc = Map.filter (>2)
204 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
205 | (t1, s1) <- mapStemNodeIds
206 , (t2, s2) <- mapStemNodeIds
207 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
208 ]
209 where
210 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
211 -- printDebug "mapCooc" mapCooc
212
213 let
214 -- computing scores
215 mapScores f = Map.fromList
216 $ map (\(Scored t g s') -> (t, f (g,s')))
217 $ normalizeGlobal
218 $ map normalizeLocal
219 $ scored' mapCooc
220
221 groupsWithScores = catMaybes
222 $ map (\(stem, g)
223 -> case Map.lookup stem mapScores' of
224 Nothing -> Nothing
225 Just s' -> Just $ g { _gt_score = s'}
226 ) $ Map.toList contextsAdded
227 where
228 mapScores' = mapScores identity
229 -- adapt2 TOCHECK with DC
230 -- printDebug "groupsWithScores" groupsWithScores
231 let
232 -- sort / partition / split
233 -- filter mono/multi again
234 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
235 -- filter with max score
236 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
237
238 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
239 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
240
241 -- splitAt
242 let
243 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
244 inclSize = 0.4 :: Double
245 exclSize = 1 - inclSize
246 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
247
248 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
249 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
250
251 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
252 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
253
254
255 -- Final Step building the Typed list
256 termListHead = maps <> cands
257 where
258 maps = set gt_listType (Just MapTerm)
259 <$> monoScoredInclHead
260 <> monoScoredExclHead
261 <> multScoredInclHead
262 <> multScoredExclHead
263
264 cands = set gt_listType (Just CandidateTerm)
265 <$> monoScoredInclTail
266 <> monoScoredExclTail
267 <> multScoredInclTail
268 <> multScoredExclTail
269
270 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
271
272 -- printDebug "monoScoredInclHead" monoScoredInclHead
273 -- printDebug "monoScoredExclHead" monoScoredExclTail
274 -- printDebug "multScoredInclHead" multScoredInclHead
275 -- printDebug "multScoredExclTail" multScoredExclTail
276
277 let result = Map.unionsWith (<>)
278 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
279 <> (List.concat $ map toNgramsElement $ termListTail)
280 <> (List.concat $ map toNgramsElement $ stopTerms)
281 )]
282 ]
283 -- printDebug "\n result \n" r
284 pure result
285
286
287
288 toNgramsElement :: GroupedText a -> [NgramsElement]
289 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
290 [parentElem] <> childrenElems
291 where
292 parent = label
293 children = Set.toList setNgrams
294 parentElem = mkNgramsElement (NgramsTerm parent)
295 (fromMaybe CandidateTerm listType)
296 Nothing
297 (mSetFromList (NgramsTerm <$> children))
298 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
299 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
300 (mSetFromList [])
301 ) (NgramsTerm <$> children)
302
303
304 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
305 toGargList l n = (l,n)
306
307
308 isStopTerm :: StopSize -> Text -> Bool
309 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
310 where
311 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
312
313 ------------------------------------------------------------------------------