]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FEAT] MergeWith stem done (before flow integration).
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18
19 import Control.Lens ((^.), set, over)
20 import Data.Map (Map)
21 import Data.Maybe (catMaybes)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (both)
27 import Gargantext.API.Ngrams.Types (NgramsElement)
28 import Gargantext.API.Ngrams.Types (RepoCmdM)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
43 import Gargantext.Database.Query.Tree.Error (HasTreeError)
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import qualified Data.Char as Char
47 import qualified Data.List as List
48 import qualified Data.Map as Map
49 import qualified Data.Set as Set
50 import qualified Data.Text as Text
51
52
53 -- | TODO improve grouping functions of Authors, Sources, Institutes..
54 buildNgramsLists :: ( RepoCmdM env err m
55 , CmdM env err m
56 , HasTreeError err
57 , HasNodeError err
58 )
59 => User
60 -> GroupParams
61 -> UserCorpusId
62 -> MasterCorpusId
63 -> m (Map NgramsType [NgramsElement])
64 buildNgramsLists user gp uCid mCid = do
65 ngTerms <- buildNgramsTermsList user uCid mCid gp
66 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
67 [ (Authors , MapListSize 9)
68 , (Sources , MapListSize 9)
69 , (Institutes, MapListSize 9)
70 ]
71
72 pure $ Map.unions $ [ngTerms] <> othersTerms
73
74
75 data MapListSize = MapListSize { unMapListSize :: !Int }
76
77 buildNgramsOthersList ::( HasNodeError err
78 , CmdM env err m
79 , RepoCmdM env err m
80 , HasTreeError err
81 )
82 => User
83 -> UserCorpusId
84 -> GroupParams
85 -> (NgramsType, MapListSize)
86 -> m (Map NgramsType [NgramsElement])
87 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
88 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
89
90 socialLists' :: FlowCont Text FlowListScores
91 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
92 $ Map.fromList
93 $ List.zip (Map.keys ngs')
94 (List.cycle [mempty])
95 )
96 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
97
98 {-
99 printDebug "flowSocialList'"
100 $ Map.filter (not . ((==) Map.empty) . view fls_parents)
101 $ view flc_scores socialLists'
102 -}
103
104 let
105 groupedWithList = toGroupedTreeText groupIt socialLists' ngs'
106
107 {-
108 printDebug "groupedWithList"
109 $ Map.map (\v -> (view gt_label v, view gt_children v))
110 $ Map.filter (\v -> (Set.size $ view gt_children v) > 0)
111 $ groupedWithList
112 -}
113
114 let
115 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) groupedWithList
116 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
117
118 listSize = mapListSize - (List.length mapTerms)
119 (mapTerms', candiTerms) = both Map.fromList
120 $ List.splitAt listSize
121 $ List.sortOn (Down . viewScore . snd)
122 $ Map.toList tailTerms'
123
124 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
125 <> (toNgramsElement mapTerms )
126 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
127 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
128 )]
129
130
131 -- TODO use ListIds
132 buildNgramsTermsList :: ( HasNodeError err
133 , CmdM env err m
134 , RepoCmdM env err m
135 , HasTreeError err
136 )
137 => User
138 -> UserCorpusId
139 -> MasterCorpusId
140 -> GroupParams
141 -> m (Map NgramsType [NgramsElement])
142 buildNgramsTermsList user uCid mCid groupParams = do
143
144 -- Computing global speGen score
145 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
146
147 -- printDebug "head candidates" (List.take 10 $ allTerms)
148 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
149
150 -- First remove stops terms
151 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
152 -- printDebug "\n * socialLists * \n" socialLists
153
154 -- Grouping the ngrams and keeping the maximum score for label
155 let grouped = groupedTextWithStem ( GroupedTextParams (groupWith groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
156
157 groupedWithList = map (addListType (invertForw socialLists)) grouped
158
159 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
160 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
161
162 -- printDebug "\n * stopTerms * \n" stopTerms
163 -- splitting monterms and multiterms to take proportional candidates
164 let
165 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
166 monoSize = 0.4 :: Double
167 multSize = 1 - monoSize
168
169 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
170
171 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
172 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
173
174 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
175 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
176 -- printDebug "groupedMultHead" (List.length groupedMultHead)
177 -- printDebug "groupedMultTail" (List.length groupedMultTail)
178
179 let
180 -- Get Local Scores now for selected grouped ngrams
181 selectedTerms = Set.toList $ List.foldl'
182 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
183 $ Set.insert l' g
184 )
185 Set.empty
186 (groupedMonoHead <> groupedMultHead)
187
188 -- TO remove (and remove HasNodeError instance)
189 userListId <- defaultList uCid
190 masterListId <- defaultList mCid
191
192 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
193 [userListId, masterListId]
194 NgramsTerms
195 selectedTerms
196
197 let
198 mapGroups = Map.fromList
199 $ map (\g -> (g ^. gt_stem, g))
200 $ groupedMonoHead <> groupedMultHead
201
202 -- grouping with Set NodeId
203 contextsAdded = foldl' (\mapGroups' k ->
204 let k' = groupWith groupParams k in
205 case Map.lookup k' mapGroups' of
206 Nothing -> mapGroups'
207 Just g -> case Map.lookup k mapTextDocIds of
208 Nothing -> mapGroups'
209 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
210 )
211 mapGroups
212 $ Map.keys mapTextDocIds
213
214 -- compute cooccurrences
215 mapCooc = Map.filter (>2)
216 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
217 | (t1, s1) <- mapStemNodeIds
218 , (t2, s2) <- mapStemNodeIds
219 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
220 ]
221 where
222 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
223 -- printDebug "mapCooc" mapCooc
224
225 let
226 -- computing scores
227 mapScores f = Map.fromList
228 $ map (\(Scored t g s') -> (t, f (g,s')))
229 $ normalizeGlobal
230 $ map normalizeLocal
231 $ scored' mapCooc
232
233 groupsWithScores = catMaybes
234 $ map (\(stem, g)
235 -> case Map.lookup stem mapScores' of
236 Nothing -> Nothing
237 Just s' -> Just $ g { _gt_score = s'}
238 ) $ Map.toList contextsAdded
239 where
240 mapScores' = mapScores identity
241 -- adapt2 TOCHECK with DC
242 -- printDebug "groupsWithScores" groupsWithScores
243 let
244 -- sort / partition / split
245 -- filter mono/multi again
246 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
247 -- filter with max score
248 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
249
250 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
251 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
252
253 -- splitAt
254 let
255 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
256 inclSize = 0.4 :: Double
257 exclSize = 1 - inclSize
258 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
259
260 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
261 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
262
263 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
264 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
265
266
267 -- Final Step building the Typed list
268 termListHead = maps <> cands
269 where
270 maps = set gt_listType (Just MapTerm)
271 <$> monoScoredInclHead
272 <> monoScoredExclHead
273 <> multScoredInclHead
274 <> multScoredExclHead
275
276 cands = set gt_listType (Just CandidateTerm)
277 <$> monoScoredInclTail
278 <> monoScoredExclTail
279 <> multScoredInclTail
280 <> multScoredExclTail
281
282 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
283
284 -- printDebug "monoScoredInclHead" monoScoredInclHead
285 -- printDebug "monoScoredExclHead" monoScoredExclTail
286 -- printDebug "multScoredInclHead" multScoredInclHead
287 -- printDebug "multScoredExclTail" multScoredExclTail
288
289 let result = Map.unionsWith (<>)
290 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
291 <> (List.concat $ map toNgramsElement $ termListTail)
292 <> (List.concat $ map toNgramsElement $ stopTerms)
293 )]
294 ]
295 -- printDebug "\n result \n" r
296 pure result
297
298
299
300 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
301 toGargList l n = (l,n)
302
303
304 isStopTerm :: StopSize -> Text -> Bool
305 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
306 where
307 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
308
309 ------------------------------------------------------------------------------