]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FlowCont] improving Type (to prepare group terms)
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18
19 import Control.Lens ((^.), set, over)
20 import Data.Map (Map)
21 import Data.Maybe (catMaybes)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (both)
27 import Gargantext.API.Ngrams.Types (NgramsElement)
28 import Gargantext.API.Ngrams.Types (RepoCmdM)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
43 import Gargantext.Database.Query.Tree.Error (HasTreeError)
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import qualified Data.Char as Char
47 import qualified Data.List as List
48 import qualified Data.Map as Map
49 import qualified Data.Set as Set
50 import qualified Data.Text as Text
51
52
53 -- | TODO improve grouping functions of Authors, Sources, Institutes..
54 buildNgramsLists :: ( RepoCmdM env err m
55 , CmdM env err m
56 , HasTreeError err
57 , HasNodeError err
58 )
59 => User
60 -> GroupParams
61 -> UserCorpusId
62 -> MasterCorpusId
63 -> m (Map NgramsType [NgramsElement])
64 buildNgramsLists user gp uCid mCid = do
65 ngTerms <- buildNgramsTermsList user uCid mCid gp
66 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
67 [ (Authors , MapListSize 9)
68 , (Sources , MapListSize 9)
69 , (Institutes, MapListSize 9)
70 ]
71
72 pure $ Map.unions $ [ngTerms] <> othersTerms
73
74
75 data MapListSize = MapListSize { unMapListSize :: !Int }
76
77 buildNgramsOthersList ::( HasNodeError err
78 , CmdM env err m
79 , RepoCmdM env err m
80 , HasTreeError err
81 )
82 => User
83 -> UserCorpusId
84 -> (Text -> Text)
85 -> (NgramsType, MapListSize)
86 -> m (Map NgramsType [NgramsElement])
87 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
88 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
89
90 socialLists' :: FlowCont Text FlowListScores
91 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
92 $ Map.fromList
93 $ List.zip (Map.keys ngs')
94 (List.cycle [mempty])
95 )
96 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
97
98 {-
99 printDebug "flowSocialList'"
100 $ Map.filter (not . ((==) Map.empty) . view fls_parents)
101 $ view flc_scores socialLists'
102 -}
103
104 let
105 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
106 groupedWithList = toGroupedTreeText groupParams socialLists' ngs'
107
108 {-
109 printDebug "groupedWithList"
110 $ Map.map (\v -> (view gt_label v, view gt_children v))
111 $ Map.filter (\v -> (Set.size $ view gt_children v) > 0)
112 $ groupedWithList
113 -}
114
115 let
116 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) groupedWithList
117 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
118
119 listSize = mapListSize - (List.length mapTerms)
120 (mapTerms', candiTerms) = both Map.fromList
121 $ List.splitAt listSize
122 $ List.sortOn (Down . viewScore . snd)
123 $ Map.toList tailTerms'
124
125 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
126 <> (toNgramsElement mapTerms )
127 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
128 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
129 )]
130
131
132 -- TODO use ListIds
133 buildNgramsTermsList :: ( HasNodeError err
134 , CmdM env err m
135 , RepoCmdM env err m
136 , HasTreeError err
137 )
138 => User
139 -> UserCorpusId
140 -> MasterCorpusId
141 -> GroupParams
142 -> m (Map NgramsType [NgramsElement])
143 buildNgramsTermsList user uCid mCid groupParams = do
144
145 -- Computing global speGen score
146 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
147
148 -- printDebug "head candidates" (List.take 10 $ allTerms)
149 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
150
151 -- First remove stops terms
152 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
153 -- printDebug "\n * socialLists * \n" socialLists
154
155 -- Grouping the ngrams and keeping the maximum score for label
156 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
157
158 groupedWithList = map (addListType (invertForw socialLists)) grouped
159
160 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
161 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
162
163 -- printDebug "\n * stopTerms * \n" stopTerms
164 -- splitting monterms and multiterms to take proportional candidates
165 let
166 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
167 monoSize = 0.4 :: Double
168 multSize = 1 - monoSize
169
170 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
171
172 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
173 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
174
175 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
176 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
177 -- printDebug "groupedMultHead" (List.length groupedMultHead)
178 -- printDebug "groupedMultTail" (List.length groupedMultTail)
179
180 let
181 -- Get Local Scores now for selected grouped ngrams
182 selectedTerms = Set.toList $ List.foldl'
183 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
184 $ Set.insert l' g
185 )
186 Set.empty
187 (groupedMonoHead <> groupedMultHead)
188
189 -- TO remove (and remove HasNodeError instance)
190 userListId <- defaultList uCid
191 masterListId <- defaultList mCid
192
193 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
194
195 let
196 mapGroups = Map.fromList
197 $ map (\g -> (g ^. gt_stem, g))
198 $ groupedMonoHead <> groupedMultHead
199
200 -- grouping with Set NodeId
201 contextsAdded = foldl' (\mapGroups' k ->
202 let k' = ngramsGroup groupParams k in
203 case Map.lookup k' mapGroups' of
204 Nothing -> mapGroups'
205 Just g -> case Map.lookup k mapTextDocIds of
206 Nothing -> mapGroups'
207 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
208 )
209 mapGroups
210 $ Map.keys mapTextDocIds
211
212 -- compute cooccurrences
213 mapCooc = Map.filter (>2)
214 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
215 | (t1, s1) <- mapStemNodeIds
216 , (t2, s2) <- mapStemNodeIds
217 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
218 ]
219 where
220 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
221 -- printDebug "mapCooc" mapCooc
222
223 let
224 -- computing scores
225 mapScores f = Map.fromList
226 $ map (\(Scored t g s') -> (t, f (g,s')))
227 $ normalizeGlobal
228 $ map normalizeLocal
229 $ scored' mapCooc
230
231 groupsWithScores = catMaybes
232 $ map (\(stem, g)
233 -> case Map.lookup stem mapScores' of
234 Nothing -> Nothing
235 Just s' -> Just $ g { _gt_score = s'}
236 ) $ Map.toList contextsAdded
237 where
238 mapScores' = mapScores identity
239 -- adapt2 TOCHECK with DC
240 -- printDebug "groupsWithScores" groupsWithScores
241 let
242 -- sort / partition / split
243 -- filter mono/multi again
244 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
245 -- filter with max score
246 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
247
248 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
249 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
250
251 -- splitAt
252 let
253 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
254 inclSize = 0.4 :: Double
255 exclSize = 1 - inclSize
256 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
257
258 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
259 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
260
261 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
262 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
263
264
265 -- Final Step building the Typed list
266 termListHead = maps <> cands
267 where
268 maps = set gt_listType (Just MapTerm)
269 <$> monoScoredInclHead
270 <> monoScoredExclHead
271 <> multScoredInclHead
272 <> multScoredExclHead
273
274 cands = set gt_listType (Just CandidateTerm)
275 <$> monoScoredInclTail
276 <> monoScoredExclTail
277 <> multScoredInclTail
278 <> multScoredExclTail
279
280 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
281
282 -- printDebug "monoScoredInclHead" monoScoredInclHead
283 -- printDebug "monoScoredExclHead" monoScoredExclTail
284 -- printDebug "multScoredInclHead" multScoredInclHead
285 -- printDebug "multScoredExclTail" multScoredExclTail
286
287 let result = Map.unionsWith (<>)
288 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
289 <> (List.concat $ map toNgramsElement $ termListTail)
290 <> (List.concat $ map toNgramsElement $ stopTerms)
291 )]
292 ]
293 -- printDebug "\n result \n" r
294 pure result
295
296
297
298 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
299 toGargList l n = (l,n)
300
301
302 isStopTerm :: StopSize -> Text -> Bool
303 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
304 where
305 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
306
307 ------------------------------------------------------------------------------