]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FlowList] All instances for GroupedText Int (to be removed)
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18
19 import Control.Lens ((^.), set, view, over)
20 import Data.Map (Map)
21 import Data.Maybe (catMaybes)
22 import Data.Ord (Down(..))
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Gargantext.API.Ngrams.Types (NgramsElement)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text.List.Group
28 import Gargantext.Core.Text.List.Group.Prelude
29 import Gargantext.Core.Text.List.Group.WithStem
30 import Gargantext.Core.Text.List.Social
31 import Gargantext.Core.Text.List.Social.Prelude
32 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
33 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
34 import Gargantext.Core.Types.Individu (User(..))
35 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
36 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
37 import Gargantext.Database.Admin.Types.Node (NodeId)
38 import Gargantext.Database.Prelude (CmdM)
39 import Gargantext.Database.Query.Table.Node (defaultList)
40 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
41 import Gargantext.Database.Query.Tree.Error (HasTreeError)
42 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
43 import Gargantext.Prelude
44 import qualified Data.Char as Char
45 import qualified Data.List as List
46 import qualified Data.Map as Map
47 import qualified Data.Set as Set
48 import qualified Data.Text as Text
49
50
51 -- | TODO improve grouping functions of Authors, Sources, Institutes..
52 buildNgramsLists :: ( RepoCmdM env err m
53 , CmdM env err m
54 , HasTreeError err
55 , HasNodeError err
56 )
57 => User
58 -> GroupParams
59 -> UserCorpusId
60 -> MasterCorpusId
61 -> m (Map NgramsType [NgramsElement])
62 buildNgramsLists user gp uCid mCid = do
63 ngTerms <- buildNgramsTermsList user uCid mCid gp
64 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
65 [ (Authors , MapListSize 9)
66 , (Sources , MapListSize 9)
67 , (Institutes, MapListSize 9)
68 ]
69
70 pure $ Map.unions $ [ngTerms] <> othersTerms
71
72
73 data MapListSize = MapListSize { unMapListSize :: !Int }
74
75 buildNgramsOthersList ::( HasNodeError err
76 , CmdM env err m
77 , RepoCmdM env err m
78 , HasTreeError err
79 )
80 => User
81 -> UserCorpusId
82 -> (Text -> Text)
83 -> (NgramsType, MapListSize)
84 -> m (Map NgramsType [NgramsElement])
85 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
86 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
87
88 socialLists' :: FlowCont Text FlowListScores
89 <- flowSocialList' MySelfFirst user nt (FlowCont Map.empty $ Set.fromList $ Map.keys ngs')
90 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
91
92 {-
93 printDebug "flowSocialList'"
94 $ Map.filter (not . ((==) Map.empty) . view fls_parents)
95 $ view flc_scores socialLists'
96 -}
97
98 let
99 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
100 groupedWithList = toGroupedText groupParams (view flc_scores socialLists') ngs'
101
102 {-
103 printDebug "groupedWithList"
104 $ Map.map (\v -> (view gt_label v, view gt_children v))
105 $ Map.filter (\v -> (Set.size $ view gt_children v) > 0)
106 $ groupedWithList
107 -}
108
109 let
110 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) groupedWithList
111 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
112
113 listSize = mapListSize - (List.length mapTerms)
114 (mapTerms', candiTerms) = List.splitAt listSize
115 $ List.sortOn (Down . viewScore)
116 $ Map.elems tailTerms'
117
118 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
119 <> (toNgramsElement mapTerms )
120 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
121 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
122 )]
123
124
125 -- TODO use ListIds
126 buildNgramsTermsList :: ( HasNodeError err
127 , CmdM env err m
128 , RepoCmdM env err m
129 , HasTreeError err
130 )
131 => User
132 -> UserCorpusId
133 -> MasterCorpusId
134 -> GroupParams
135 -> m (Map NgramsType [NgramsElement])
136 buildNgramsTermsList user uCid mCid groupParams = do
137
138 -- Computing global speGen score
139 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
140
141 -- printDebug "head candidates" (List.take 10 $ allTerms)
142 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
143
144 -- First remove stops terms
145 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
146 -- printDebug "\n * socialLists * \n" socialLists
147
148 -- Grouping the ngrams and keeping the maximum score for label
149 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
150
151 groupedWithList = map (addListType (invertForw socialLists)) grouped
152
153 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
154 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
155
156 -- printDebug "\n * stopTerms * \n" stopTerms
157 -- splitting monterms and multiterms to take proportional candidates
158 let
159 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
160 monoSize = 0.4 :: Double
161 multSize = 1 - monoSize
162
163 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
164
165 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
166 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
167
168 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
169 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
170 -- printDebug "groupedMultHead" (List.length groupedMultHead)
171 -- printDebug "groupedMultTail" (List.length groupedMultTail)
172
173 let
174 -- Get Local Scores now for selected grouped ngrams
175 selectedTerms = Set.toList $ List.foldl'
176 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
177 $ Set.insert l' g
178 )
179 Set.empty
180 (groupedMonoHead <> groupedMultHead)
181
182 -- TO remove (and remove HasNodeError instance)
183 userListId <- defaultList uCid
184 masterListId <- defaultList mCid
185
186 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
187
188 let
189 mapGroups = Map.fromList
190 $ map (\g -> (g ^. gt_stem, g))
191 $ groupedMonoHead <> groupedMultHead
192
193 -- grouping with Set NodeId
194 contextsAdded = foldl' (\mapGroups' k ->
195 let k' = ngramsGroup groupParams k in
196 case Map.lookup k' mapGroups' of
197 Nothing -> mapGroups'
198 Just g -> case Map.lookup k mapTextDocIds of
199 Nothing -> mapGroups'
200 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
201 )
202 mapGroups
203 $ Map.keys mapTextDocIds
204
205 -- compute cooccurrences
206 mapCooc = Map.filter (>2)
207 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
208 | (t1, s1) <- mapStemNodeIds
209 , (t2, s2) <- mapStemNodeIds
210 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
211 ]
212 where
213 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
214 -- printDebug "mapCooc" mapCooc
215
216 let
217 -- computing scores
218 mapScores f = Map.fromList
219 $ map (\(Scored t g s') -> (t, f (g,s')))
220 $ normalizeGlobal
221 $ map normalizeLocal
222 $ scored' mapCooc
223
224 groupsWithScores = catMaybes
225 $ map (\(stem, g)
226 -> case Map.lookup stem mapScores' of
227 Nothing -> Nothing
228 Just s' -> Just $ g { _gt_score = s'}
229 ) $ Map.toList contextsAdded
230 where
231 mapScores' = mapScores identity
232 -- adapt2 TOCHECK with DC
233 -- printDebug "groupsWithScores" groupsWithScores
234 let
235 -- sort / partition / split
236 -- filter mono/multi again
237 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
238 -- filter with max score
239 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
240
241 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
242 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
243
244 -- splitAt
245 let
246 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
247 inclSize = 0.4 :: Double
248 exclSize = 1 - inclSize
249 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
250
251 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
252 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
253
254 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
255 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
256
257
258 -- Final Step building the Typed list
259 termListHead = maps <> cands
260 where
261 maps = set gt_listType (Just MapTerm)
262 <$> monoScoredInclHead
263 <> monoScoredExclHead
264 <> multScoredInclHead
265 <> multScoredExclHead
266
267 cands = set gt_listType (Just CandidateTerm)
268 <$> monoScoredInclTail
269 <> monoScoredExclTail
270 <> multScoredInclTail
271 <> multScoredExclTail
272
273 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
274
275 -- printDebug "monoScoredInclHead" monoScoredInclHead
276 -- printDebug "monoScoredExclHead" monoScoredExclTail
277 -- printDebug "multScoredInclHead" multScoredInclHead
278 -- printDebug "multScoredExclTail" multScoredExclTail
279
280 let result = Map.unionsWith (<>)
281 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
282 <> (List.concat $ map toNgramsElement $ termListTail)
283 <> (List.concat $ map toNgramsElement $ stopTerms)
284 )]
285 ]
286 -- printDebug "\n result \n" r
287 pure result
288
289
290
291 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
292 toGargList l n = (l,n)
293
294
295 isStopTerm :: StopSize -> Text -> Bool
296 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
297 where
298 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
299
300 ------------------------------------------------------------------------------