]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FIX] bug in FlowCont Semigroup instance (intersection for cont)
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18
19 import Control.Lens ((^.), view, set, over)
20 import Data.Map (Map)
21 import Data.Maybe (catMaybes)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (both)
27 import Gargantext.API.Ngrams.Types (NgramsElement)
28 import Gargantext.API.Ngrams.Types (RepoCmdM)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
43 import Gargantext.Database.Query.Tree.Error (HasTreeError)
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import qualified Data.Char as Char
47 import qualified Data.List as List
48 import qualified Data.Map as Map
49 import qualified Data.Set as Set
50 import qualified Data.Text as Text
51
52
53 -- | TODO improve grouping functions of Authors, Sources, Institutes..
54 buildNgramsLists :: ( RepoCmdM env err m
55 , CmdM env err m
56 , HasTreeError err
57 , HasNodeError err
58 )
59 => User
60 -> GroupParams
61 -> UserCorpusId
62 -> MasterCorpusId
63 -> m (Map NgramsType [NgramsElement])
64 buildNgramsLists user gp uCid mCid = do
65 ngTerms <- buildNgramsTermsList user uCid mCid gp
66 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
67 [ (Authors , MapListSize 9)
68 , (Sources , MapListSize 9)
69 , (Institutes, MapListSize 9)
70 ]
71
72 pure $ Map.unions $ [ngTerms] <> othersTerms
73
74
75 data MapListSize = MapListSize { unMapListSize :: !Int }
76
77 buildNgramsOthersList ::( HasNodeError err
78 , CmdM env err m
79 , RepoCmdM env err m
80 , HasTreeError err
81 )
82 => User
83 -> UserCorpusId
84 -> GroupParams
85 -> (NgramsType, MapListSize)
86 -> m (Map NgramsType [NgramsElement])
87 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
88 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
89
90 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
91 socialLists' :: FlowCont Text FlowListScores
92 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
93 $ Map.fromList
94 $ List.zip (Map.keys ngs')
95 (List.cycle [mempty])
96 )
97
98 {-
99 printDebug "flowSocialList'"
100 $ Map.filter (not . ((==) Map.empty) . (view fls_parents))
101 $ view flc_scores socialLists'
102 -}
103 let
104 groupedWithList = toGroupedTreeText groupIt socialLists' ngs'
105
106 {-
107 printDebug "groupedWithList"
108 $ view flc_scores groupedWithList
109 -}
110
111 let
112 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList
113 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
114
115 listSize = mapListSize - (List.length mapTerms)
116 (mapTerms', candiTerms) = both Map.fromList
117 $ List.splitAt listSize
118 $ List.sortOn (Down . viewScore . snd)
119 $ Map.toList tailTerms'
120
121 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
122 <> (toNgramsElement mapTerms )
123 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
124 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
125 )]
126
127
128 -- TODO use ListIds
129 buildNgramsTermsList :: ( HasNodeError err
130 , CmdM env err m
131 , RepoCmdM env err m
132 , HasTreeError err
133 )
134 => User
135 -> UserCorpusId
136 -> MasterCorpusId
137 -> GroupParams
138 -> m (Map NgramsType [NgramsElement])
139 buildNgramsTermsList user uCid mCid groupParams = do
140
141 -- Computing global speGen score
142 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
143
144 -- printDebug "head candidates" (List.take 10 $ allTerms)
145 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
146
147 -- First remove stops terms
148 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
149 -- printDebug "\n * socialLists * \n" socialLists
150
151 -- Grouping the ngrams and keeping the maximum score for label
152 let grouped = groupedTextWithStem ( GroupedTextParams (groupWith groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
153
154 groupedWithList = map (addListType (invertForw socialLists)) grouped
155
156 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
157 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
158
159 -- printDebug "\n * stopTerms * \n" stopTerms
160 -- splitting monterms and multiterms to take proportional candidates
161 let
162 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
163 monoSize = 0.4 :: Double
164 multSize = 1 - monoSize
165
166 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
167
168 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
169 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
170
171 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
172 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
173 -- printDebug "groupedMultHead" (List.length groupedMultHead)
174 -- printDebug "groupedMultTail" (List.length groupedMultTail)
175
176 let
177 -- Get Local Scores now for selected grouped ngrams
178 -- TODO HasTerms
179 selectedTerms = Set.toList $ List.foldl'
180 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
181 $ Set.insert l' g
182 )
183 Set.empty
184 (groupedMonoHead <> groupedMultHead)
185
186 -- TO remove (and remove HasNodeError instance)
187 userListId <- defaultList uCid
188 masterListId <- defaultList mCid
189
190 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
191 [userListId, masterListId]
192 NgramsTerms
193 selectedTerms
194
195 let
196 mapGroups = Map.fromList
197 $ map (\g -> (g ^. gt_stem, g))
198 $ groupedMonoHead <> groupedMultHead
199
200 -- grouping with Set NodeId
201 contextsAdded = foldl' (\mapGroups' k ->
202 let k' = groupWith groupParams k in
203 case Map.lookup k' mapGroups' of
204 Nothing -> mapGroups'
205 Just g -> case Map.lookup k mapTextDocIds of
206 Nothing -> mapGroups'
207 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
208 )
209 mapGroups
210 $ Map.keys mapTextDocIds
211
212 -- compute cooccurrences
213 mapCooc = Map.filter (>2)
214 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
215 | (t1, s1) <- mapStemNodeIds
216 , (t2, s2) <- mapStemNodeIds
217 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
218 ]
219 where
220 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
221 -- printDebug "mapCooc" mapCooc
222
223 let
224 -- computing scores
225 mapScores f = Map.fromList
226 $ map (\(Scored t g s') -> (t, f (g,s')))
227 $ normalizeGlobal
228 $ map normalizeLocal
229 $ scored' mapCooc
230
231 groupsWithScores = catMaybes
232 $ map (\(stem, g)
233 -> case Map.lookup stem mapScores' of
234 Nothing -> Nothing
235 Just s' -> Just $ g { _gt_score = s'}
236 ) $ Map.toList contextsAdded
237 where
238 mapScores' = mapScores identity
239 -- adapt2 TOCHECK with DC
240 -- printDebug "groupsWithScores" groupsWithScores
241 let
242 -- sort / partition / split
243 -- filter mono/multi again
244 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
245 -- filter with max score
246 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
247
248 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
249 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
250
251 -- splitAt
252 let
253 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
254 inclSize = 0.4 :: Double
255 exclSize = 1 - inclSize
256 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
257
258 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
259 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
260
261 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
262 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
263
264
265 -- Final Step building the Typed list
266 termListHead = maps <> cands
267 where
268 maps = set gt_listType (Just MapTerm)
269 <$> monoScoredInclHead
270 <> monoScoredExclHead
271 <> multScoredInclHead
272 <> multScoredExclHead
273
274 cands = set gt_listType (Just CandidateTerm)
275 <$> monoScoredInclTail
276 <> monoScoredExclTail
277 <> multScoredInclTail
278 <> multScoredExclTail
279
280 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
281
282 -- printDebug "monoScoredInclHead" monoScoredInclHead
283 -- printDebug "monoScoredExclHead" monoScoredExclTail
284 -- printDebug "multScoredInclHead" multScoredInclHead
285 -- printDebug "multScoredExclTail" multScoredExclTail
286
287 let result = Map.unionsWith (<>)
288 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
289 <> (List.concat $ map toNgramsElement $ termListTail)
290 <> (List.concat $ map toNgramsElement $ stopTerms)
291 )]
292 ]
293 -- printDebug "\n result \n" r
294 pure result
295
296
297
298 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
299 toGargList l n = (l,n)
300
301
302 isStopTerm :: StopSize -> Text -> Bool
303 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
304 where
305 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
306
307 ------------------------------------------------------------------------------