]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FIX] Shared lists is taken into account now
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18
19 import Control.Lens ((^.), view, over)
20 import Data.Map (Map)
21 import Data.Maybe (catMaybes)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (both)
27 import Gargantext.API.Ngrams.Types (NgramsElement)
28 import Gargantext.API.Ngrams.Types (RepoCmdM)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
43 import Gargantext.Database.Query.Tree.Error (HasTreeError)
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import qualified Data.Char as Char
47 import qualified Data.List as List
48 import qualified Data.Map as Map
49 import qualified Data.Set as Set
50 import qualified Data.Text as Text
51
52
53 -- | TODO improve grouping functions of Authors, Sources, Institutes..
54 buildNgramsLists :: ( RepoCmdM env err m
55 , CmdM env err m
56 , HasTreeError err
57 , HasNodeError err
58 )
59 => User
60 -> GroupParams
61 -> UserCorpusId
62 -> MasterCorpusId
63 -> m (Map NgramsType [NgramsElement])
64 buildNgramsLists user gp uCid mCid = do
65 ngTerms <- buildNgramsTermsList user uCid mCid gp
66 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
67 [ (Authors , MapListSize 9)
68 , (Sources , MapListSize 9)
69 , (Institutes, MapListSize 9)
70 ]
71
72 pure $ Map.unions $ [ngTerms] <> othersTerms
73
74
75 data MapListSize = MapListSize { unMapListSize :: !Int }
76
77 buildNgramsOthersList ::( HasNodeError err
78 , CmdM env err m
79 , RepoCmdM env err m
80 , HasTreeError err
81 )
82 => User
83 -> UserCorpusId
84 -> GroupParams
85 -> (NgramsType, MapListSize)
86 -> m (Map NgramsType [NgramsElement])
87 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
88 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
89
90 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
91 socialLists' :: FlowCont Text FlowListScores
92 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
93 $ Map.fromList
94 $ List.zip (Map.keys ngs')
95 (List.cycle [mempty])
96 )
97
98 {-
99 printDebug "flowSocialList'"
100 $ Map.filter (not . ((==) Map.empty) . (view fls_parents))
101 $ view flc_scores socialLists'
102 -}
103 let
104 groupedWithList = toGroupedTreeText groupIt socialLists' ngs'
105
106 printDebug "groupedWithList"
107 $ view flc_cont groupedWithList
108
109 let
110 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList
111 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
112
113 listSize = mapListSize - (List.length mapTerms)
114 (mapTerms', candiTerms) = both Map.fromList
115 $ List.splitAt listSize
116 $ List.sortOn (Down . viewScore . snd)
117 $ Map.toList tailTerms'
118
119 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
120 <> (toNgramsElement mapTerms )
121 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
122 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
123 )]
124
125
126 -- TODO use ListIds
127 buildNgramsTermsList :: ( HasNodeError err
128 , CmdM env err m
129 , RepoCmdM env err m
130 , HasTreeError err
131 )
132 => User
133 -> UserCorpusId
134 -> MasterCorpusId
135 -> GroupParams
136 -> m (Map NgramsType [NgramsElement])
137 buildNgramsTermsList user uCid mCid groupParams = do
138
139 -- Computing global speGen score
140 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
141
142 -- printDebug "head candidates" (List.take 10 $ allTerms)
143 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
144
145 -- First remove stops terms
146 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
147 -- printDebug "\n * socialLists * \n" socialLists
148
149 -- Grouping the ngrams and keeping the maximum score for label
150 let grouped = groupedTextWithStem ( GroupedTextParams (groupWith groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
151
152 groupedWithList = map (addListType (invertForw socialLists)) grouped
153
154 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType) groupedWithList
155 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
156 -- (groupedMono, groupedMult) = Map.partitionWithKey (\t -> t ^. gt_size < 2) candidateTerms
157
158 -- printDebug "\n * stopTerms * \n" stopTerms
159 -- splitting monterms and multiterms to take proportional candidates
160 let
161 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
162 monoSize = 0.4 :: Double
163 multSize = 1 - monoSize
164
165 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
166
167 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
168 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
169
170 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
171 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
172 -- printDebug "groupedMultHead" (List.length groupedMultHead)
173 -- printDebug "groupedMultTail" (List.length groupedMultTail)
174
175 let
176 -- Get Local Scores now for selected grouped ngrams
177 -- TODO HasTerms
178 selectedTerms = Set.toList $ List.foldl'
179 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
180 $ Set.insert l' g
181 )
182 Set.empty
183 (groupedMonoHead <> groupedMultHead)
184 -- selectedTerms = hasTerms (groupedMonoHead <> groupedMultHead)
185
186 -- TO remove (and remove HasNodeError instance)
187 userListId <- defaultList uCid
188 masterListId <- defaultList mCid
189
190 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
191 [userListId, masterListId]
192 NgramsTerms
193 selectedTerms
194
195 let
196 mapGroups = Map.fromList
197 $ map (\g -> (g ^. gt_stem, g))
198 $ groupedMonoHead <> groupedMultHead
199
200 -- grouping with Set NodeId
201 contextsAdded = foldl' (\mapGroups' k ->
202 let k' = groupWith groupParams k in
203 case Map.lookup k' mapGroups' of
204 Nothing -> mapGroups'
205 Just g -> case Map.lookup k mapTextDocIds of
206 Nothing -> mapGroups'
207 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
208 )
209 mapGroups
210 $ Map.keys mapTextDocIds
211
212 -- compute cooccurrences
213 mapCooc = Map.filter (>2)
214 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
215 | (t1, s1) <- mapStemNodeIds
216 , (t2, s2) <- mapStemNodeIds
217 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
218 ]
219 where
220 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
221 -- printDebug "mapCooc" mapCooc
222
223 let
224 -- computing scores
225 mapScores f = Map.fromList
226 $ map (\(Scored t g s') -> (t, f (g,s')))
227 $ normalizeGlobal
228 $ map normalizeLocal
229 $ scored' mapCooc
230
231 groupsWithScores = catMaybes
232 $ map (\(stem, g)
233 -> case Map.lookup stem mapScores' of
234 Nothing -> Nothing
235 Just s' -> Just $ g { _gt_score = s'}
236 ) $ Map.toList contextsAdded
237 where
238 mapScores' = mapScores identity
239 -- adapt2 TOCHECK with DC
240 -- printDebug "groupsWithScores" groupsWithScores
241 let
242 -- sort / partition / split
243 -- filter mono/multi again
244 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
245 -- filter with max score
246 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = viewScore g in s1 > s2 )
247
248 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
249 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
250
251 -- splitAt
252 let
253 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
254 inclSize = 0.4 :: Double
255 exclSize = 1 - inclSize
256 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
257
258 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . viewScore) monoScoredIncl
259 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . viewScore) monoScoredExcl
260
261 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . viewScore) multScoredIncl
262 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . viewScore) multScoredExcl
263
264
265 -- Final Step building the Typed list
266 termListHead = maps <> cands
267 where
268 maps = setListType (Just MapTerm)
269 <$> monoScoredInclHead
270 <> monoScoredExclHead
271 <> multScoredInclHead
272 <> multScoredExclHead
273
274 cands = setListType (Just CandidateTerm)
275 <$> monoScoredInclTail
276 <> monoScoredExclTail
277 <> multScoredInclTail
278 <> multScoredExclTail
279
280 termListTail = map (setListType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
281
282 -- printDebug "monoScoredInclHead" monoScoredInclHead
283 -- printDebug "monoScoredExclHead" monoScoredExclTail
284 -- printDebug "multScoredInclHead" multScoredInclHead
285 -- printDebug "multScoredExclTail" multScoredExclTail
286
287 let result = Map.unionsWith (<>)
288 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
289 <> (List.concat $ map toNgramsElement $ termListTail)
290 <> (List.concat $ map toNgramsElement $ stopTerms)
291 )]
292 ]
293 -- printDebug "\n result \n" r
294 pure result
295
296
297
298 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
299 toGargList l n = (l,n)
300
301
302 isStopTerm :: StopSize -> Text -> Bool
303 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
304 where
305 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
306
307 ------------------------------------------------------------------------------