]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FIX] Cont can be empty now.
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens ((^.), view, over)
19 import Data.Map (Map)
20 import Data.Maybe (catMaybes)
21 import Data.Monoid (mempty)
22 import Data.Ord (Down(..))
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement)
27 import Gargantext.API.Ngrams.Types (RepoCmdM)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45 import qualified Data.Char as Char
46 import qualified Data.List as List
47 import qualified Data.Map as Map
48 import qualified Data.Set as Set
49 import qualified Data.Text as Text
50
51
52 -- | TODO improve grouping functions of Authors, Sources, Institutes..
53 buildNgramsLists :: ( RepoCmdM env err m
54 , CmdM env err m
55 , HasTreeError err
56 , HasNodeError err
57 )
58 => User
59 -> GroupParams
60 -> UserCorpusId
61 -> MasterCorpusId
62 -> m (Map NgramsType [NgramsElement])
63 buildNgramsLists user gp uCid mCid = do
64 ngTerms <- buildNgramsTermsList user uCid mCid gp
65 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
66 [ (Authors , MapListSize 9)
67 , (Sources , MapListSize 9)
68 , (Institutes, MapListSize 9)
69 ]
70
71 pure $ Map.unions $ [ngTerms] <> othersTerms
72
73
74 data MapListSize = MapListSize { unMapListSize :: !Int }
75
76 buildNgramsOthersList ::( HasNodeError err
77 , CmdM env err m
78 , RepoCmdM env err m
79 , HasTreeError err
80 )
81 => User
82 -> UserCorpusId
83 -> GroupParams
84 -> (NgramsType, MapListSize)
85 -> m (Map NgramsType [NgramsElement])
86 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
87 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
88
89 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
90 socialLists' :: FlowCont Text FlowListScores
91 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
92 $ Map.fromList
93 $ List.zip (Map.keys ngs')
94 (List.cycle [mempty])
95 )
96 {-
97 printDebug "flowSocialList'"
98 $ Map.filter (not . ((==) Map.empty) . (view fls_parents))
99 $ view flc_scores socialLists'
100 -}
101
102 let
103 groupedWithList = toGroupedTreeText groupIt socialLists' ngs'
104
105 printDebug "groupedWithList"
106 $ view flc_cont groupedWithList
107
108 let
109 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList
110 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
111
112 listSize = mapListSize - (List.length mapTerms)
113 (mapTerms', candiTerms) = both Map.fromList
114 $ List.splitAt listSize
115 $ List.sortOn (Down . viewScore . snd)
116 $ Map.toList tailTerms'
117
118 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
119 <> (toNgramsElement mapTerms )
120 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
121 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
122 )]
123
124
125 -- TODO use ListIds
126 buildNgramsTermsList :: ( HasNodeError err
127 , CmdM env err m
128 , RepoCmdM env err m
129 , HasTreeError err
130 )
131 => User
132 -> UserCorpusId
133 -> MasterCorpusId
134 -> GroupParams
135 -> m (Map NgramsType [NgramsElement])
136 buildNgramsTermsList user uCid mCid groupParams = do
137
138 -- Computing global speGen score
139 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
140
141 -- printDebug "head candidates" (List.take 10 $ allTerms)
142 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
143
144 -- First remove stops terms
145 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
146 -- printDebug "\n * socialLists * \n" socialLists
147
148 -- Grouping the ngrams and keeping the maximum score for label
149 let grouped = groupedTextWithStem ( GroupedTextParams (groupWith groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
150
151 groupedWithList = map (addListType (invertForw socialLists)) grouped
152
153 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType) groupedWithList
154 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
155 -- (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
156
157 -- printDebug "\n * stopTerms * \n" stopTerms
158 -- splitting monterms and multiterms to take proportional candidates
159 let
160 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
161 monoSize = 0.4 :: Double
162 multSize = 1 - monoSize
163
164 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
165
166 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
167 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
168
169 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
170 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
171 -- printDebug "groupedMultHead" (List.length groupedMultHead)
172 -- printDebug "groupedMultTail" (List.length groupedMultTail)
173
174 let
175 -- Get Local Scores now for selected grouped ngrams
176 -- TODO HasTerms
177 selectedTerms = Set.toList $ List.foldl'
178 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
179 $ Set.insert l' g
180 )
181 Set.empty
182 (groupedMonoHead <> groupedMultHead)
183 -- selectedTerms = hasTerms (groupedMonoHead <> groupedMultHead)
184
185 -- TO remove (and remove HasNodeError instance)
186 userListId <- defaultList uCid
187 masterListId <- defaultList mCid
188
189 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
190 [userListId, masterListId]
191 NgramsTerms
192 selectedTerms
193
194 let
195 mapGroups = Map.fromList
196 $ map (\g -> (g ^. gt_stem, g))
197 $ groupedMonoHead <> groupedMultHead
198
199 -- grouping with Set NodeId
200 contextsAdded = foldl' (\mapGroups' k ->
201 let k' = groupWith groupParams k in
202 case Map.lookup k' mapGroups' of
203 Nothing -> mapGroups'
204 Just g -> case Map.lookup k mapTextDocIds of
205 Nothing -> mapGroups'
206 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
207 )
208 mapGroups
209 $ Map.keys mapTextDocIds
210
211 -- compute cooccurrences
212 mapCooc = Map.filter (>2)
213 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
214 | (t1, s1) <- mapStemNodeIds
215 , (t2, s2) <- mapStemNodeIds
216 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
217 ]
218 where
219 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
220 -- printDebug "mapCooc" mapCooc
221
222 let
223 -- computing scores
224 mapScores f = Map.fromList
225 $ map (\(Scored t g s') -> (t, f (g,s')))
226 $ normalizeGlobal
227 $ map normalizeLocal
228 $ scored' mapCooc
229
230 groupsWithScores = catMaybes
231 $ map (\(stem, g)
232 -> case Map.lookup stem mapScores' of
233 Nothing -> Nothing
234 Just s' -> Just $ g { _gt_score = s'}
235 ) $ Map.toList contextsAdded
236 where
237 mapScores' = mapScores identity
238 -- adapt2 TOCHECK with DC
239 -- printDebug "groupsWithScores" groupsWithScores
240 let
241 -- sort / partition / split
242 -- filter mono/multi again
243 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
244 -- filter with max score
245 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = viewScore g in s1 > s2 )
246
247 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
248 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
249
250 -- splitAt
251 let
252 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
253 inclSize = 0.4 :: Double
254 exclSize = 1 - inclSize
255 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
256
257 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . viewScore) monoScoredIncl
258 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . viewScore) monoScoredExcl
259
260 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . viewScore) multScoredIncl
261 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . viewScore) multScoredExcl
262
263
264 -- Final Step building the Typed list
265 termListHead = maps <> cands
266 where
267 maps = setListType (Just MapTerm)
268 <$> monoScoredInclHead
269 <> monoScoredExclHead
270 <> multScoredInclHead
271 <> multScoredExclHead
272
273 cands = setListType (Just CandidateTerm)
274 <$> monoScoredInclTail
275 <> monoScoredExclTail
276 <> multScoredInclTail
277 <> multScoredExclTail
278
279 termListTail = map (setListType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
280
281 -- printDebug "monoScoredInclHead" monoScoredInclHead
282 -- printDebug "monoScoredExclHead" monoScoredExclTail
283 -- printDebug "multScoredInclHead" multScoredInclHead
284 -- printDebug "multScoredExclTail" multScoredExclTail
285
286 let result = Map.unionsWith (<>)
287 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
288 <> (List.concat $ map toNgramsElement $ termListTail)
289 <> (List.concat $ map toNgramsElement $ stopTerms)
290 )]
291 ]
292 -- printDebug "\n result \n" r
293 pure result
294
295
296
297 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
298 toGargList l n = (l,n)
299
300
301 isStopTerm :: StopSize -> Text -> Bool
302 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
303 where
304 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
305
306 ------------------------------------------------------------------------------