2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
19 import Control.Lens ((^.), view, over)
21 import Data.Maybe (catMaybes)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
25 import Data.Text (Text)
26 import Data.Tuple.Extra (both)
27 import Gargantext.API.Ngrams.Types (NgramsElement)
28 import Gargantext.API.Ngrams.Types (RepoCmdM)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
43 import Gargantext.Database.Query.Tree.Error (HasTreeError)
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import qualified Data.Char as Char
47 import qualified Data.List as List
48 import qualified Data.Map as Map
49 import qualified Data.Set as Set
50 import qualified Data.Text as Text
53 -- | TODO improve grouping functions of Authors, Sources, Institutes..
54 buildNgramsLists :: ( RepoCmdM env err m
63 -> m (Map NgramsType [NgramsElement])
64 buildNgramsLists user gp uCid mCid = do
65 ngTerms <- buildNgramsTermsList user uCid mCid gp
66 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
67 [ (Authors , MapListSize 9)
68 , (Sources , MapListSize 9)
69 , (Institutes, MapListSize 9)
72 pure $ Map.unions $ [ngTerms] <> othersTerms
75 data MapListSize = MapListSize { unMapListSize :: !Int }
77 buildNgramsOthersList ::( HasNodeError err
85 -> (NgramsType, MapListSize)
86 -> m (Map NgramsType [NgramsElement])
87 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
88 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
90 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
91 socialLists' :: FlowCont Text FlowListScores
92 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
94 $ List.zip (Map.keys ngs')
99 printDebug "flowSocialList'"
100 $ Map.filter (not . ((==) Map.empty) . (view fls_parents))
101 $ view flc_scores socialLists'
104 groupedWithList = toGroupedTreeText groupIt socialLists' ngs'
106 printDebug "groupedWithList"
107 $ view flc_cont groupedWithList
110 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList
111 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
113 listSize = mapListSize - (List.length mapTerms)
114 (mapTerms', candiTerms) = both Map.fromList
115 $ List.splitAt listSize
116 $ List.sortOn (Down . viewScore . snd)
117 $ Map.toList tailTerms'
119 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
120 <> (toNgramsElement mapTerms )
121 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
122 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
127 buildNgramsTermsList :: ( HasNodeError err
136 -> m (Map NgramsType [NgramsElement])
137 buildNgramsTermsList user uCid mCid groupParams = do
139 -- Computing global speGen score
140 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
142 -- printDebug "head candidates" (List.take 10 $ allTerms)
143 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
145 -- First remove stops terms
146 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
147 -- printDebug "\n * socialLists * \n" socialLists
149 -- Grouping the ngrams and keeping the maximum score for label
150 let grouped = groupedTextWithStem ( GroupedTextParams (groupWith groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
152 groupedWithList = map (addListType (invertForw socialLists)) grouped
154 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType) groupedWithList
155 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
156 -- (groupedMono, groupedMult) = Map.partitionWithKey (\t -> t ^. gt_size < 2) candidateTerms
158 -- printDebug "\n * stopTerms * \n" stopTerms
159 -- splitting monterms and multiterms to take proportional candidates
161 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
162 monoSize = 0.4 :: Double
163 multSize = 1 - monoSize
165 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
167 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
168 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
170 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
171 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
172 -- printDebug "groupedMultHead" (List.length groupedMultHead)
173 -- printDebug "groupedMultTail" (List.length groupedMultTail)
176 -- Get Local Scores now for selected grouped ngrams
178 selectedTerms = Set.toList $ List.foldl'
179 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
183 (groupedMonoHead <> groupedMultHead)
184 -- selectedTerms = hasTerms (groupedMonoHead <> groupedMultHead)
186 -- TO remove (and remove HasNodeError instance)
187 userListId <- defaultList uCid
188 masterListId <- defaultList mCid
190 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
191 [userListId, masterListId]
196 mapGroups = Map.fromList
197 $ map (\g -> (g ^. gt_stem, g))
198 $ groupedMonoHead <> groupedMultHead
200 -- grouping with Set NodeId
201 contextsAdded = foldl' (\mapGroups' k ->
202 let k' = groupWith groupParams k in
203 case Map.lookup k' mapGroups' of
204 Nothing -> mapGroups'
205 Just g -> case Map.lookup k mapTextDocIds of
206 Nothing -> mapGroups'
207 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
210 $ Map.keys mapTextDocIds
212 -- compute cooccurrences
213 mapCooc = Map.filter (>2)
214 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
215 | (t1, s1) <- mapStemNodeIds
216 , (t2, s2) <- mapStemNodeIds
217 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
220 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
221 -- printDebug "mapCooc" mapCooc
225 mapScores f = Map.fromList
226 $ map (\(Scored t g s') -> (t, f (g,s')))
231 groupsWithScores = catMaybes
233 -> case Map.lookup stem mapScores' of
235 Just s' -> Just $ g { _gt_score = s'}
236 ) $ Map.toList contextsAdded
238 mapScores' = mapScores identity
239 -- adapt2 TOCHECK with DC
240 -- printDebug "groupsWithScores" groupsWithScores
242 -- sort / partition / split
243 -- filter mono/multi again
244 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
245 -- filter with max score
246 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = viewScore g in s1 > s2 )
248 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
249 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
253 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
254 inclSize = 0.4 :: Double
255 exclSize = 1 - inclSize
256 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
258 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . viewScore) monoScoredIncl
259 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . viewScore) monoScoredExcl
261 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . viewScore) multScoredIncl
262 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . viewScore) multScoredExcl
265 -- Final Step building the Typed list
266 termListHead = maps <> cands
268 maps = setListType (Just MapTerm)
269 <$> monoScoredInclHead
270 <> monoScoredExclHead
271 <> multScoredInclHead
272 <> multScoredExclHead
274 cands = setListType (Just CandidateTerm)
275 <$> monoScoredInclTail
276 <> monoScoredExclTail
277 <> multScoredInclTail
278 <> multScoredExclTail
280 termListTail = map (setListType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
282 -- printDebug "monoScoredInclHead" monoScoredInclHead
283 -- printDebug "monoScoredExclHead" monoScoredExclTail
284 -- printDebug "multScoredInclHead" multScoredInclHead
285 -- printDebug "multScoredExclTail" multScoredExclTail
287 let result = Map.unionsWith (<>)
288 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
289 <> (List.concat $ map toNgramsElement $ termListTail)
290 <> (List.concat $ map toNgramsElement $ stopTerms)
293 -- printDebug "\n result \n" r
298 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
299 toGargList l n = (l,n)
302 isStopTerm :: StopSize -> Text -> Bool
303 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
305 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
307 ------------------------------------------------------------------------------