2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
19 import Control.Lens ((^.), set, over)
21 import Data.Maybe (catMaybes)
22 import Data.Ord (Down(..))
24 import Data.Text (Text)
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement)
27 import Gargantext.API.Ngrams.Types (RepoCmdM)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45 import qualified Data.Char as Char
46 import qualified Data.List as List
47 import qualified Data.Map as Map
48 import qualified Data.Set as Set
49 import qualified Data.Text as Text
52 -- | TODO improve grouping functions of Authors, Sources, Institutes..
53 buildNgramsLists :: ( RepoCmdM env err m
62 -> m (Map NgramsType [NgramsElement])
63 buildNgramsLists user gp uCid mCid = do
64 ngTerms <- buildNgramsTermsList user uCid mCid gp
65 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
66 [ (Authors , MapListSize 9)
67 , (Sources , MapListSize 9)
68 , (Institutes, MapListSize 9)
71 pure $ Map.unions $ [ngTerms] <> othersTerms
74 data MapListSize = MapListSize { unMapListSize :: !Int }
76 buildNgramsOthersList ::( HasNodeError err
84 -> (NgramsType, MapListSize)
85 -> m (Map NgramsType [NgramsElement])
86 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
87 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
89 socialLists' :: FlowCont Text FlowListScores
90 <- flowSocialList' MySelfFirst user nt (FlowCont Map.empty $ Set.fromList $ Map.keys ngs')
91 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
94 printDebug "flowSocialList'"
95 $ Map.filter (not . ((==) Map.empty) . view fls_parents)
96 $ view flc_scores socialLists'
100 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
101 groupedWithList = toGroupedTreeText groupParams socialLists' ngs'
104 printDebug "groupedWithList"
105 $ Map.map (\v -> (view gt_label v, view gt_children v))
106 $ Map.filter (\v -> (Set.size $ view gt_children v) > 0)
111 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) groupedWithList
112 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
114 listSize = mapListSize - (List.length mapTerms)
115 (mapTerms', candiTerms) = both Map.fromList
116 $ List.splitAt listSize
117 $ List.sortOn (Down . viewScore . snd)
118 $ Map.toList tailTerms'
120 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
121 <> (toNgramsElement mapTerms )
122 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
123 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
128 buildNgramsTermsList :: ( HasNodeError err
137 -> m (Map NgramsType [NgramsElement])
138 buildNgramsTermsList user uCid mCid groupParams = do
140 -- Computing global speGen score
141 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
143 -- printDebug "head candidates" (List.take 10 $ allTerms)
144 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
146 -- First remove stops terms
147 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
148 -- printDebug "\n * socialLists * \n" socialLists
150 -- Grouping the ngrams and keeping the maximum score for label
151 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
153 groupedWithList = map (addListType (invertForw socialLists)) grouped
155 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
156 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
158 -- printDebug "\n * stopTerms * \n" stopTerms
159 -- splitting monterms and multiterms to take proportional candidates
161 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
162 monoSize = 0.4 :: Double
163 multSize = 1 - monoSize
165 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
167 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
168 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
170 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
171 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
172 -- printDebug "groupedMultHead" (List.length groupedMultHead)
173 -- printDebug "groupedMultTail" (List.length groupedMultTail)
176 -- Get Local Scores now for selected grouped ngrams
177 selectedTerms = Set.toList $ List.foldl'
178 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
182 (groupedMonoHead <> groupedMultHead)
184 -- TO remove (and remove HasNodeError instance)
185 userListId <- defaultList uCid
186 masterListId <- defaultList mCid
188 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
191 mapGroups = Map.fromList
192 $ map (\g -> (g ^. gt_stem, g))
193 $ groupedMonoHead <> groupedMultHead
195 -- grouping with Set NodeId
196 contextsAdded = foldl' (\mapGroups' k ->
197 let k' = ngramsGroup groupParams k in
198 case Map.lookup k' mapGroups' of
199 Nothing -> mapGroups'
200 Just g -> case Map.lookup k mapTextDocIds of
201 Nothing -> mapGroups'
202 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
205 $ Map.keys mapTextDocIds
207 -- compute cooccurrences
208 mapCooc = Map.filter (>2)
209 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
210 | (t1, s1) <- mapStemNodeIds
211 , (t2, s2) <- mapStemNodeIds
212 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
215 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
216 -- printDebug "mapCooc" mapCooc
220 mapScores f = Map.fromList
221 $ map (\(Scored t g s') -> (t, f (g,s')))
226 groupsWithScores = catMaybes
228 -> case Map.lookup stem mapScores' of
230 Just s' -> Just $ g { _gt_score = s'}
231 ) $ Map.toList contextsAdded
233 mapScores' = mapScores identity
234 -- adapt2 TOCHECK with DC
235 -- printDebug "groupsWithScores" groupsWithScores
237 -- sort / partition / split
238 -- filter mono/multi again
239 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
240 -- filter with max score
241 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
243 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
244 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
248 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
249 inclSize = 0.4 :: Double
250 exclSize = 1 - inclSize
251 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
253 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
254 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
256 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
257 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
260 -- Final Step building the Typed list
261 termListHead = maps <> cands
263 maps = set gt_listType (Just MapTerm)
264 <$> monoScoredInclHead
265 <> monoScoredExclHead
266 <> multScoredInclHead
267 <> multScoredExclHead
269 cands = set gt_listType (Just CandidateTerm)
270 <$> monoScoredInclTail
271 <> monoScoredExclTail
272 <> multScoredInclTail
273 <> multScoredExclTail
275 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
277 -- printDebug "monoScoredInclHead" monoScoredInclHead
278 -- printDebug "monoScoredExclHead" monoScoredExclTail
279 -- printDebug "multScoredInclHead" multScoredInclHead
280 -- printDebug "multScoredExclTail" multScoredExclTail
282 let result = Map.unionsWith (<>)
283 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
284 <> (List.concat $ map toNgramsElement $ termListTail)
285 <> (List.concat $ map toNgramsElement $ stopTerms)
288 -- printDebug "\n result \n" r
293 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
294 toGargList l n = (l,n)
297 isStopTerm :: StopSize -> Text -> Bool
298 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
300 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
302 ------------------------------------------------------------------------------