2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
19 import Control.Lens ((^.), set, over)
21 import Data.Maybe (catMaybes)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
25 import Data.Text (Text)
26 import Data.Tuple.Extra (both)
27 import Gargantext.API.Ngrams.Types (NgramsElement)
28 import Gargantext.API.Ngrams.Types (RepoCmdM)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
43 import Gargantext.Database.Query.Tree.Error (HasTreeError)
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import qualified Data.Char as Char
47 import qualified Data.List as List
48 import qualified Data.Map as Map
49 import qualified Data.Set as Set
50 import qualified Data.Text as Text
53 -- | TODO improve grouping functions of Authors, Sources, Institutes..
54 buildNgramsLists :: ( RepoCmdM env err m
63 -> m (Map NgramsType [NgramsElement])
64 buildNgramsLists user gp uCid mCid = do
65 ngTerms <- buildNgramsTermsList user uCid mCid gp
66 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
67 [ (Authors , MapListSize 9)
68 , (Sources , MapListSize 9)
69 , (Institutes, MapListSize 9)
72 pure $ Map.unions $ [ngTerms] <> othersTerms
75 data MapListSize = MapListSize { unMapListSize :: !Int }
77 buildNgramsOthersList ::( HasNodeError err
85 -> (NgramsType, MapListSize)
86 -> m (Map NgramsType [NgramsElement])
87 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
88 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
90 socialLists' :: FlowCont Text FlowListScores
91 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
93 $ List.zip (Map.keys ngs')
96 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
99 printDebug "flowSocialList'"
100 $ Map.filter (not . ((==) Map.empty) . view fls_parents)
101 $ view flc_scores socialLists'
105 groupedWithList = toGroupedTreeText groupIt socialLists' ngs'
108 printDebug "groupedWithList"
109 $ Map.map (\v -> (view gt_label v, view gt_children v))
110 $ Map.filter (\v -> (Set.size $ view gt_children v) > 0)
115 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) groupedWithList
116 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
118 listSize = mapListSize - (List.length mapTerms)
119 (mapTerms', candiTerms) = both Map.fromList
120 $ List.splitAt listSize
121 $ List.sortOn (Down . viewScore . snd)
122 $ Map.toList tailTerms'
124 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
125 <> (toNgramsElement mapTerms )
126 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
127 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
132 buildNgramsTermsList :: ( HasNodeError err
141 -> m (Map NgramsType [NgramsElement])
142 buildNgramsTermsList user uCid mCid groupParams = do
144 -- Computing global speGen score
145 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
147 -- printDebug "head candidates" (List.take 10 $ allTerms)
148 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
150 -- First remove stops terms
151 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
152 -- printDebug "\n * socialLists * \n" socialLists
154 -- Grouping the ngrams and keeping the maximum score for label
155 let grouped = groupedTextWithStem ( GroupedTextParams (groupWith groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
157 groupedWithList = map (addListType (invertForw socialLists)) grouped
159 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
160 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
162 -- printDebug "\n * stopTerms * \n" stopTerms
163 -- splitting monterms and multiterms to take proportional candidates
165 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
166 monoSize = 0.4 :: Double
167 multSize = 1 - monoSize
169 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
171 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
172 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
174 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
175 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
176 -- printDebug "groupedMultHead" (List.length groupedMultHead)
177 -- printDebug "groupedMultTail" (List.length groupedMultTail)
180 -- Get Local Scores now for selected grouped ngrams
181 selectedTerms = Set.toList $ List.foldl'
182 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
186 (groupedMonoHead <> groupedMultHead)
188 -- TO remove (and remove HasNodeError instance)
189 userListId <- defaultList uCid
190 masterListId <- defaultList mCid
192 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
193 [userListId, masterListId]
198 mapGroups = Map.fromList
199 $ map (\g -> (g ^. gt_stem, g))
200 $ groupedMonoHead <> groupedMultHead
202 -- grouping with Set NodeId
203 contextsAdded = foldl' (\mapGroups' k ->
204 let k' = groupWith groupParams k in
205 case Map.lookup k' mapGroups' of
206 Nothing -> mapGroups'
207 Just g -> case Map.lookup k mapTextDocIds of
208 Nothing -> mapGroups'
209 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
212 $ Map.keys mapTextDocIds
214 -- compute cooccurrences
215 mapCooc = Map.filter (>2)
216 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
217 | (t1, s1) <- mapStemNodeIds
218 , (t2, s2) <- mapStemNodeIds
219 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
222 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
223 -- printDebug "mapCooc" mapCooc
227 mapScores f = Map.fromList
228 $ map (\(Scored t g s') -> (t, f (g,s')))
233 groupsWithScores = catMaybes
235 -> case Map.lookup stem mapScores' of
237 Just s' -> Just $ g { _gt_score = s'}
238 ) $ Map.toList contextsAdded
240 mapScores' = mapScores identity
241 -- adapt2 TOCHECK with DC
242 -- printDebug "groupsWithScores" groupsWithScores
244 -- sort / partition / split
245 -- filter mono/multi again
246 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
247 -- filter with max score
248 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
250 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
251 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
255 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
256 inclSize = 0.4 :: Double
257 exclSize = 1 - inclSize
258 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
260 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
261 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
263 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
264 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
267 -- Final Step building the Typed list
268 termListHead = maps <> cands
270 maps = set gt_listType (Just MapTerm)
271 <$> monoScoredInclHead
272 <> monoScoredExclHead
273 <> multScoredInclHead
274 <> multScoredExclHead
276 cands = set gt_listType (Just CandidateTerm)
277 <$> monoScoredInclTail
278 <> monoScoredExclTail
279 <> multScoredInclTail
280 <> multScoredExclTail
282 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
284 -- printDebug "monoScoredInclHead" monoScoredInclHead
285 -- printDebug "monoScoredExclHead" monoScoredExclTail
286 -- printDebug "multScoredInclHead" multScoredInclHead
287 -- printDebug "multScoredExclTail" multScoredExclTail
289 let result = Map.unionsWith (<>)
290 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
291 <> (List.concat $ map toNgramsElement $ termListTail)
292 <> (List.concat $ map toNgramsElement $ stopTerms)
295 -- printDebug "\n result \n" r
300 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
301 toGargList l n = (l,n)
304 isStopTerm :: StopSize -> Text -> Bool
305 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
307 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
309 ------------------------------------------------------------------------------