2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
19 import Control.Lens ((^.), set, over)
21 import Data.Maybe (catMaybes)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
25 import Data.Text (Text)
26 import Data.Tuple.Extra (both)
27 import Gargantext.API.Ngrams.Types (NgramsElement)
28 import Gargantext.API.Ngrams.Types (RepoCmdM)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
43 import Gargantext.Database.Query.Tree.Error (HasTreeError)
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import qualified Data.Char as Char
47 import qualified Data.List as List
48 import qualified Data.Map as Map
49 import qualified Data.Set as Set
50 import qualified Data.Text as Text
53 -- | TODO improve grouping functions of Authors, Sources, Institutes..
54 buildNgramsLists :: ( RepoCmdM env err m
63 -> m (Map NgramsType [NgramsElement])
64 buildNgramsLists user gp uCid mCid = do
65 ngTerms <- buildNgramsTermsList user uCid mCid gp
66 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
67 [ (Authors , MapListSize 9)
68 , (Sources , MapListSize 9)
69 , (Institutes, MapListSize 9)
72 pure $ Map.unions $ [ngTerms] <> othersTerms
75 data MapListSize = MapListSize { unMapListSize :: !Int }
77 buildNgramsOthersList ::( HasNodeError err
85 -> (NgramsType, MapListSize)
86 -> m (Map NgramsType [NgramsElement])
87 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
88 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
90 socialLists' :: FlowCont Text FlowListScores
91 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
93 $ List.zip (Map.keys ngs')
96 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
99 printDebug "flowSocialList'"
100 $ Map.filter (not . ((==) Map.empty) . view fls_parents)
101 $ view flc_scores socialLists'
105 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
106 groupedWithList = toGroupedTreeText groupParams socialLists' ngs'
109 printDebug "groupedWithList"
110 $ Map.map (\v -> (view gt_label v, view gt_children v))
111 $ Map.filter (\v -> (Set.size $ view gt_children v) > 0)
116 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) groupedWithList
117 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
119 listSize = mapListSize - (List.length mapTerms)
120 (mapTerms', candiTerms) = both Map.fromList
121 $ List.splitAt listSize
122 $ List.sortOn (Down . viewScore . snd)
123 $ Map.toList tailTerms'
125 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
126 <> (toNgramsElement mapTerms )
127 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
128 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
133 buildNgramsTermsList :: ( HasNodeError err
142 -> m (Map NgramsType [NgramsElement])
143 buildNgramsTermsList user uCid mCid groupParams = do
145 -- Computing global speGen score
146 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
148 -- printDebug "head candidates" (List.take 10 $ allTerms)
149 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
151 -- First remove stops terms
152 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
153 -- printDebug "\n * socialLists * \n" socialLists
155 -- Grouping the ngrams and keeping the maximum score for label
156 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
158 groupedWithList = map (addListType (invertForw socialLists)) grouped
160 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
161 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
163 -- printDebug "\n * stopTerms * \n" stopTerms
164 -- splitting monterms and multiterms to take proportional candidates
166 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
167 monoSize = 0.4 :: Double
168 multSize = 1 - monoSize
170 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
172 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
173 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
175 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
176 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
177 -- printDebug "groupedMultHead" (List.length groupedMultHead)
178 -- printDebug "groupedMultTail" (List.length groupedMultTail)
181 -- Get Local Scores now for selected grouped ngrams
182 selectedTerms = Set.toList $ List.foldl'
183 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
187 (groupedMonoHead <> groupedMultHead)
189 -- TO remove (and remove HasNodeError instance)
190 userListId <- defaultList uCid
191 masterListId <- defaultList mCid
193 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
196 mapGroups = Map.fromList
197 $ map (\g -> (g ^. gt_stem, g))
198 $ groupedMonoHead <> groupedMultHead
200 -- grouping with Set NodeId
201 contextsAdded = foldl' (\mapGroups' k ->
202 let k' = ngramsGroup groupParams k in
203 case Map.lookup k' mapGroups' of
204 Nothing -> mapGroups'
205 Just g -> case Map.lookup k mapTextDocIds of
206 Nothing -> mapGroups'
207 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
210 $ Map.keys mapTextDocIds
212 -- compute cooccurrences
213 mapCooc = Map.filter (>2)
214 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
215 | (t1, s1) <- mapStemNodeIds
216 , (t2, s2) <- mapStemNodeIds
217 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
220 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
221 -- printDebug "mapCooc" mapCooc
225 mapScores f = Map.fromList
226 $ map (\(Scored t g s') -> (t, f (g,s')))
231 groupsWithScores = catMaybes
233 -> case Map.lookup stem mapScores' of
235 Just s' -> Just $ g { _gt_score = s'}
236 ) $ Map.toList contextsAdded
238 mapScores' = mapScores identity
239 -- adapt2 TOCHECK with DC
240 -- printDebug "groupsWithScores" groupsWithScores
242 -- sort / partition / split
243 -- filter mono/multi again
244 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
245 -- filter with max score
246 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
248 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
249 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
253 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
254 inclSize = 0.4 :: Double
255 exclSize = 1 - inclSize
256 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
258 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
259 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
261 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
262 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
265 -- Final Step building the Typed list
266 termListHead = maps <> cands
268 maps = set gt_listType (Just MapTerm)
269 <$> monoScoredInclHead
270 <> monoScoredExclHead
271 <> multScoredInclHead
272 <> multScoredExclHead
274 cands = set gt_listType (Just CandidateTerm)
275 <$> monoScoredInclTail
276 <> monoScoredExclTail
277 <> multScoredInclTail
278 <> multScoredExclTail
280 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
282 -- printDebug "monoScoredInclHead" monoScoredInclHead
283 -- printDebug "monoScoredExclHead" monoScoredExclTail
284 -- printDebug "multScoredInclHead" multScoredInclHead
285 -- printDebug "multScoredExclTail" multScoredExclTail
287 let result = Map.unionsWith (<>)
288 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
289 <> (List.concat $ map toNgramsElement $ termListTail)
290 <> (List.concat $ map toNgramsElement $ stopTerms)
293 -- printDebug "\n result \n" r
298 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
299 toGargList l n = (l,n)
302 isStopTerm :: StopSize -> Text -> Bool
303 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
305 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
307 ------------------------------------------------------------------------------