2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
19 import Control.Lens ((^.), view, set, over)
21 import Data.Maybe (catMaybes)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
25 import Data.Text (Text)
26 import Data.Tuple.Extra (both)
27 import Gargantext.API.Ngrams.Types (NgramsElement)
28 import Gargantext.API.Ngrams.Types (RepoCmdM)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
43 import Gargantext.Database.Query.Tree.Error (HasTreeError)
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import qualified Data.Char as Char
47 import qualified Data.List as List
48 import qualified Data.Map as Map
49 import qualified Data.Set as Set
50 import qualified Data.Text as Text
53 -- | TODO improve grouping functions of Authors, Sources, Institutes..
54 buildNgramsLists :: ( RepoCmdM env err m
63 -> m (Map NgramsType [NgramsElement])
64 buildNgramsLists user gp uCid mCid = do
65 ngTerms <- buildNgramsTermsList user uCid mCid gp
66 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
67 [ (Authors , MapListSize 9)
68 , (Sources , MapListSize 9)
69 , (Institutes, MapListSize 9)
72 pure $ Map.unions $ [ngTerms] <> othersTerms
75 data MapListSize = MapListSize { unMapListSize :: !Int }
77 buildNgramsOthersList ::( HasNodeError err
85 -> (NgramsType, MapListSize)
86 -> m (Map NgramsType [NgramsElement])
87 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
88 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
90 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
91 socialLists' :: FlowCont Text FlowListScores
92 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
94 $ List.zip (Map.keys ngs')
99 printDebug "flowSocialList'"
100 $ Map.filter (not . ((==) Map.empty) . (view fls_parents))
101 $ view flc_scores socialLists'
104 groupedWithList = toGroupedTreeText groupIt socialLists' ngs'
107 printDebug "groupedWithList"
108 $ view flc_scores groupedWithList
112 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList
113 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
115 listSize = mapListSize - (List.length mapTerms)
116 (mapTerms', candiTerms) = both Map.fromList
117 $ List.splitAt listSize
118 $ List.sortOn (Down . viewScore . snd)
119 $ Map.toList tailTerms'
121 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
122 <> (toNgramsElement mapTerms )
123 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
124 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
129 buildNgramsTermsList :: ( HasNodeError err
138 -> m (Map NgramsType [NgramsElement])
139 buildNgramsTermsList user uCid mCid groupParams = do
141 -- Computing global speGen score
142 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
144 -- printDebug "head candidates" (List.take 10 $ allTerms)
145 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
147 -- First remove stops terms
148 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
149 -- printDebug "\n * socialLists * \n" socialLists
151 -- Grouping the ngrams and keeping the maximum score for label
152 let grouped = groupedTextWithStem ( GroupedTextParams (groupWith groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
154 groupedWithList = map (addListType (invertForw socialLists)) grouped
156 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
157 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
159 -- printDebug "\n * stopTerms * \n" stopTerms
160 -- splitting monterms and multiterms to take proportional candidates
162 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
163 monoSize = 0.4 :: Double
164 multSize = 1 - monoSize
166 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
168 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
169 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
171 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
172 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
173 -- printDebug "groupedMultHead" (List.length groupedMultHead)
174 -- printDebug "groupedMultTail" (List.length groupedMultTail)
177 -- Get Local Scores now for selected grouped ngrams
179 selectedTerms = Set.toList $ List.foldl'
180 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
184 (groupedMonoHead <> groupedMultHead)
186 -- TO remove (and remove HasNodeError instance)
187 userListId <- defaultList uCid
188 masterListId <- defaultList mCid
190 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
191 [userListId, masterListId]
196 mapGroups = Map.fromList
197 $ map (\g -> (g ^. gt_stem, g))
198 $ groupedMonoHead <> groupedMultHead
200 -- grouping with Set NodeId
201 contextsAdded = foldl' (\mapGroups' k ->
202 let k' = groupWith groupParams k in
203 case Map.lookup k' mapGroups' of
204 Nothing -> mapGroups'
205 Just g -> case Map.lookup k mapTextDocIds of
206 Nothing -> mapGroups'
207 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
210 $ Map.keys mapTextDocIds
212 -- compute cooccurrences
213 mapCooc = Map.filter (>2)
214 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
215 | (t1, s1) <- mapStemNodeIds
216 , (t2, s2) <- mapStemNodeIds
217 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
220 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
221 -- printDebug "mapCooc" mapCooc
225 mapScores f = Map.fromList
226 $ map (\(Scored t g s') -> (t, f (g,s')))
231 groupsWithScores = catMaybes
233 -> case Map.lookup stem mapScores' of
235 Just s' -> Just $ g { _gt_score = s'}
236 ) $ Map.toList contextsAdded
238 mapScores' = mapScores identity
239 -- adapt2 TOCHECK with DC
240 -- printDebug "groupsWithScores" groupsWithScores
242 -- sort / partition / split
243 -- filter mono/multi again
244 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
245 -- filter with max score
246 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
248 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
249 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
253 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
254 inclSize = 0.4 :: Double
255 exclSize = 1 - inclSize
256 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
258 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
259 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
261 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
262 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
265 -- Final Step building the Typed list
266 termListHead = maps <> cands
268 maps = set gt_listType (Just MapTerm)
269 <$> monoScoredInclHead
270 <> monoScoredExclHead
271 <> multScoredInclHead
272 <> multScoredExclHead
274 cands = set gt_listType (Just CandidateTerm)
275 <$> monoScoredInclTail
276 <> monoScoredExclTail
277 <> multScoredInclTail
278 <> multScoredExclTail
280 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
282 -- printDebug "monoScoredInclHead" monoScoredInclHead
283 -- printDebug "monoScoredExclHead" monoScoredExclTail
284 -- printDebug "multScoredInclHead" multScoredInclHead
285 -- printDebug "multScoredExclTail" multScoredExclTail
287 let result = Map.unionsWith (<>)
288 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
289 <> (List.concat $ map toNgramsElement $ termListTail)
290 <> (List.concat $ map toNgramsElement $ stopTerms)
293 -- printDebug "\n result \n" r
298 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
299 toGargList l n = (l,n)
302 isStopTerm :: StopSize -> Text -> Bool
303 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
305 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
307 ------------------------------------------------------------------------------