2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens ((^.), view, over)
20 import Data.Maybe (catMaybes)
21 import Data.Monoid (mempty)
22 import Data.Ord (Down(..))
24 import Data.Text (Text)
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement)
27 import Gargantext.API.Ngrams.Types (RepoCmdM)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45 import qualified Data.Char as Char
46 import qualified Data.List as List
47 import qualified Data.Map as Map
48 import qualified Data.Set as Set
49 import qualified Data.Text as Text
52 -- | TODO improve grouping functions of Authors, Sources, Institutes..
53 buildNgramsLists :: ( RepoCmdM env err m
62 -> m (Map NgramsType [NgramsElement])
63 buildNgramsLists user gp uCid mCid = do
64 ngTerms <- buildNgramsTermsList user uCid mCid gp
65 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
66 [ (Authors , MapListSize 9)
67 , (Sources , MapListSize 9)
68 , (Institutes, MapListSize 9)
71 pure $ Map.unions $ [ngTerms] <> othersTerms
74 data MapListSize = MapListSize { unMapListSize :: !Int }
76 buildNgramsOthersList ::( HasNodeError err
84 -> (NgramsType, MapListSize)
85 -> m (Map NgramsType [NgramsElement])
86 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
87 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
89 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
90 socialLists' :: FlowCont Text FlowListScores
91 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
93 $ List.zip (Map.keys ngs')
97 printDebug "flowSocialList'"
98 $ Map.filter (not . ((==) Map.empty) . (view fls_parents))
99 $ view flc_scores socialLists'
103 groupedWithList = toGroupedTreeText groupIt socialLists' ngs'
105 printDebug "groupedWithList"
106 $ view flc_cont groupedWithList
109 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList
110 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
112 listSize = mapListSize - (List.length mapTerms)
113 (mapTerms', candiTerms) = both Map.fromList
114 $ List.splitAt listSize
115 $ List.sortOn (Down . viewScore . snd)
116 $ Map.toList tailTerms'
118 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
119 <> (toNgramsElement mapTerms )
120 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
121 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
126 buildNgramsTermsList :: ( HasNodeError err
135 -> m (Map NgramsType [NgramsElement])
136 buildNgramsTermsList user uCid mCid groupParams = do
138 -- Computing global speGen score
139 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
141 -- printDebug "head candidates" (List.take 10 $ allTerms)
142 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
144 -- First remove stops terms
145 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
146 -- printDebug "\n * socialLists * \n" socialLists
148 -- Grouping the ngrams and keeping the maximum score for label
149 let grouped = groupedTextWithStem ( GroupedTextParams (groupWith groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
151 groupedWithList = map (addListType (invertForw socialLists)) grouped
153 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType) groupedWithList
154 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
155 -- (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
157 -- printDebug "\n * stopTerms * \n" stopTerms
158 -- splitting monterms and multiterms to take proportional candidates
160 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
161 monoSize = 0.4 :: Double
162 multSize = 1 - monoSize
164 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
166 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
167 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
169 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
170 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
171 -- printDebug "groupedMultHead" (List.length groupedMultHead)
172 -- printDebug "groupedMultTail" (List.length groupedMultTail)
175 -- Get Local Scores now for selected grouped ngrams
177 selectedTerms = Set.toList $ List.foldl'
178 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
182 (groupedMonoHead <> groupedMultHead)
183 -- selectedTerms = hasTerms (groupedMonoHead <> groupedMultHead)
185 -- TO remove (and remove HasNodeError instance)
186 userListId <- defaultList uCid
187 masterListId <- defaultList mCid
189 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
190 [userListId, masterListId]
195 mapGroups = Map.fromList
196 $ map (\g -> (g ^. gt_stem, g))
197 $ groupedMonoHead <> groupedMultHead
199 -- grouping with Set NodeId
200 contextsAdded = foldl' (\mapGroups' k ->
201 let k' = groupWith groupParams k in
202 case Map.lookup k' mapGroups' of
203 Nothing -> mapGroups'
204 Just g -> case Map.lookup k mapTextDocIds of
205 Nothing -> mapGroups'
206 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
209 $ Map.keys mapTextDocIds
211 -- compute cooccurrences
212 mapCooc = Map.filter (>2)
213 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
214 | (t1, s1) <- mapStemNodeIds
215 , (t2, s2) <- mapStemNodeIds
216 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
219 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
220 -- printDebug "mapCooc" mapCooc
224 mapScores f = Map.fromList
225 $ map (\(Scored t g s') -> (t, f (g,s')))
230 groupsWithScores = catMaybes
232 -> case Map.lookup stem mapScores' of
234 Just s' -> Just $ g { _gt_score = s'}
235 ) $ Map.toList contextsAdded
237 mapScores' = mapScores identity
238 -- adapt2 TOCHECK with DC
239 -- printDebug "groupsWithScores" groupsWithScores
241 -- sort / partition / split
242 -- filter mono/multi again
243 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
244 -- filter with max score
245 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = viewScore g in s1 > s2 )
247 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
248 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
252 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
253 inclSize = 0.4 :: Double
254 exclSize = 1 - inclSize
255 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
257 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . viewScore) monoScoredIncl
258 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . viewScore) monoScoredExcl
260 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . viewScore) multScoredIncl
261 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . viewScore) multScoredExcl
264 -- Final Step building the Typed list
265 termListHead = maps <> cands
267 maps = setListType (Just MapTerm)
268 <$> monoScoredInclHead
269 <> monoScoredExclHead
270 <> multScoredInclHead
271 <> multScoredExclHead
273 cands = setListType (Just CandidateTerm)
274 <$> monoScoredInclTail
275 <> monoScoredExclTail
276 <> multScoredInclTail
277 <> multScoredExclTail
279 termListTail = map (setListType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
281 -- printDebug "monoScoredInclHead" monoScoredInclHead
282 -- printDebug "monoScoredExclHead" monoScoredExclTail
283 -- printDebug "multScoredInclHead" multScoredInclHead
284 -- printDebug "multScoredExclTail" multScoredExclTail
286 let result = Map.unionsWith (<>)
287 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
288 <> (List.concat $ map toNgramsElement $ termListTail)
289 <> (List.concat $ map toNgramsElement $ stopTerms)
292 -- printDebug "\n result \n" r
297 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
298 toGargList l n = (l,n)
301 isStopTerm :: StopSize -> Text -> Bool
302 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
304 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
306 ------------------------------------------------------------------------------