]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[CLEAN] smal refact, renaming, doc
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.Map (Map)
20 import Data.Monoid (mempty)
21 import Data.Ord (Down(..))
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (both)
25 import Gargantext.API.Ngrams.Types (NgramsElement)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45 import qualified Data.List as List
46 import qualified Data.Map as Map
47 import qualified Data.Set as Set
48
49
50 {-
51 -- TODO maybe useful for later
52 isStopTerm :: StopSize -> Text -> Bool
53 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
54 where
55 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
56 -}
57
58
59
60 -- | TODO improve grouping functions of Authors, Sources, Institutes..
61 buildNgramsLists :: ( RepoCmdM env err m
62 , CmdM env err m
63 , HasTreeError err
64 , HasNodeError err
65 )
66 => User
67 -> GroupParams
68 -> UserCorpusId
69 -> MasterCorpusId
70 -> m (Map NgramsType [NgramsElement])
71 buildNgramsLists user gp uCid mCid = do
72 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
73 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
74 [ (Authors , MapListSize 9)
75 , (Sources , MapListSize 9)
76 , (Institutes, MapListSize 9)
77 ]
78
79 pure $ Map.unions $ [ngTerms] <> othersTerms
80
81
82 data MapListSize = MapListSize { unMapListSize :: !Int }
83
84 buildNgramsOthersList ::( HasNodeError err
85 , CmdM env err m
86 , RepoCmdM env err m
87 , HasTreeError err
88 )
89 => User
90 -> UserCorpusId
91 -> GroupParams
92 -> (NgramsType, MapListSize)
93 -> m (Map NgramsType [NgramsElement])
94 buildNgramsOthersList user uCid groupParams (nt, MapListSize mapListSize) = do
95 allTerms :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
96
97 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
98 socialLists' :: FlowCont Text FlowListScores
99 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
100 $ Map.fromList
101 $ List.zip (Map.keys allTerms)
102 (List.cycle [mempty])
103 )
104 let
105 groupedWithList = toGroupedTree groupParams socialLists' allTerms
106
107 let
108 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList
109 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
110
111 listSize = mapListSize - (List.length mapTerms)
112 (mapTerms', candiTerms) = both Map.fromList
113 $ List.splitAt listSize
114 $ List.sortOn (Down . viewScore . snd)
115 $ Map.toList tailTerms'
116
117 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
118 <> (toNgramsElement mapTerms )
119 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
120 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
121 )]
122
123
124 -- TODO use ListIds
125 buildNgramsTermsList :: ( HasNodeError err
126 , CmdM env err m
127 , RepoCmdM env err m
128 , HasTreeError err
129 )
130 => User
131 -> UserCorpusId
132 -> MasterCorpusId
133 -> GroupParams
134 -> (NgramsType, MapListSize)
135 -> m (Map NgramsType [NgramsElement])
136 buildNgramsTermsList user uCid mCid groupParams (nt, mapListSize)= do
137
138 -- | Filter 0 With Double
139
140 -- Computing global speGen score
141 allTerms :: Map Text Double <- getTficf uCid mCid nt
142
143 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
144 socialLists' :: FlowCont Text FlowListScores
145 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
146 $ Map.fromList
147 $ List.zip (Map.keys allTerms)
148 (List.cycle [mempty])
149 )
150 let groupedWithList = toGroupedTree groupParams socialLists' allTerms
151 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType)
152 $ view flc_scores groupedWithList
153 (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
154
155 -- splitting monterms and multiterms to take proportional candidates
156 let
157 -- use % of list if to big, or Int if too small
158 listSizeGlobal = 2000 :: Double
159 monoSize = 0.4 :: Double
160 multSize = 1 - monoSize
161
162 splitAt n' ns = both (Map.fromListWith (<>))
163 $ List.splitAt (round $ n' * listSizeGlobal)
164 $ List.sortOn (viewScore . snd)
165 $ Map.toList ns
166
167 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
168 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
169
170 -------------------------
171 -- Filter 1 With Set NodeId and SpeGen
172 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
173
174 -- TO remove (and remove HasNodeError instance)
175 userListId <- defaultList uCid
176 masterListId <- defaultList mCid
177
178
179 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
180 [userListId, masterListId]
181 nt
182 selectedTerms
183
184 let
185 groupedTreeScores_SetNodeId :: Map Text (GroupedTreeScores (Set NodeId))
186 groupedTreeScores_SetNodeId = undefined
187 -- setScoresWith (\_ _ -> mempty) (groupedMonoHead <> groupedMultHead)
188 -- groupedTreeScores_SetNodeId = setScoresWith ((fromMaybe mempty) . ((flip Map.lookup) mapTextDocIds)) (groupedMonoHead <> groupedMultHead)
189
190 -- | Coocurrences computation
191 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
192 let mapCooc = Map.filter (>2)
193 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
194 | (t1, s1) <- mapStemNodeIds
195 , (t2, s2) <- mapStemNodeIds
196 ]
197 where
198 mapStemNodeIds = Map.toList
199 $ Map.map viewScores
200 $ groupedTreeScores_SetNodeId
201 let
202 -- computing scores
203 mapScores f = Map.fromList
204 $ map (\g -> (view scored_terms g, f g))
205 $ normalizeGlobal
206 $ map normalizeLocal
207 $ scored' mapCooc
208
209 let
210 groupedTreeScores_SpeGen :: Map Text (GroupedTreeScores (Scored Double))
211 groupedTreeScores_SpeGen = undefined
212 -- setScoresWith (\k v -> set gts'_score (Scored "" 0 0) v) (groupedMonoHead <> groupedMultHead)
213 -- groupedTreeScores_SpeGen = setScoresWith (\k v -> set gts'_score (fromMaybe (Scored "" 0 0) $ Map.lookup k (mapScores identity)) v) (groupedMonoHead <> groupedMultHead)
214
215 let
216 -- sort / partition / split
217 -- filter mono/multi again
218 (monoScored, multScored) = Map.partitionWithKey (\t _v -> size t < 2) groupedTreeScores_SpeGen
219 -- (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
220
221 -- filter with max score
222 partitionWithMaxScore = Map.partition (\g -> (view scored_genInc $ view gts'_score g)
223 > (view scored_speExc $ view gts'_score g)
224 )
225
226 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
227 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
228
229 -- splitAt
230 let
231 -- use % of list if to big, or Int if to small
232 listSizeLocal = 1000 :: Double
233 inclSize = 0.4 :: Double
234 exclSize = 1 - inclSize
235
236 splitAt' n' = (both (Map.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
237 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . Map.toList
238
239 monoInc_size = splitAt' $ monoSize * inclSize / 2
240 (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
241 (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
242
243 multExc_size = splitAt' $ multSize * exclSize / 2
244 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
245 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
246
247 -- Final Step building the Typed list
248 termListHead = maps <> cands
249 where
250 maps = setListType (Just MapTerm)
251 $ monoScoredInclHead
252 <> monoScoredExclHead
253 <> multScoredInclHead
254 <> multScoredExclHead
255
256 cands = setListType (Just CandidateTerm)
257 $ monoScoredInclTail
258 <> monoScoredExclTail
259 <> multScoredInclTail
260 <> multScoredExclTail
261
262 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
263
264 let result = Map.unionsWith (<>)
265 [ Map.fromList [( nt, toNgramsElement termListHead
266 <> toNgramsElement termListTail
267 <> toNgramsElement stopTerms
268 )]
269 ]
270
271 pure result