]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[Warnings+FEAT] insert for new created ngrams
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.Map (Map)
20 import Data.Monoid (mempty)
21 import Data.Ord (Down(..))
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (both)
25 import Gargantext.API.Ngrams.Types (NgramsElement)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45 import qualified Data.List as List
46 import qualified Data.Map as Map
47 import qualified Data.Set as Set
48
49
50 {-
51 -- TODO maybe useful for later
52 isStopTerm :: StopSize -> Text -> Bool
53 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
54 where
55 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
56 -}
57
58
59
60 -- | TODO improve grouping functions of Authors, Sources, Institutes..
61 buildNgramsLists :: ( RepoCmdM env err m
62 , CmdM env err m
63 , HasTreeError err
64 , HasNodeError err
65 )
66 => User
67 -> GroupParams
68 -> UserCorpusId
69 -> MasterCorpusId
70 -> m (Map NgramsType [NgramsElement])
71 buildNgramsLists user gp uCid mCid = do
72 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
73 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
74 [ (Authors , MapListSize 9)
75 , (Sources , MapListSize 9)
76 , (Institutes, MapListSize 9)
77 ]
78
79 pure $ Map.unions $ [ngTerms] <> othersTerms
80
81
82 data MapListSize = MapListSize { unMapListSize :: !Int }
83
84 buildNgramsOthersList ::( HasNodeError err
85 , CmdM env err m
86 , RepoCmdM env err m
87 , HasTreeError err
88 )
89 => User
90 -> UserCorpusId
91 -> GroupParams
92 -> (NgramsType, MapListSize)
93 -> m (Map NgramsType [NgramsElement])
94 buildNgramsOthersList user uCid groupParams (nt, MapListSize mapListSize) = do
95 allTerms :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
96
97 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
98 socialLists' :: FlowCont Text FlowListScores
99 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
100 $ Map.fromList
101 $ List.zip (Map.keys allTerms)
102 (List.cycle [mempty])
103 )
104 let
105 groupedWithList = toGroupedTree groupParams socialLists' allTerms
106
107 let
108 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList
109 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
110
111 listSize = mapListSize - (List.length mapTerms)
112 (mapTerms', candiTerms) = both Map.fromList
113 $ List.splitAt listSize
114 $ List.sortOn (Down . viewScore . snd)
115 $ Map.toList tailTerms'
116
117 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
118 <> (toNgramsElement mapTerms )
119 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
120 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
121 )]
122
123
124 -- TODO use ListIds
125 buildNgramsTermsList :: ( HasNodeError err
126 , CmdM env err m
127 , RepoCmdM env err m
128 , HasTreeError err
129 )
130 => User
131 -> UserCorpusId
132 -> MasterCorpusId
133 -> GroupParams
134 -> (NgramsType, MapListSize)
135 -> m (Map NgramsType [NgramsElement])
136 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
137
138 -- | Filter 0 With Double
139
140 -- Computing global speGen score
141 allTerms :: Map Text Double <- getTficf uCid mCid nt
142
143 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
144 socialLists' :: FlowCont Text FlowListScores
145 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
146 $ Map.fromList
147 $ List.zip (Map.keys allTerms)
148 (List.cycle [mempty])
149 )
150 let groupedWithList = toGroupedTree groupParams socialLists' allTerms
151 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType)
152 $ view flc_scores groupedWithList
153 (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
154
155 -- splitting monterms and multiterms to take proportional candidates
156 let
157 -- use % of list if to big, or Int if too small
158 listSizeGlobal = 2000 :: Double
159 monoSize = 0.4 :: Double
160 multSize = 1 - monoSize
161
162 splitAt n' ns = both (Map.fromListWith (<>))
163 $ List.splitAt (round $ n' * listSizeGlobal)
164 $ List.sortOn (viewScore . snd)
165 $ Map.toList ns
166
167 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
168 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
169
170 -------------------------
171 -- Filter 1 With Set NodeId and SpeGen
172 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
173
174 -- TO remove (and remove HasNodeError instance)
175 userListId <- defaultList uCid
176 masterListId <- defaultList mCid
177
178
179 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
180 [userListId, masterListId]
181 nt
182 selectedTerms
183
184 let
185 groupedTreeScores_SetNodeId :: Map Text (GroupedTreeScores (Set NodeId))
186 groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
187
188 -- | Coocurrences computation
189 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
190 let mapCooc = Map.filter (>2)
191 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
192 | (t1, s1) <- mapStemNodeIds
193 , (t2, s2) <- mapStemNodeIds
194 ]
195 where
196 mapStemNodeIds = Map.toList
197 $ Map.map viewScores
198 $ groupedTreeScores_SetNodeId
199 let
200 -- computing scores
201 mapScores f = Map.fromList
202 $ map (\g -> (view scored_terms g, f g))
203 $ normalizeGlobal
204 $ map normalizeLocal
205 $ scored' mapCooc
206
207 let
208 groupedTreeScores_SpeGen :: Map Text (GroupedTreeScores (Scored Text))
209 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) (groupedMonoHead <> groupedMultHead)
210
211 let
212 -- sort / partition / split
213 -- filter mono/multi again
214 (monoScored, multScored) = Map.partitionWithKey (\t _v -> size t < 2) groupedTreeScores_SpeGen
215
216 -- filter with max score
217 partitionWithMaxScore = Map.partition (\g -> (view scored_genInc $ view gts'_score g)
218 > (view scored_speExc $ view gts'_score g)
219 )
220
221 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
222 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
223
224 -- splitAt
225 let
226 -- use % of list if to big, or Int if to small
227 listSizeLocal = 1000 :: Double
228 inclSize = 0.4 :: Double
229 exclSize = 1 - inclSize
230
231 splitAt' n' = (both (Map.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
232 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . Map.toList
233
234
235 monoInc_size = splitAt' $ monoSize * inclSize / 2
236 (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
237 (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
238
239 multExc_size = splitAt' $ multSize * exclSize / 2
240 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
241 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
242
243
244 ------------------------------------------------------------
245
246 -- Final Step building the Typed list
247 termListHead = maps <> cands
248 where
249 maps = setListType (Just MapTerm)
250 $ monoScoredInclHead
251 <> monoScoredExclHead
252 <> multScoredInclHead
253 <> multScoredExclHead
254
255 cands = setListType (Just CandidateTerm)
256 $ monoScoredInclTail
257 <> monoScoredExclTail
258 <> multScoredInclTail
259 <> multScoredExclTail
260
261 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
262
263 let result = Map.unionsWith (<>)
264 [ Map.fromList [( nt, toNgramsElement termListHead
265 <> toNgramsElement termListTail
266 <> toNgramsElement stopTerms
267 )]
268 ]
269
270 pure result