]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FIX] PubMed parser
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.Map (Map)
20 import Data.Monoid (mempty)
21 import Data.Ord (Down(..))
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (both)
25 import Gargantext.API.Ngrams.Types (NgramsElement)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45 import qualified Data.List as List
46 import qualified Data.Map as Map
47 import qualified Data.Set as Set
48
49
50 {-
51 -- TODO maybe useful for later
52 isStopTerm :: StopSize -> Text -> Bool
53 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
54 where
55 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
56 -}
57
58
59 -- | TODO improve grouping functions of Authors, Sources, Institutes..
60 buildNgramsLists :: ( RepoCmdM env err m
61 , CmdM env err m
62 , HasTreeError err
63 , HasNodeError err
64 )
65 => User
66 -> GroupParams
67 -> UserCorpusId
68 -> MasterCorpusId
69 -> m (Map NgramsType [NgramsElement])
70 buildNgramsLists user gp uCid mCid = do
71 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
72 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
73 [ (Authors , MapListSize 9)
74 , (Sources , MapListSize 9)
75 , (Institutes, MapListSize 9)
76 ]
77
78 pure $ Map.unions $ [ngTerms] <> othersTerms
79
80
81 data MapListSize = MapListSize { unMapListSize :: !Int }
82
83 buildNgramsOthersList ::( HasNodeError err
84 , CmdM env err m
85 , RepoCmdM env err m
86 , HasTreeError err
87 )
88 => User
89 -> UserCorpusId
90 -> GroupParams
91 -> (NgramsType, MapListSize)
92 -> m (Map NgramsType [NgramsElement])
93 buildNgramsOthersList user uCid groupParams (nt, MapListSize mapListSize) = do
94 allTerms :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
95
96 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
97 socialLists :: FlowCont Text FlowListScores
98 <- flowSocialList MySelfFirst user nt ( FlowCont Map.empty
99 $ Map.fromList
100 $ List.zip (Map.keys allTerms)
101 (List.cycle [mempty])
102 )
103
104 let
105 groupedWithList = toGroupedTree groupParams socialLists allTerms
106
107 let
108 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType)
109 $ view flc_scores groupedWithList
110
111 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
112
113 listSize = mapListSize - (List.length mapTerms)
114 (mapTerms', candiTerms) = both Map.fromList
115 $ List.splitAt listSize
116 $ List.sortOn (Down . viewScore . snd)
117 $ Map.toList tailTerms'
118
119 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
120 <> (toNgramsElement mapTerms )
121 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
122 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
123 )]
124
125
126 -- TODO use ListIds
127 buildNgramsTermsList :: ( HasNodeError err
128 , CmdM env err m
129 , RepoCmdM env err m
130 , HasTreeError err
131 )
132 => User
133 -> UserCorpusId
134 -> MasterCorpusId
135 -> GroupParams
136 -> (NgramsType, MapListSize)
137 -> m (Map NgramsType [NgramsElement])
138 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
139
140 -- | Filter 0 With Double
141 -- Computing global speGen score
142 allTerms :: Map Text Double <- getTficf uCid mCid nt
143
144 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
145 socialLists :: FlowCont Text FlowListScores
146 <- flowSocialList MySelfFirst user nt ( FlowCont Map.empty
147 $ Map.fromList
148 $ List.zip (Map.keys allTerms)
149 (List.cycle [mempty])
150 )
151
152 let groupedWithList = toGroupedTree groupParams socialLists allTerms
153 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType)
154 $ view flc_scores groupedWithList
155
156 (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
157
158 -- printDebug "stopTerms" stopTerms
159
160 -- splitting monterms and multiterms to take proportional candidates
161 let
162 -- use % of list if to big, or Int if too small
163 listSizeGlobal = 2000 :: Double
164 monoSize = 0.4 :: Double
165 multSize = 1 - monoSize
166
167 splitAt n' ns = both (Map.fromListWith (<>))
168 $ List.splitAt (round $ n' * listSizeGlobal)
169 $ List.sortOn (viewScore . snd)
170 $ Map.toList ns
171
172 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
173 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
174
175 -------------------------
176 -- Filter 1 With Set NodeId and SpeGen
177 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
178
179
180 -- TO remove (and remove HasNodeError instance)
181 userListId <- defaultList uCid
182 masterListId <- defaultList mCid
183
184
185 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
186 [userListId, masterListId]
187 nt
188 selectedTerms
189
190 let
191 groupedTreeScores_SetNodeId :: Map Text (GroupedTreeScores (Set NodeId))
192 groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
193
194 -- | Coocurrences computation
195 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
196 let mapCooc = Map.filter (>2)
197 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
198 | (t1, s1) <- mapStemNodeIds
199 , (t2, s2) <- mapStemNodeIds
200 ]
201 where
202 mapStemNodeIds = Map.toList
203 $ Map.map viewScores
204 $ groupedTreeScores_SetNodeId
205 let
206 -- computing scores
207 mapScores f = Map.fromList
208 $ map (\g -> (view scored_terms g, f g))
209 $ normalizeGlobal
210 $ map normalizeLocal
211 $ scored' mapCooc
212
213 let
214 groupedTreeScores_SpeGen :: Map Text (GroupedTreeScores (Scored Text))
215 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity)
216 ( groupedMonoHead
217 <> groupedMultHead
218 )
219
220 let
221 -- sort / partition / split
222 -- filter mono/multi again
223 (monoScored, multScored) = Map.partitionWithKey (\t _v -> size t < 2) groupedTreeScores_SpeGen
224
225 -- filter with max score
226 partitionWithMaxScore = Map.partition (\g -> (view scored_genInc $ view gts'_score g)
227 > (view scored_speExc $ view gts'_score g)
228 )
229
230 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
231 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
232
233 -- splitAt
234 let
235 -- use % of list if to big, or Int if to small
236 listSizeLocal = 1000 :: Double
237 inclSize = 0.4 :: Double
238 exclSize = 1 - inclSize
239
240 splitAt' n' = (both (Map.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
241 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . Map.toList
242
243
244 monoInc_size = splitAt' $ monoSize * inclSize / 2
245 (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
246 (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
247
248 multExc_size = splitAt' $ multSize * exclSize / 2
249 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
250 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
251
252
253 ------------------------------------------------------------
254
255 -- Final Step building the Typed list
256 termListHead = maps <> cands
257 where
258 maps = setListType (Just MapTerm)
259 $ monoScoredInclHead
260 <> monoScoredExclHead
261 <> multScoredInclHead
262 <> multScoredExclHead
263
264 cands = setListType (Just CandidateTerm)
265 $ monoScoredInclTail
266 <> monoScoredExclTail
267 <> multScoredInclTail
268 <> multScoredExclTail
269
270 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
271
272 let result = Map.unionsWith (<>)
273 [ Map.fromList [( nt, toNgramsElement termListHead
274 <> toNgramsElement termListTail
275 <> toNgramsElement stopTerms
276 )]
277 ]
278
279 -- printDebug "result" result
280
281 pure result