]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[CLEAN] removing previous code (WIP)
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.Map (Map)
20 import Data.Maybe (catMaybes, fromMaybe)
21 import Data.Monoid (mempty)
22 import Data.Ord (Down(..))
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement)
27 import Gargantext.API.Ngrams.Types (RepoCmdM)
28 import Gargantext.Core.Text (size)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
43 import Gargantext.Database.Query.Tree.Error (HasTreeError)
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import qualified Data.Char as Char
47 import qualified Data.List as List
48 import qualified Data.Map as Map
49 import qualified Data.Set as Set
50 import qualified Data.Text as Text
51
52
53 {-
54 -- TODO maybe useful for later
55 isStopTerm :: StopSize -> Text -> Bool
56 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
57 where
58 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
59 -}
60
61
62
63 -- | TODO improve grouping functions of Authors, Sources, Institutes..
64 buildNgramsLists :: ( RepoCmdM env err m
65 , CmdM env err m
66 , HasTreeError err
67 , HasNodeError err
68 )
69 => User
70 -> GroupParams
71 -> UserCorpusId
72 -> MasterCorpusId
73 -> m (Map NgramsType [NgramsElement])
74 buildNgramsLists user gp uCid mCid = do
75 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
76 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
77 [ (Authors , MapListSize 9)
78 , (Sources , MapListSize 9)
79 , (Institutes, MapListSize 9)
80 ]
81
82 pure $ Map.unions $ [ngTerms] <> othersTerms
83
84
85 data MapListSize = MapListSize { unMapListSize :: !Int }
86
87 buildNgramsOthersList ::( HasNodeError err
88 , CmdM env err m
89 , RepoCmdM env err m
90 , HasTreeError err
91 )
92 => User
93 -> UserCorpusId
94 -> GroupParams
95 -> (NgramsType, MapListSize)
96 -> m (Map NgramsType [NgramsElement])
97 buildNgramsOthersList user uCid groupParams (nt, MapListSize mapListSize) = do
98 allTerms :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
99
100 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
101 socialLists' :: FlowCont Text FlowListScores
102 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
103 $ Map.fromList
104 $ List.zip (Map.keys allTerms)
105 (List.cycle [mempty])
106 )
107 let
108 groupedWithList = toGroupedTree groupParams socialLists' allTerms
109
110 let
111 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList
112 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
113
114 listSize = mapListSize - (List.length mapTerms)
115 (mapTerms', candiTerms) = both Map.fromList
116 $ List.splitAt listSize
117 $ List.sortOn (Down . viewScore . snd)
118 $ Map.toList tailTerms'
119
120 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
121 <> (toNgramsElement mapTerms )
122 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
123 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
124 )]
125
126
127 -- TODO use ListIds
128 buildNgramsTermsList :: ( HasNodeError err
129 , CmdM env err m
130 , RepoCmdM env err m
131 , HasTreeError err
132 )
133 => User
134 -> UserCorpusId
135 -> MasterCorpusId
136 -> GroupParams
137 -> (NgramsType, MapListSize)
138 -> m (Map NgramsType [NgramsElement])
139 buildNgramsTermsList user uCid mCid groupParams (nt, mapListSize)= do
140
141 -- Computing global speGen score
142 allTerms :: Map Text Double <- getTficf uCid mCid nt
143
144 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
145 socialLists' :: FlowCont Text FlowListScores
146 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
147 $ Map.fromList
148 $ List.zip (Map.keys allTerms)
149 (List.cycle [mempty])
150 )
151 let groupedWithList = toGroupedTree groupParams socialLists' allTerms
152 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType)
153 $ view flc_scores groupedWithList
154 (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
155
156 -- splitting monterms and multiterms to take proportional candidates
157 let
158 -- use % of list if to big, or Int if too small
159 listSizeGlobal = 2000 :: Double
160 monoSize = 0.4 :: Double
161 multSize = 1 - monoSize
162
163 splitAt n' ns = both (Map.fromListWith (<>))
164 $ List.splitAt (round $ n' * listSizeGlobal)
165 $ List.sortOn (viewScore . snd)
166 $ Map.toList ns
167
168 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
169 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
170
171 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
172
173 -- TO remove (and remove HasNodeError instance)
174 userListId <- defaultList uCid
175 masterListId <- defaultList mCid
176
177 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
178 [userListId, masterListId]
179 nt
180 selectedTerms
181
182 let
183 groupedTreeScores_SetNodeId :: Map Text (GroupedTreeScores (Set NodeId))
184 groupedTreeScores_SetNodeId = undefined
185 -- setScoresWith (\_ _ -> mempty) (groupedMonoHead <> groupedMultHead)
186 -- groupedTreeScores_SetNodeId = setScoresWith ((fromMaybe mempty) . ((flip Map.lookup) mapTextDocIds)) (groupedMonoHead <> groupedMultHead)
187
188 -- | Coocurrences computation
189 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
190 let mapCooc = Map.filter (>2)
191 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
192 | (t1, s1) <- mapStemNodeIds
193 , (t2, s2) <- mapStemNodeIds
194 ]
195 where
196 mapStemNodeIds = Map.toList
197 $ Map.map viewScores
198 $ groupedTreeScores_SetNodeId
199 let
200 -- computing scores
201 mapScores f = Map.fromList
202 $ map (\g -> (view scored_terms g, f g))
203 $ normalizeGlobal
204 $ map normalizeLocal
205 $ scored' mapCooc
206
207 let
208 groupedTreeScores_SpeGen :: Map Text (GroupedTreeScores (Scored Double))
209 groupedTreeScores_SpeGen = undefined
210 -- setScoresWith (\k v -> set gts'_score (Scored "" 0 0) v) (groupedMonoHead <> groupedMultHead)
211 -- groupedTreeScores_SpeGen = setScoresWith (\k v -> set gts'_score (fromMaybe (Scored "" 0 0) $ Map.lookup k (mapScores identity)) v) (groupedMonoHead <> groupedMultHead)
212
213 let
214 -- sort / partition / split
215 -- filter mono/multi again
216 (monoScored, multScored) = Map.partitionWithKey (\t _v -> size t < 2) groupedTreeScores_SpeGen
217 -- (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
218
219 -- filter with max score
220 partitionWithMaxScore = Map.partition (\g -> (view scored_genInc $ view gts'_score g)
221 > (view scored_speExc $ view gts'_score g)
222 )
223
224 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
225 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
226
227 -- splitAt
228 let
229 -- use % of list if to big, or Int if to small
230 listSizeLocal = 1000 :: Double
231 inclSize = 0.4 :: Double
232 exclSize = 1 - inclSize
233
234 splitAt' n' = (both (Map.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
235 sortOn f = (List.sortOn (Down . f . _gts'_score . snd)) . Map.toList
236
237 monoInc_size = monoSize * inclSize / 2
238 (monoScoredInclHead, monoScoredInclTail) = splitAt' monoInc_size $ (sortOn _scored_genInc) monoScoredIncl
239 (monoScoredExclHead, monoScoredExclTail) = splitAt' monoInc_size $ (sortOn _scored_speExc) monoScoredExcl
240
241 multExc_size = multSize * exclSize / 2
242 (multScoredInclHead, multScoredInclTail) = splitAt' multExc_size $ (sortOn _scored_genInc) multScoredIncl
243 (multScoredExclHead, multScoredExclTail) = splitAt' multExc_size $ (sortOn _scored_speExc) multScoredExcl
244
245 -- Final Step building the Typed list
246 termListHead = maps <> cands
247 where
248 maps = setListType (Just MapTerm)
249 $ monoScoredInclHead
250 <> monoScoredExclHead
251 <> multScoredInclHead
252 <> multScoredExclHead
253
254 cands = setListType (Just CandidateTerm)
255 $ monoScoredInclTail
256 <> monoScoredExclTail
257 <> multScoredInclTail
258 <> multScoredExclTail
259
260 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
261
262 let result = Map.unionsWith (<>)
263 [ Map.fromList [( nt, toNgramsElement termListHead
264 <> toNgramsElement termListTail
265 <> toNgramsElement stopTerms
266 )]
267 ]
268
269 pure result
270
271 ------------------------------------------------------------------------------