]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Metrics/NgramsByNode.hs
Eleve: improve splitting which passes 5/7 tests but still lacks a crucial point
[gargantext.git] / src / Gargantext / Database / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17 {-# LANGUAGE RankNTypes #-}
18
19 module Gargantext.Database.Metrics.NgramsByNode
20 where
21
22 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
23 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.Core (Lang(..))
30 import Gargantext.Database.Config (nodeTypeId)
31 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
32 import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
33 import Gargantext.Database.Utils (Cmd, runPGSQuery)
34 import Gargantext.Prelude
35 import Gargantext.Text.Metrics.TFICF
36 import Gargantext.Text.Terms.Mono.Stem (stem)
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Data.Text as Text
41 import qualified Database.PostgreSQL.Simple as DPS
42
43 -- | TODO: group with 2 terms only can be
44 -- discussed. Main purpose of this is offering
45 -- a first grouping option to user and get some
46 -- enriched data to better learn and improve that algo
47 ngramsGroup :: Lang -> Int -> Int -> Text -> Text
48 ngramsGroup l _m _n = Text.intercalate " "
49 . map (stem l)
50 -- . take n
51 . List.sort
52 -- . (List.filter (\t -> Text.length t > m))
53 . Text.splitOn " "
54 . Text.replace "-" " "
55
56
57 sortTficf :: (Map Text (Double, Set Text))
58 -> [ (Text,(Double, Set Text))]
59 sortTficf = List.sortOn (fst . snd) . toList
60
61
62 getTficf' :: UserCorpusId -> MasterCorpusId -> NgramsType -> (Text -> Text)
63 -> Cmd err (Map Text (Double, Set Text))
64 getTficf' u m nt f = do
65 u' <- getNodesByNgramsUser u nt
66 m' <- getNodesByNgramsMaster u m
67
68 pure $ toTficfData (countNodesByNgramsWith f u')
69 (countNodesByNgramsWith f m')
70
71 --{-
72 getTficfWith :: UserCorpusId -> MasterCorpusId -> [ListId]
73 -> NgramsType -> Map Text (Maybe Text)
74 -> Cmd err (Map Text (Double, Set Text))
75 getTficfWith u m ls nt mtxt = do
76 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
77 m' <- getNodesByNgramsMaster u m
78
79 let f x = case Map.lookup x mtxt of
80 Nothing -> x
81 Just x' -> maybe x identity x'
82
83 pure $ toTficfData (countNodesByNgramsWith f u')
84 (countNodesByNgramsWith f m')
85 --}
86
87
88 type Context = (Double, Map Text (Double, Set Text))
89 type Supra = Context
90 type Infra = Context
91
92 toTficfData :: Infra -> Supra
93 -> Map Text (Double, Set Text)
94 toTficfData (ti, mi) (ts, ms) =
95 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
96 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
97 , ns
98 )
99 )
100 | (t, (n,ns)) <- toList mi
101 ]
102
103
104 -- | fst is size of Supra Corpus
105 -- snd is Texts and size of Occurrences (different docs)
106 countNodesByNgramsWith :: (Text -> Text)
107 -> Map Text (Set NodeId)
108 -> (Double, Map Text (Double, Set Text))
109 countNodesByNgramsWith f m = (total, m')
110 where
111 total = fromIntegral $ Set.size $ Set.unions $ elems m
112 m' = Map.map ( swap . second (fromIntegral . Set.size))
113 $ groupNodesByNgramsWith f m
114
115
116 groupNodesByNgramsWith :: (Text -> Text)
117 -> Map Text (Set NodeId)
118 -> Map Text (Set Text, Set NodeId)
119 groupNodesByNgramsWith f m =
120 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
121 $ toList m
122
123 ------------------------------------------------------------------------
124 getNodesByNgramsUser :: CorpusId -> NgramsType
125 -> Cmd err (Map Text (Set NodeId))
126 getNodesByNgramsUser cId nt =
127 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
128 <$> selectNgramsByNodeUser cId nt
129 where
130
131 selectNgramsByNodeUser :: CorpusId -> NgramsType
132 -> Cmd err [(NodeId, Text)]
133 selectNgramsByNodeUser cId' nt' =
134 runPGSQuery queryNgramsByNodeUser
135 ( cId'
136 , nodeTypeId NodeDocument
137 , ngramsTypeId nt'
138 -- , 100 :: Int -- limit
139 -- , 0 :: Int -- offset
140 )
141
142 queryNgramsByNodeUser :: DPS.Query
143 queryNgramsByNodeUser = [sql|
144
145 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
146 JOIN ngrams ng ON nng.ngrams_id = ng.id
147 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
148 JOIN nodes n ON nn.node2_id = n.id
149 WHERE nn.node1_id = ? -- CorpusId
150 AND n.typename = ? -- NodeTypeId
151 AND nng.ngrams_type = ? -- NgramsTypeId
152 AND nn.delete = False
153 GROUP BY nng.node2_id, ng.terms
154 ORDER BY (nng.node2_id, ng.terms) DESC
155 -- LIMIT ?
156 -- OFFSET ?
157 |]
158 ------------------------------------------------------------------------
159 -- TODO add groups
160 getOccByNgramsOnlyFast :: CorpusId -> NgramsType -> [Text]
161 -> Cmd err (Map Text Int)
162 getOccByNgramsOnlyFast cId nt ngs =
163 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
164
165 -- just slower than getOccByNgramsOnlyFast
166 getOccByNgramsOnlySlow :: NodeType -> CorpusId -> [ListId] -> NgramsType -> [Text]
167 -> Cmd err (Map Text Int)
168 getOccByNgramsOnlySlow t cId ls nt ngs =
169 Map.map Set.size <$> getScore' t cId ls nt ngs
170 where
171 getScore' NodeCorpus = getNodesByNgramsOnlyUser
172 getScore' NodeDocument = getNgramsByDocOnlyUser
173 getScore' _ = getNodesByNgramsOnlyUser
174
175 getOccByNgramsOnlySafe :: CorpusId -> [ListId] -> NgramsType -> [Text]
176 -> Cmd err (Map Text Int)
177 getOccByNgramsOnlySafe cId ls nt ngs = do
178 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
179 fast <- getOccByNgramsOnlyFast cId nt ngs
180 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
181 when (fast /= slow) $
182 printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
183 pure slow
184
185
186 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
187 -> Cmd err [(Text, Int)]
188 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
189 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
190 ( Values fields (DPS.Only <$> tms)
191 , cId
192 , nodeTypeId NodeDocument
193 , ngramsTypeId nt
194 )
195 where
196 fields = [QualifiedIdentifier Nothing "text"]
197
198 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
199 -- Question: with the grouping is the result exactly the same (since Set NodeId for
200 -- equivalent ngrams intersections are not empty)
201 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
202 queryNgramsOccurrencesOnlyByNodeUser = [sql|
203
204 WITH input_rows(terms) AS (?)
205 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
206 JOIN ngrams ng ON nng.ngrams_id = ng.id
207 JOIN input_rows ir ON ir.terms = ng.terms
208 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
209 JOIN nodes n ON nn.node2_id = n.id
210 WHERE nn.node1_id = ? -- CorpusId
211 AND n.typename = ? -- NodeTypeId
212 AND nng.ngrams_type = ? -- NgramsTypeId
213 AND nn.delete = False
214 GROUP BY nng.node2_id, ng.terms
215 |]
216
217 getNodesByNgramsOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
218 -> Cmd err (Map Text (Set NodeId))
219 getNodesByNgramsOnlyUser cId ls nt ngs = Map.unionsWith (<>)
220 . map (fromListWith (<>) . map (second Set.singleton))
221 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt) (splitEvery 1000 ngs)
222
223 selectNgramsOnlyByNodeUser :: CorpusId -> [ListId] -> NgramsType -> [Text]
224 -> Cmd err [(Text, NodeId)]
225 selectNgramsOnlyByNodeUser cId ls nt tms =
226 runPGSQuery queryNgramsOnlyByNodeUser
227 ( Values fields (DPS.Only <$> tms)
228 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
229 , cId
230 , nodeTypeId NodeDocument
231 , ngramsTypeId nt
232 )
233 where
234 fields = [QualifiedIdentifier Nothing "text"]
235
236 queryNgramsOnlyByNodeUser :: DPS.Query
237 queryNgramsOnlyByNodeUser = [sql|
238
239 WITH input_rows(terms) AS (?),
240 input_list(id) AS (?)
241 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
242 JOIN ngrams ng ON nng.ngrams_id = ng.id
243 JOIN input_rows ir ON ir.terms = ng.terms
244 JOIN input_list il ON il.id = nng.node1_id
245 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
246 JOIN nodes n ON nn.node2_id = n.id
247 WHERE nn.node1_id = ? -- CorpusId
248 AND n.typename = ? -- NodeTypeId
249 AND nng.ngrams_type = ? -- NgramsTypeId
250 AND nn.delete = False
251 GROUP BY ng.terms, nng.node2_id
252 |]
253
254
255
256 getNgramsByDocOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
257 -> Cmd err (Map Text (Set NodeId))
258 getNgramsByDocOnlyUser cId ls nt ngs = Map.unionsWith (<>)
259 . map (fromListWith (<>) . map (second Set.singleton))
260 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
261
262
263
264 selectNgramsOnlyByDocUser :: DocId -> [ListId] -> NgramsType -> [Text]
265 -> Cmd err [(Text, NodeId)]
266 selectNgramsOnlyByDocUser dId ls nt tms =
267 runPGSQuery queryNgramsOnlyByDocUser
268 ( Values fields (DPS.Only <$> tms)
269 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
270 , dId
271 , ngramsTypeId nt
272 )
273 where
274 fields = [QualifiedIdentifier Nothing "text"]
275
276 queryNgramsOnlyByDocUser :: DPS.Query
277 queryNgramsOnlyByDocUser = [sql|
278
279 WITH input_rows(terms) AS (?),
280 input_list(id) AS (?)
281 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
282 JOIN ngrams ng ON nng.ngrams_id = ng.id
283 JOIN input_rows ir ON ir.terms = ng.terms
284 JOIN input_list il ON il.id = nng.node1_id
285 WHERE nng.node2_id = ? -- DocId
286 AND nng.ngrams_type = ? -- NgramsTypeId
287 GROUP BY ng.terms, nng.node2_id
288 |]
289
290
291 ------------------------------------------------------------------------
292 -- | TODO filter by language, database, any social field
293 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
294 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
295 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
296 -- . takeWhile (not . List.null)
297 -- . takeWhile (\l -> List.length l > 3)
298 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
299
300
301
302 type Limit = Int
303 type Offset = Int
304
305 selectNgramsByNodeMaster :: Int -> UserCorpusId -> MasterCorpusId -> Int -> Cmd err [(NodeId, Text)]
306 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
307 queryNgramsByNodeMaster'
308 ( ucId
309 , ngramsTypeId NgramsTerms
310 , nodeTypeId NodeDocument
311 , p
312 , nodeTypeId NodeDocument
313 , p
314 , n
315 , mcId
316 , nodeTypeId NodeDocument
317 , ngramsTypeId NgramsTerms
318 )
319
320 -- | TODO fix node_node_ngrams relation
321 queryNgramsByNodeMaster' :: DPS.Query
322 queryNgramsByNodeMaster' = [sql|
323
324 WITH nodesByNgramsUser AS (
325
326 SELECT n.id, ng.terms FROM nodes n
327 JOIN nodes_nodes nn ON n.id = nn.node2_id
328 JOIN node_node_ngrams nng ON nng.node2_id = n.id
329 JOIN ngrams ng ON nng.ngrams_id = ng.id
330 WHERE nn.node1_id = ? -- UserCorpusId
331 -- AND n.typename = ? -- NodeTypeId
332 AND nng.ngrams_type = ? -- NgramsTypeId
333 AND nn.delete = False
334 AND node_pos(n.id,?) >= ?
335 AND node_pos(n.id,?) < ?
336 GROUP BY n.id, ng.terms
337
338 ),
339
340 nodesByNgramsMaster AS (
341
342 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
343 JOIN node_node_ngrams nng ON n.id = nng.node2_id
344 JOIN ngrams ng ON ng.id = nng.ngrams_id
345
346 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
347 AND n.typename = ? -- NodeTypeId
348 AND nng.ngrams_type = ? -- NgramsTypeId
349 GROUP BY n.id, ng.terms
350 )
351
352 SELECT m.id, m.terms FROM nodesByNgramsMaster m
353 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
354 |]
355
356