]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Metrics/NgramsByNode.hs
[FIX] Score by Doc or Corpus.
[gargantext.git] / src / Gargantext / Database / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17 {-# LANGUAGE RankNTypes #-}
18
19 module Gargantext.Database.Metrics.NgramsByNode
20 where
21
22 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
23 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.Core (Lang(..))
30 import Gargantext.Database.Config (nodeTypeId)
31 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
32 import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
33 import Gargantext.Database.Utils (Cmd, runPGSQuery)
34 import Gargantext.Prelude
35 import Gargantext.Text.Metrics.TFICF
36 import Gargantext.Text.Terms.Mono.Stem (stem)
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Data.Text as Text
41 import qualified Database.PostgreSQL.Simple as DPS
42
43 -- | TODO: group with 2 terms only can be
44 -- discussed. Main purpose of this is offering
45 -- a first grouping option to user and get some
46 -- enriched data to better learn and improve that algo
47 ngramsGroup :: Lang -> Int -> Int -> Text -> Text
48 ngramsGroup l _m _n = Text.intercalate " "
49 . map (stem l)
50 -- . take n
51 . List.sort
52 -- . (List.filter (\t -> Text.length t > m))
53 . Text.splitOn " "
54 . Text.replace "-" " "
55
56
57 sortTficf :: (Map Text (Double, Set Text))
58 -> [ (Text,(Double, Set Text))]
59 sortTficf = List.sortOn (fst . snd) . toList
60
61
62 getTficf' :: UserCorpusId -> MasterCorpusId -> NgramsType -> (Text -> Text)
63 -> Cmd err (Map Text (Double, Set Text))
64 getTficf' u m nt f = do
65 u' <- getNodesByNgramsUser u nt
66 m' <- getNodesByNgramsMaster u m
67
68 pure $ toTficfData (countNodesByNgramsWith f u')
69 (countNodesByNgramsWith f m')
70
71 --{-
72 getTficfWith :: UserCorpusId -> MasterCorpusId -> [ListId]
73 -> NgramsType -> Map Text (Maybe Text)
74 -> Cmd err (Map Text (Double, Set Text))
75 getTficfWith u m ls nt mtxt = do
76 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
77 m' <- getNodesByNgramsMaster u m
78
79 let f x = case Map.lookup x mtxt of
80 Nothing -> x
81 Just x' -> maybe x identity x'
82
83 pure $ toTficfData (countNodesByNgramsWith f u')
84 (countNodesByNgramsWith f m')
85 --}
86
87
88 type Context = (Double, Map Text (Double, Set Text))
89 type Supra = Context
90 type Infra = Context
91
92 toTficfData :: Infra -> Supra
93 -> Map Text (Double, Set Text)
94 toTficfData (ti, mi) (ts, ms) =
95 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
96 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
97 , ns
98 )
99 )
100 | (t, (n,ns)) <- toList mi
101 ]
102
103
104 -- | fst is size of Supra Corpus
105 -- snd is Texts and size of Occurrences (different docs)
106 countNodesByNgramsWith :: (Text -> Text)
107 -> Map Text (Set NodeId)
108 -> (Double, Map Text (Double, Set Text))
109 countNodesByNgramsWith f m = (total, m')
110 where
111 total = fromIntegral $ Set.size $ Set.unions $ elems m
112 m' = Map.map ( swap . second (fromIntegral . Set.size))
113 $ groupNodesByNgramsWith f m
114
115
116 groupNodesByNgramsWith :: (Text -> Text)
117 -> Map Text (Set NodeId)
118 -> Map Text (Set Text, Set NodeId)
119 groupNodesByNgramsWith f m =
120 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
121 $ toList m
122
123 ------------------------------------------------------------------------
124 getNodesByNgramsUser :: CorpusId -> NgramsType
125 -> Cmd err (Map Text (Set NodeId))
126 getNodesByNgramsUser cId nt =
127 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
128 <$> selectNgramsByNodeUser cId nt
129
130 selectNgramsByNodeUser :: CorpusId -> NgramsType
131 -> Cmd err [(NodeId, Text)]
132 selectNgramsByNodeUser cId nt =
133 runPGSQuery queryNgramsByNodeUser
134 ( cId
135 , nodeTypeId NodeDocument
136 , ngramsTypeId nt
137 , 1000 :: Int -- limit
138 , 0 :: Int -- offset
139 )
140
141 queryNgramsByNodeUser :: DPS.Query
142 queryNgramsByNodeUser = [sql|
143
144 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
145 JOIN ngrams ng ON nng.ngrams_id = ng.id
146 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
147 JOIN nodes n ON nn.node2_id = n.id
148 WHERE nn.node1_id = ? -- CorpusId
149 AND n.typename = ? -- NodeTypeId
150 AND nng.ngrams_type = ? -- NgramsTypeId
151 AND nn.delete = False
152 GROUP BY nng.node2_id, ng.terms
153 ORDER BY (nng.node2_id, ng.terms) DESC
154 LIMIT ?
155 OFFSET ?
156 |]
157 ------------------------------------------------------------------------
158 -- TODO add groups
159 getOccByNgramsOnlyFast :: CorpusId -> NgramsType -> [Text]
160 -> Cmd err (Map Text Int)
161 getOccByNgramsOnlyFast cId nt ngs =
162 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
163
164 -- just slower than getOccByNgramsOnlyFast
165 getOccByNgramsOnlySlow :: NodeType -> CorpusId -> [ListId] -> NgramsType -> [Text]
166 -> Cmd err (Map Text Int)
167 getOccByNgramsOnlySlow t cId ls nt ngs =
168 Map.map Set.size <$> getScore' t cId ls nt ngs
169 where
170 getScore' NodeCorpus = getNodesByNgramsOnlyUser
171 getScore' NodeDocument = getNgramsByDocOnlyUser
172 getScore' _ = getNodesByNgramsOnlyUser
173
174 getOccByNgramsOnlySafe :: CorpusId -> [ListId] -> NgramsType -> [Text]
175 -> Cmd err (Map Text Int)
176 getOccByNgramsOnlySafe cId ls nt ngs = do
177 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
178 fast <- getOccByNgramsOnlyFast cId nt ngs
179 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
180 when (fast /= slow) $
181 printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
182 pure slow
183
184
185 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
186 -> Cmd err [(Text, Int)]
187 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
188 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
189 ( Values fields (DPS.Only <$> tms)
190 , cId
191 , nodeTypeId NodeDocument
192 , ngramsTypeId nt
193 )
194 where
195 fields = [QualifiedIdentifier Nothing "text"]
196
197 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
198 -- Question: with the grouping is the result exactly the same (since Set NodeId for
199 -- equivalent ngrams intersections are not empty)
200 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
201 queryNgramsOccurrencesOnlyByNodeUser = [sql|
202
203 WITH input_rows(terms) AS (?)
204 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
205 JOIN ngrams ng ON nng.ngrams_id = ng.id
206 JOIN input_rows ir ON ir.terms = ng.terms
207 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
208 JOIN nodes n ON nn.node2_id = n.id
209 WHERE nn.node1_id = ? -- CorpusId
210 AND n.typename = ? -- NodeTypeId
211 AND nng.ngrams_type = ? -- NgramsTypeId
212 AND nn.delete = False
213 GROUP BY nng.node2_id, ng.terms
214 |]
215
216 getNodesByNgramsOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
217 -> Cmd err (Map Text (Set NodeId))
218 getNodesByNgramsOnlyUser cId ls nt ngs = Map.unionsWith (<>)
219 . map (fromListWith (<>) . map (second Set.singleton))
220 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt) (splitEvery 1000 ngs)
221
222 selectNgramsOnlyByNodeUser :: CorpusId -> [ListId] -> NgramsType -> [Text]
223 -> Cmd err [(Text, NodeId)]
224 selectNgramsOnlyByNodeUser cId ls nt tms =
225 runPGSQuery queryNgramsOnlyByNodeUser
226 ( Values fields (DPS.Only <$> tms)
227 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
228 , cId
229 , nodeTypeId NodeDocument
230 , ngramsTypeId nt
231 )
232 where
233 fields = [QualifiedIdentifier Nothing "text"]
234
235 queryNgramsOnlyByNodeUser :: DPS.Query
236 queryNgramsOnlyByNodeUser = [sql|
237
238 WITH input_rows(terms) AS (?),
239 input_list(id) AS (?)
240 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
241 JOIN ngrams ng ON nng.ngrams_id = ng.id
242 JOIN input_rows ir ON ir.terms = ng.terms
243 JOIN input_list il ON il.id = nng.node1_id
244 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
245 JOIN nodes n ON nn.node2_id = n.id
246 WHERE nn.node1_id = ? -- CorpusId
247 AND n.typename = ? -- NodeTypeId
248 AND nng.ngrams_type = ? -- NgramsTypeId
249 AND nn.delete = False
250 GROUP BY ng.terms, nng.node2_id
251 |]
252
253
254
255 getNgramsByDocOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
256 -> Cmd err (Map Text (Set NodeId))
257 getNgramsByDocOnlyUser cId ls nt ngs = Map.unionsWith (<>)
258 . map (fromListWith (<>) . map (second Set.singleton))
259 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
260
261
262
263 selectNgramsOnlyByDocUser :: DocId -> [ListId] -> NgramsType -> [Text]
264 -> Cmd err [(Text, NodeId)]
265 selectNgramsOnlyByDocUser dId ls nt tms =
266 runPGSQuery queryNgramsOnlyByDocUser
267 ( Values fields (DPS.Only <$> tms)
268 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
269 , dId
270 , ngramsTypeId nt
271 )
272 where
273 fields = [QualifiedIdentifier Nothing "text"]
274
275 queryNgramsOnlyByDocUser :: DPS.Query
276 queryNgramsOnlyByDocUser = [sql|
277
278 WITH input_rows(terms) AS (?),
279 input_list(id) AS (?)
280 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
281 JOIN ngrams ng ON nng.ngrams_id = ng.id
282 JOIN input_rows ir ON ir.terms = ng.terms
283 JOIN input_list il ON il.id = nng.node1_id
284 WHERE nng.node2_id = ? -- DocId
285 AND nng.ngrams_type = ? -- NgramsTypeId
286 GROUP BY ng.terms, nng.node2_id
287 |]
288
289
290 ------------------------------------------------------------------------
291 -- | TODO filter by language, database, any social field
292 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
293 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
294 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
295 -- . takeWhile (not . List.null)
296 -- . takeWhile (\l -> List.length l > 3)
297 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
298
299
300
301 type Limit = Int
302 type Offset = Int
303
304 selectNgramsByNodeMaster :: Int -> UserCorpusId -> MasterCorpusId -> Int -> Cmd err [(NodeId, Text)]
305 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
306 queryNgramsByNodeMaster'
307 ( ucId
308 , ngramsTypeId NgramsTerms
309 , nodeTypeId NodeDocument
310 , p
311 , nodeTypeId NodeDocument
312 , p
313 , n
314 , mcId
315 , nodeTypeId NodeDocument
316 , ngramsTypeId NgramsTerms
317 )
318
319 -- | TODO fix node_node_ngrams relation
320 queryNgramsByNodeMaster' :: DPS.Query
321 queryNgramsByNodeMaster' = [sql|
322
323 WITH nodesByNgramsUser AS (
324
325 SELECT n.id, ng.terms FROM nodes n
326 JOIN nodes_nodes nn ON n.id = nn.node2_id
327 JOIN node_node_ngrams nng ON nng.node2_id = n.id
328 JOIN ngrams ng ON nng.ngrams_id = ng.id
329 WHERE nn.node1_id = ? -- UserCorpusId
330 -- AND n.typename = ? -- NodeTypeId
331 AND nng.ngrams_type = ? -- NgramsTypeId
332 AND nn.delete = False
333 AND node_pos(n.id,?) >= ?
334 AND node_pos(n.id,?) < ?
335 GROUP BY n.id, ng.terms
336
337 ),
338
339 nodesByNgramsMaster AS (
340
341 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
342 JOIN node_node_ngrams nng ON n.id = nng.node2_id
343 JOIN ngrams ng ON ng.id = nng.ngrams_id
344
345 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
346 AND n.typename = ? -- NodeTypeId
347 AND nng.ngrams_type = ? -- NgramsTypeId
348 GROUP BY n.id, ng.terms
349 )
350
351 SELECT m.id, m.terms FROM nodesByNgramsMaster m
352 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
353 |]
354
355