]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Metrics/NgramsByNode.hs
[Phylo][Merge] Fix warnings and adding Eq instance to Phylo for Behavior test.
[gargantext.git] / src / Gargantext / Database / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17 {-# LANGUAGE RankNTypes #-}
18
19 module Gargantext.Database.Metrics.NgramsByNode
20 where
21
22 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
23 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.Core (Lang(..))
30 import Gargantext.Database.Config (nodeTypeId)
31 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
32 import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
33 import Gargantext.Database.Utils (Cmd, runPGSQuery)
34 import Gargantext.Prelude
35 import Gargantext.Text.Metrics.TFICF
36 import Gargantext.Text.Terms.Mono.Stem (stem)
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Data.Text as Text
41 import qualified Database.PostgreSQL.Simple as DPS
42
43 -- | TODO: group with 2 terms only can be
44 -- discussed. Main purpose of this is offering
45 -- a first grouping option to user and get some
46 -- enriched data to better learn and improve that algo
47 ngramsGroup :: Lang -> Int -> Int -> Text -> Text
48 ngramsGroup l _m _n = Text.intercalate " "
49 . map (stem l)
50 -- . take n
51 . List.sort
52 -- . (List.filter (\t -> Text.length t > m))
53 . Text.splitOn " "
54 . Text.replace "-" " "
55
56
57 sortTficf :: (Map Text (Double, Set Text))
58 -> [ (Text,(Double, Set Text))]
59 sortTficf = List.sortOn (fst . snd) . toList
60
61
62 getTficf' :: UserCorpusId -> MasterCorpusId -> NgramsType -> (Text -> Text)
63 -> Cmd err (Map Text (Double, Set Text))
64 getTficf' u m nt f = do
65 u' <- getNodesByNgramsUser u nt
66 m' <- getNodesByNgramsMaster u m
67
68 pure $ toTficfData (countNodesByNgramsWith f u')
69 (countNodesByNgramsWith f m')
70
71 --{-
72 getTficfWith :: UserCorpusId -> MasterCorpusId
73 -> NgramsType -> Map Text (Maybe Text)
74 -> Cmd err (Map Text (Double, Set Text))
75 getTficfWith u m nt mtxt = do
76 u' <- getNodesByNgramsOnlyUser u nt (Map.keys mtxt)
77 m' <- getNodesByNgramsMaster u m
78
79 let f x = case Map.lookup x mtxt of
80 Nothing -> x
81 Just x' -> maybe x identity x'
82
83 pure $ toTficfData (countNodesByNgramsWith f u')
84 (countNodesByNgramsWith f m')
85 --}
86
87
88 type Context = (Double, Map Text (Double, Set Text))
89 type Supra = Context
90 type Infra = Context
91
92 toTficfData :: Infra -> Supra
93 -> Map Text (Double, Set Text)
94 toTficfData (ti, mi) (ts, ms) =
95 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
96 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
97 , ns
98 )
99 )
100 | (t, (n,ns)) <- toList mi
101 ]
102
103
104 -- | fst is size of Supra Corpus
105 -- snd is Texts and size of Occurrences (different docs)
106 countNodesByNgramsWith :: (Text -> Text)
107 -> Map Text (Set NodeId)
108 -> (Double, Map Text (Double, Set Text))
109 countNodesByNgramsWith f m = (total, m')
110 where
111 total = fromIntegral $ Set.size $ Set.unions $ elems m
112 m' = Map.map ( swap . second (fromIntegral . Set.size))
113 $ groupNodesByNgramsWith f m
114
115
116 groupNodesByNgramsWith :: (Text -> Text)
117 -> Map Text (Set NodeId)
118 -> Map Text (Set Text, Set NodeId)
119 groupNodesByNgramsWith f m =
120 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
121 $ toList m
122
123 ------------------------------------------------------------------------
124 getNodesByNgramsUser :: CorpusId -> NgramsType
125 -> Cmd err (Map Text (Set NodeId))
126 getNodesByNgramsUser cId nt =
127 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
128 <$> selectNgramsByNodeUser cId nt
129
130 selectNgramsByNodeUser :: CorpusId -> NgramsType
131 -> Cmd err [(NodeId, Text)]
132 selectNgramsByNodeUser cId nt =
133 runPGSQuery queryNgramsByNodeUser
134 ( cId
135 , nodeTypeId NodeDocument
136 , ngramsTypeId nt
137 , 1000 :: Int -- limit
138 , 0 :: Int -- offset
139 )
140
141 queryNgramsByNodeUser :: DPS.Query
142 queryNgramsByNodeUser = [sql|
143
144 SELECT nng.node_id, ng.terms FROM nodes_ngrams nng
145 JOIN ngrams ng ON nng.ngrams_id = ng.id
146 JOIN nodes_nodes nn ON nn.node2_id = nng.node_id
147 JOIN nodes n ON nn.node2_id = n.id
148 WHERE nn.node1_id = ? -- CorpusId
149 AND n.typename = ? -- NodeTypeId
150 AND nng.ngrams_type = ? -- NgramsTypeId
151 AND nn.delete = False
152 GROUP BY nng.node_id, ng.terms
153 ORDER BY (nng.node_id, ng.terms) DESC
154 LIMIT ?
155 OFFSET ?
156 |]
157 ------------------------------------------------------------------------
158 -- TODO add groups
159 getOccByNgramsOnlyFast :: CorpusId -> NgramsType -> [Text]
160 -> Cmd err (Map Text Int)
161 getOccByNgramsOnlyFast cId nt ngs =
162 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
163
164 -- just slower than getOccByNgramsOnlyFast
165 getOccByNgramsOnlySlow :: CorpusId -> NgramsType -> [Text]
166 -> Cmd err (Map Text Int)
167 getOccByNgramsOnlySlow cId nt ngs =
168 Map.map Set.size <$> getNodesByNgramsOnlyUser cId nt ngs
169
170 getOccByNgramsOnlySafe :: CorpusId -> NgramsType -> [Text]
171 -> Cmd err (Map Text Int)
172 getOccByNgramsOnlySafe cId nt ngs = do
173 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
174 fast <- getOccByNgramsOnlyFast cId nt ngs
175 slow <- getOccByNgramsOnlySlow cId nt ngs
176 when (fast /= slow) $
177 printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
178 pure slow
179
180
181 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
182 -> Cmd err [(Text, Int)]
183 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
184 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
185 ( Values fields (DPS.Only <$> tms)
186 , cId
187 , nodeTypeId NodeDocument
188 , ngramsTypeId nt
189 )
190 where
191 fields = [QualifiedIdentifier Nothing "text"]
192
193 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
194 -- Question: with the grouping is the result exactly the same (since Set NodeId for
195 -- equivalent ngrams intersections are not empty)
196 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
197 queryNgramsOccurrencesOnlyByNodeUser = [sql|
198
199 WITH input_rows(terms) AS (?)
200 SELECT ng.terms, COUNT(nng.node_id) FROM nodes_ngrams nng
201 JOIN ngrams ng ON nng.ngrams_id = ng.id
202 JOIN input_rows ir ON ir.terms = ng.terms
203 JOIN nodes_nodes nn ON nn.node2_id = nng.node_id
204 JOIN nodes n ON nn.node2_id = n.id
205 WHERE nn.node1_id = ? -- CorpusId
206 AND n.typename = ? -- NodeTypeId
207 AND nng.ngrams_type = ? -- NgramsTypeId
208 AND nn.delete = False
209 GROUP BY nng.node_id, ng.terms
210 |]
211
212 getNodesByNgramsOnlyUser :: CorpusId -> NgramsType -> [Text]
213 -> Cmd err (Map Text (Set NodeId))
214 getNodesByNgramsOnlyUser cId nt ngs = Map.unionsWith (<>)
215 . map (fromListWith (<>) . map (second Set.singleton))
216 <$> mapM (selectNgramsOnlyByNodeUser cId nt) (splitEvery 1000 ngs)
217
218 selectNgramsOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
219 -> Cmd err [(Text, NodeId)]
220 selectNgramsOnlyByNodeUser cId nt tms =
221 runPGSQuery queryNgramsOnlyByNodeUser
222 ( Values fields (DPS.Only <$> tms)
223 , cId
224 , nodeTypeId NodeDocument
225 , ngramsTypeId nt
226 )
227 where
228 fields = [QualifiedIdentifier Nothing "text"]
229
230 queryNgramsOnlyByNodeUser :: DPS.Query
231 queryNgramsOnlyByNodeUser = [sql|
232
233 WITH input_rows(terms) AS (?)
234 SELECT ng.terms, nng.node_id FROM nodes_ngrams nng
235 JOIN ngrams ng ON nng.ngrams_id = ng.id
236 JOIN input_rows ir ON ir.terms = ng.terms
237 JOIN nodes_nodes nn ON nn.node2_id = nng.node_id
238 JOIN nodes n ON nn.node2_id = n.id
239 WHERE nn.node1_id = ? -- CorpusId
240 AND n.typename = ? -- NodeTypeId
241 AND nng.ngrams_type = ? -- NgramsTypeId
242 AND nn.delete = False
243 GROUP BY nng.node_id, ng.terms
244 |]
245
246 ------------------------------------------------------------------------
247 -- | TODO filter by language, database, any social field
248 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
249 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
250 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
251 -- . takeWhile (not . List.null)
252 -- . takeWhile (\l -> List.length l > 3)
253 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
254
255
256
257 type Limit = Int
258 type Offset = Int
259
260 selectNgramsByNodeMaster :: Int -> UserCorpusId -> MasterCorpusId -> Int -> Cmd err [(NodeId, Text)]
261 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
262 queryNgramsByNodeMaster'
263 ( ucId
264 , ngramsTypeId NgramsTerms
265 , nodeTypeId NodeDocument
266 , p
267 , nodeTypeId NodeDocument
268 , p
269 , n
270 , mcId
271 , nodeTypeId NodeDocument
272 , ngramsTypeId NgramsTerms
273 )
274
275 queryNgramsByNodeMaster' :: DPS.Query
276 queryNgramsByNodeMaster' = [sql|
277
278 WITH nodesByNgramsUser AS (
279
280 SELECT n.id, ng.terms FROM nodes n
281 JOIN nodes_nodes nn ON n.id = nn.node2_id
282 JOIN nodes_ngrams nng ON nn.node2_id = n.id
283 JOIN ngrams ng ON nng.ngrams_id = ng.id
284 WHERE nn.node1_id = ? -- UserCorpusId
285 -- AND n.typename = ? -- NodeTypeId
286 AND nng.ngrams_type = ? -- NgramsTypeId
287 AND nn.delete = False
288 AND node_pos(n.id,?) >= ?
289 AND node_pos(n.id,?) < ?
290 GROUP BY n.id, ng.terms
291
292 ),
293
294 nodesByNgramsMaster AS (
295
296 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
297 JOIN nodes_ngrams nng ON n.id = nng.node_id
298 JOIN ngrams ng ON ng.id = nng.ngrams_id
299
300 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
301 AND n.typename = ? -- NodeTypeId
302 AND nng.ngrams_type = ? -- NgramsTypeId
303 GROUP BY n.id, ng.terms
304 )
305
306 SELECT m.id, m.terms FROM nodesByNgramsMaster m
307 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
308 |]
309
310