]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[FEAT][Text][Metrics] TFICF.
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Schema.Ngrams where
27
28 import Control.Lens (makeLenses, view, over)
29 import Control.Monad (mzero)
30 import Data.ByteString.Internal (ByteString)
31 import Data.Map (Map, fromList, lookup, fromListWith)
32 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
33 import Data.Set (Set)
34 import Data.Text (Text, splitOn)
35 import Database.PostgreSQL.Simple ((:.)(..))
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField, ToField)
39 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
40 import Database.PostgreSQL.Simple.ToRow (toRow)
41 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
42 import Debug.Trace (trace)
43 import GHC.Generics (Generic)
44 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
45 import Gargantext.Database.Config (nodeTypeId,userMaster)
46 import Gargantext.Database.Root (getRoot)
47 import Gargantext.Database.Types.Node (NodeType)
48 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
49 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
50 import Gargantext.Prelude
51 import Opaleye hiding (FromField)
52 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
53 import qualified Data.Set as DS
54 import qualified Database.PostgreSQL.Simple as PGS
55
56
57 type NgramsTerms = Text
58 type NgramsId = Int
59 type Size = Int
60
61 --{-
62 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
63 , ngrams_terms :: terms
64 , ngrams_n :: n
65 } deriving (Show)
66
67 --}
68 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
69 (Column PGText)
70 (Column PGInt4)
71
72 type NgramsRead = NgramsPoly (Column PGInt4)
73 (Column PGText)
74 (Column PGInt4)
75
76 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
77 (Column (Nullable PGText))
78 (Column (Nullable PGInt4))
79
80 --{-
81 type NgramsDb = NgramsPoly Int Text Int
82
83 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
84 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
85
86 ngramsTable :: Table NgramsWrite NgramsRead
87 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
88 , ngrams_terms = required "terms"
89 , ngrams_n = required "n"
90 }
91 )
92 --{-
93 queryNgramsTable :: Query NgramsRead
94 queryNgramsTable = queryTable ngramsTable
95
96 dbGetNgramsDb :: Cmd err [NgramsDb]
97 dbGetNgramsDb = runOpaQuery queryNgramsTable
98 --}
99
100 -- | Main Ngrams Types
101 -- | Typed Ngrams
102 -- Typed Ngrams localize the context of the ngrams
103 -- ngrams in source field of document has Sources Type
104 -- ngrams in authors field of document has Authors Type
105 -- ngrams in text (title or abstract) of documents has Terms Type
106 data NgramsType = Authors | Institutes | Sources | NgramsTerms
107 deriving (Eq, Show, Ord, Enum, Bounded)
108
109 newtype NgramsTypeId = NgramsTypeId Int
110 deriving (Eq, Show, Ord, Num)
111
112 instance ToField NgramsTypeId where
113 toField (NgramsTypeId n) = toField n
114
115 instance FromField NgramsTypeId where
116 fromField fld mdata = do
117 n <- fromField fld mdata
118 if (n :: Int) > 0 then return $ NgramsTypeId n
119 else mzero
120
121 pgNgramsType :: NgramsType -> Column PGInt4
122 pgNgramsType = pgNgramsTypeId . ngramsTypeId
123
124 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
125 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
126
127 ngramsTypeId :: NgramsType -> NgramsTypeId
128 ngramsTypeId Authors = 1
129 ngramsTypeId Institutes = 2
130 ngramsTypeId Sources = 3
131 ngramsTypeId NgramsTerms = 4
132
133 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
134 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
135
136 ------------------------------------------------------------------------
137 -- | TODO put it in Gargantext.Text.Ngrams
138 data Ngrams = Ngrams { _ngramsTerms :: Text
139 , _ngramsSize :: Int
140 } deriving (Generic, Show, Eq, Ord)
141
142 makeLenses ''Ngrams
143 instance PGS.ToRow Ngrams where
144 toRow (Ngrams t s) = [toField t, toField s]
145
146 text2ngrams :: Text -> Ngrams
147 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
148
149 -------------------------------------------------------------------------
150 -- | TODO put it in Gargantext.Text.Ngrams
151 -- Named entity are typed ngrams of Terms Ngrams
152 data NgramsT a =
153 NgramsT { _ngramsType :: NgramsType
154 , _ngramsT :: a
155 } deriving (Generic, Show, Eq, Ord)
156
157 makeLenses ''NgramsT
158
159 instance Functor NgramsT where
160 fmap = over ngramsT
161 -----------------------------------------------------------------------
162 data NgramsIndexed =
163 NgramsIndexed
164 { _ngrams :: Ngrams
165 , _ngramsId :: NgramsId
166 } deriving (Show, Generic, Eq, Ord)
167
168 makeLenses ''NgramsIndexed
169 ------------------------------------------------------------------------
170 data NgramIds =
171 NgramIds
172 { ngramId :: Int
173 , ngramTerms :: Text
174 } deriving (Show, Generic, Eq, Ord)
175
176 instance PGS.FromRow NgramIds where
177 fromRow = NgramIds <$> field <*> field
178
179 ----------------------
180 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
181 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
182
183 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
184 indexNgramsT = fmap . indexNgramsWith . withMap
185
186 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
187 indexNgrams = indexNgramsWith . withMap
188
189 -- NP: not sure we need it anymore
190 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
191 indexNgramsTWith = fmap . indexNgramsWith
192
193 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
194 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
195
196 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
197 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
198
199 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
200 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
201 where
202 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
203
204 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
205 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
206 where
207 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
208
209 ----------------------
210 queryInsertNgrams :: PGS.Query
211 queryInsertNgrams = [sql|
212 WITH input_rows(terms,n) AS (?)
213 , ins AS (
214 INSERT INTO ngrams (terms,n)
215 SELECT * FROM input_rows
216 ON CONFLICT (terms) DO NOTHING -- unique index created here
217 RETURNING id,terms
218 )
219
220 SELECT id, terms
221 FROM ins
222 UNION ALL
223 SELECT c.id, terms
224 FROM input_rows
225 JOIN ngrams c USING (terms); -- columns of unique index
226 |]
227
228
229 -- | Ngrams Table
230 -- TODO: the way we are getting main Master Corpus and List ID is not clean
231 -- TODO: if ids are not present -> create
232 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
233 getNgramsTableDb :: NodeType -> NgramsType
234 -> NgramsTableParamUser
235 -> Limit -> Offset
236 -> Cmd err [NgramsTableData]
237 getNgramsTableDb nt ngrt ntp limit_ offset_ = do
238
239
240 maybeRoot <- head <$> getRoot userMaster
241 let path = "Garg.Db.Ngrams.getTableNgrams: "
242 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
243 -- let errMess = panic "Error"
244
245 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
246
247 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
248
249 getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
250
251 data NgramsTableParam =
252 NgramsTableParam { _nt_listId :: NodeId
253 , _nt_corpusId :: NodeId
254 }
255
256 type NgramsTableParamUser = NgramsTableParam
257 type NgramsTableParamMaster = NgramsTableParam
258
259
260 data NgramsTableData = NgramsTableData { _ntd_id :: Int
261 , _ntd_parent_id :: Maybe Int
262 , _ntd_terms :: Text
263 , _ntd_n :: Int
264 , _ntd_listType :: Maybe ListType
265 , _ntd_weight :: Double
266 } deriving (Show)
267
268
269
270 getNgramsTableData :: NodeType -> NgramsType
271 -> NgramsTableParamUser -> NgramsTableParamMaster
272 -> Limit -> Offset
273 -> Cmd err [NgramsTableData]
274 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
275 trace ("Ngrams table params: " <> show params) <$>
276 map (\(i,p,t,n,lt,w) -> NgramsTableData i p t n (fromListTypeId lt) w) <$>
277 runPGSQuery querySelectTableNgramsTrees params
278 where
279 nodeTId = nodeTypeId nodeT
280 ngrmTId = ngramsTypeId ngrmT
281 params = (ul,ml,uc,mc,nodeTId,ngrmTId) :. (limit_, offset_)
282
283 getNgramsTableDataDebug :: PGS.ToRow a => a -> Cmd err ByteString
284 getNgramsTableDataDebug = formatPGSQuery querySelectTableNgramsTrees
285
286
287 querySelectTableNgrams :: PGS.Query
288 querySelectTableNgrams = [sql|
289
290 WITH tableUser AS (
291 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
292 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
293 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
294 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
295 JOIN nodes n ON n.id = corp.node_id
296
297 WHERE list.node_id = ? -- User listId
298 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
299 AND n.typename = ? -- both type of childs (Documents or Contacts)
300 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
301 AND list.parent_id IS NULL
302 )
303 , tableMaster AS (
304 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
305 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
306 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
307 JOIN nodes n ON n.id = corp.node_id
308 JOIN nodes_nodes nn ON nn.node2_id = n.id
309
310 WHERE list.node_id = ? -- Master listId
311 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
312 AND n.typename = ? -- Master childs (Documents or Contacts)
313 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
314 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
315 AND list.parent_id IS NULL
316 )
317
318 SELECT COALESCE(tu.terms,tm.terms) AS terms
319 , COALESCE(tu.n,tm.n) AS n
320 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
321 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
322 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
323 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
324 ORDER BY 1,2
325 LIMIT ?
326 OFFSET ?;
327
328 |]
329
330
331 querySelectTableNgramsTrees :: PGS.Query
332 querySelectTableNgramsTrees = [sql|
333
334 -- DROP FUNCTION tree_start(integer,integer,integer,integer,integer,integer,integer,integer);
335 -- DROP FUNCTION tree_end(integer,integer,integer,integer,integer,integer);
336 -- DROP FUNCTION tree_ngrams(integer,integer,integer,integer,integer,integer,integer,integer);
337
338 CREATE OR REPLACE FUNCTION public.tree_start(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
339 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
340 BEGIN
341 RETURN QUERY
342 WITH tableUser AS (
343 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
344 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
345 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
346 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
347 JOIN nodes n ON n.id = corp.node_id
348
349 WHERE list.node_id = luid -- User listId
350 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
351 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
352 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
353 AND list.parent_id IS NULL
354 ),
355 tableMaster AS (
356 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
357 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
358 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
359 JOIN nodes n ON n.id = corp.node_id
360 JOIN nodes_nodes nn ON nn.node2_id = n.id
361
362 WHERE list.node_id = lmid -- Master listId
363 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
364 AND n.typename = tdoc -- Master childs (Documents or Contacts)
365 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
366 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
367 AND list.parent_id IS NULL
368 )
369
370 SELECT COALESCE(tu.id,tm.id) AS id
371 , COALESCE(tu.parent_id,tm.parent_id) AS parent_id
372 , COALESCE(tu.terms,tm.terms) AS terms
373 , COALESCE(tu.n,tm.n) AS n
374 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
375 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
376 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
377 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
378 ORDER BY 3
379 LIMIT lmt
380 OFFSET ofst
381 ;
382 END $$
383 LANGUAGE plpgsql ;
384
385 CREATE OR REPLACE FUNCTION public.tree_end(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT)
386 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
387 BEGIN
388 RETURN QUERY
389 WITH tableUser2 AS (
390 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
391 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
392 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
393 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
394 JOIN nodes n ON n.id = corp.node_id
395
396 WHERE list.node_id = luid -- User listId
397 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
398 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
399 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
400 )
401 , tableMaster2 AS (
402 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
403 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
404 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
405 JOIN nodes n ON n.id = corp.node_id
406 JOIN nodes_nodes nn ON nn.node2_id = n.id
407
408 WHERE list.node_id = lmid -- Master listId
409 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
410 AND n.typename = tdoc -- Master childs (Documents or Contacts)
411 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
412 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
413 )
414 SELECT COALESCE(tu.id,tm.id) as id
415 , COALESCE(tu.parent_id,tm.parent_id) as parent_id
416 , COALESCE(tu.terms,tm.terms) AS terms
417 , COALESCE(tu.n,tm.n) AS n
418 , COALESCE(tu.list_type,tm.list_type) AS list_type
419 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
420 FROM tableUser2 tu RIGHT JOIN tableMaster2 tm ON tu.terms = tm.terms
421 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
422 ;
423 END $$
424 LANGUAGE plpgsql ;
425
426
427 CREATE OR REPLACE FUNCTION public.tree_ngrams(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
428 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
429 BEGIN
430 RETURN QUERY WITH RECURSIVE
431 ngrams_tree (id,parent_id,terms,n,list_type,weight) AS (
432 SELECT ts.id,ts.parent_id,ts.terms,ts.n,ts.list_type,ts.weight FROM tree_start($1,$2,$3,$4,$5,$6,$7,$8) ts
433 UNION
434 SELECT te.id,te.parent_id,te.terms,te.n,te.list_type,te.weight FROM tree_end($1,$2,$3,$4,$5,$6) as te
435 INNER JOIN ngrams_tree ON te.parent_id = ngrams_tree.id
436 )
437 SELECT * from ngrams_tree;
438 END $$
439 LANGUAGE plpgsql ;
440
441 select * from tree_ngrams(?,?,?,?,?,?,?,?)
442
443 |]
444
445
446
447 type ListIdUser = NodeId
448 type ListIdMaster = NodeId
449
450 type MapToChildren = Map Text (Set Text)
451 type MapToParent = Map Text Text
452
453 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
454 getNgramsGroup lu lm = do
455 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
456 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
457 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
458 pure (mapParent, mapChildren)
459
460 querySelectNgramsGroup :: PGS.Query
461 querySelectNgramsGroup = [sql|
462 WITH groupUser AS (
463 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
464 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
465 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
466 WHERE
467 nnn.node_id = ? -- User listId
468 ),
469 groupMaster AS (
470 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
471 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
472 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
473 WHERE
474 nnn.node_id = ? -- Master listId
475 )
476 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
477 , COALESCE(gu.t2,gm.t2) AS ngram2_id
478 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1
479 |]
480