2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Schema.Ngrams where
28 import Control.Lens (makeLenses, view, over)
29 import Control.Monad (mzero)
30 import Data.ByteString.Internal (ByteString)
31 import Data.Map (Map, fromList, lookup, fromListWith)
32 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
34 import Data.Text (Text, splitOn)
35 import Database.PostgreSQL.Simple ((:.)(..))
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField, ToField)
39 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
40 import Database.PostgreSQL.Simple.ToRow (toRow)
41 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
42 import Debug.Trace (trace)
43 import GHC.Generics (Generic)
44 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
45 import Gargantext.Database.Config (nodeTypeId,userMaster)
46 import Gargantext.Database.Root (getRoot)
47 import Gargantext.Database.Types.Node (NodeType)
48 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
49 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
50 import Gargantext.Prelude
51 import Opaleye hiding (FromField)
52 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
53 import qualified Data.Set as DS
54 import qualified Database.PostgreSQL.Simple as PGS
57 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
58 , ngrams_terms :: terms
63 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
67 type NgramsRead = NgramsPoly (Column PGInt4)
71 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
72 (Column (Nullable PGText))
73 (Column (Nullable PGInt4))
76 type NgramsDb = NgramsPoly Int Text Int
78 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
79 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
81 ngramsTable :: Table NgramsWrite NgramsRead
82 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
83 , ngrams_terms = required "terms"
84 , ngrams_n = required "n"
88 queryNgramsTable :: Query NgramsRead
89 queryNgramsTable = queryTable ngramsTable
91 dbGetNgramsDb :: Cmd err [NgramsDb]
92 dbGetNgramsDb = runOpaQuery queryNgramsTable
95 -- | Main Ngrams Types
97 -- Typed Ngrams localize the context of the ngrams
98 -- ngrams in source field of document has Sources Type
99 -- ngrams in authors field of document has Authors Type
100 -- ngrams in text (title or abstract) of documents has Terms Type
101 data NgramsType = Authors | Institutes | Sources | NgramsTerms
102 deriving (Eq, Show, Ord, Enum, Bounded)
104 newtype NgramsTypeId = NgramsTypeId Int
105 deriving (Eq, Show, Ord, Num)
107 instance ToField NgramsTypeId where
108 toField (NgramsTypeId n) = toField n
110 instance FromField NgramsTypeId where
111 fromField fld mdata = do
112 n <- fromField fld mdata
113 if (n :: Int) > 0 then return $ NgramsTypeId n
116 pgNgramsType :: NgramsType -> Column PGInt4
117 pgNgramsType = pgNgramsTypeId . ngramsTypeId
119 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
120 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
122 ngramsTypeId :: NgramsType -> NgramsTypeId
123 ngramsTypeId Authors = 1
124 ngramsTypeId Institutes = 2
125 ngramsTypeId Sources = 3
126 ngramsTypeId NgramsTerms = 4
128 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
129 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
131 type NgramsTerms = Text
135 ------------------------------------------------------------------------
136 -- | TODO put it in Gargantext.Text.Ngrams
137 data Ngrams = Ngrams { _ngramsTerms :: Text
139 } deriving (Generic, Show, Eq, Ord)
142 instance PGS.ToRow Ngrams where
143 toRow (Ngrams t s) = [toField t, toField s]
145 text2ngrams :: Text -> Ngrams
146 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
148 -------------------------------------------------------------------------
149 -- | TODO put it in Gargantext.Text.Ngrams
150 -- Named entity are typed ngrams of Terms Ngrams
152 NgramsT { _ngramsType :: NgramsType
154 } deriving (Generic, Show, Eq, Ord)
158 instance Functor NgramsT where
160 -----------------------------------------------------------------------
164 , _ngramsId :: NgramsId
165 } deriving (Show, Generic, Eq, Ord)
167 makeLenses ''NgramsIndexed
168 ------------------------------------------------------------------------
173 } deriving (Show, Generic, Eq, Ord)
175 instance PGS.FromRow NgramIds where
176 fromRow = NgramIds <$> field <*> field
178 ----------------------
179 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
180 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
182 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
183 indexNgramsT = fmap . indexNgramsWith . withMap
185 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
186 indexNgrams = indexNgramsWith . withMap
188 -- NP: not sure we need it anymore
189 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
190 indexNgramsTWith = fmap . indexNgramsWith
192 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
193 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
195 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
196 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
198 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
199 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
201 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
203 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
204 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
206 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
208 ----------------------
209 queryInsertNgrams :: PGS.Query
210 queryInsertNgrams = [sql|
211 WITH input_rows(terms,n) AS (?)
213 INSERT INTO ngrams (terms,n)
214 SELECT * FROM input_rows
215 ON CONFLICT (terms) DO NOTHING -- unique index created here
224 JOIN ngrams c USING (terms); -- columns of unique index
229 -- TODO: the way we are getting main Master Corpus and List ID is not clean
230 -- TODO: if ids are not present -> create
231 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
232 getNgramsTableDb :: NodeType -> NgramsType
233 -> NgramsTableParamUser
235 -> Cmd err [NgramsTableData']
236 getNgramsTableDb nt ngrt ntp limit_ offset_ = do
239 maybeRoot <- head <$> getRoot userMaster
240 let path = "Garg.Db.Ngrams.getTableNgrams: "
241 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
242 -- let errMess = panic "Error"
244 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
246 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
248 getNgramsTableData' nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
250 data NgramsTableParam =
251 NgramsTableParam { _nt_listId :: NodeId
252 , _nt_corpusId :: NodeId
255 type NgramsTableParamUser = NgramsTableParam
256 type NgramsTableParamMaster = NgramsTableParam
258 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
260 , _ntd_listType :: Maybe ListType
261 , _ntd_weight :: Double
264 getNgramsTableData :: NodeType -> NgramsType
265 -> NgramsTableParamUser -> NgramsTableParamMaster
267 -> Cmd err [NgramsTableData]
268 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
269 trace ("Ngrams table params" <> show params) <$>
270 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
271 runPGSQuery querySelectTableNgrams params
273 nodeTId = nodeTypeId nodeT
274 ngrmTId = ngramsTypeId ngrmT
275 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc) :.
279 data NgramsTableData' = NgramsTableData' { _ntd2_id :: Int
280 , _ntd2_parent_id :: Maybe Int
281 , _ntd2_terms :: Text
283 , _ntd2_listType :: Maybe ListType
284 , _ntd2_weight :: Double
289 getNgramsTableData' :: NodeType -> NgramsType
290 -> NgramsTableParamUser -> NgramsTableParamMaster
292 -> Cmd err [NgramsTableData']
293 getNgramsTableData' nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
294 trace ("Ngrams table params: " <> show params) <$>
295 map (\(i,p,t,n,lt,w) -> NgramsTableData' i p t n (fromListTypeId lt) w) <$>
296 runPGSQuery querySelectTableNgramsTrees params
298 nodeTId = nodeTypeId nodeT
299 ngrmTId = ngramsTypeId ngrmT
300 params = (ul,ml,uc,mc,nodeTId,ngrmTId) :. (limit_, offset_)
302 getNgramsTableDataDebug :: PGS.ToRow a => a -> Cmd err ByteString
303 getNgramsTableDataDebug = formatPGSQuery querySelectTableNgramsTrees
306 querySelectTableNgrams :: PGS.Query
307 querySelectTableNgrams = [sql|
310 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
311 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
312 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
313 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
314 JOIN nodes n ON n.id = corp.node_id
316 WHERE list.node_id = ? -- User listId
317 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
318 AND n.typename = ? -- both type of childs (Documents or Contacts)
319 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
320 AND list.parent_id IS NULL
323 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
324 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
325 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
326 JOIN nodes n ON n.id = corp.node_id
327 JOIN nodes_nodes nn ON nn.node2_id = n.id
329 WHERE list.node_id = ? -- Master listId
330 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
331 AND n.typename = ? -- Master childs (Documents or Contacts)
332 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
333 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
334 AND list.parent_id IS NULL
337 SELECT COALESCE(tu.terms,tm.terms) AS terms
338 , COALESCE(tu.n,tm.n) AS n
339 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
340 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
341 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
342 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
350 querySelectTableNgramsTrees :: PGS.Query
351 querySelectTableNgramsTrees = [sql|
353 -- DROP FUNCTION tree_start(integer,integer,integer,integer,integer,integer,integer,integer);
354 -- DROP FUNCTION tree_end(integer,integer,integer,integer,integer,integer);
355 -- DROP FUNCTION tree_ngrams(integer,integer,integer,integer,integer,integer,integer,integer);
357 CREATE OR REPLACE FUNCTION public.tree_start(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
358 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
362 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
363 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
364 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
365 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
366 JOIN nodes n ON n.id = corp.node_id
368 WHERE list.node_id = luid -- User listId
369 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
370 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
371 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
372 AND list.parent_id IS NULL
375 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
376 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
377 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
378 JOIN nodes n ON n.id = corp.node_id
379 JOIN nodes_nodes nn ON nn.node2_id = n.id
381 WHERE list.node_id = lmid -- Master listId
382 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
383 AND n.typename = tdoc -- Master childs (Documents or Contacts)
384 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
385 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
386 AND list.parent_id IS NULL
389 SELECT COALESCE(tu.id,tm.id) AS id
390 , COALESCE(tu.parent_id,tm.parent_id) AS parent_id
391 , COALESCE(tu.terms,tm.terms) AS terms
392 , COALESCE(tu.n,tm.n) AS n
393 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
394 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
395 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
396 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
404 CREATE OR REPLACE FUNCTION public.tree_end(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT)
405 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
409 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
410 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
411 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
412 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
413 JOIN nodes n ON n.id = corp.node_id
415 WHERE list.node_id = luid -- User listId
416 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
417 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
418 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
421 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
422 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
423 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
424 JOIN nodes n ON n.id = corp.node_id
425 JOIN nodes_nodes nn ON nn.node2_id = n.id
427 WHERE list.node_id = lmid -- Master listId
428 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
429 AND n.typename = tdoc -- Master childs (Documents or Contacts)
430 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
431 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
433 SELECT COALESCE(tu.id,tm.id) as id
434 , COALESCE(tu.parent_id,tm.parent_id) as parent_id
435 , COALESCE(tu.terms,tm.terms) AS terms
436 , COALESCE(tu.n,tm.n) AS n
437 , COALESCE(tu.list_type,tm.list_type) AS list_type
438 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
439 FROM tableUser2 tu RIGHT JOIN tableMaster2 tm ON tu.terms = tm.terms
440 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
446 CREATE OR REPLACE FUNCTION public.tree_ngrams(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
447 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
449 RETURN QUERY WITH RECURSIVE
450 ngrams_tree (id,parent_id,terms,n,list_type,weight) AS (
451 SELECT ts.id,ts.parent_id,ts.terms,ts.n,ts.list_type,ts.weight FROM tree_start($1,$2,$3,$4,$5,$6,$7,$8) ts
453 SELECT te.id,te.parent_id,te.terms,te.n,te.list_type,te.weight FROM tree_end($1,$2,$3,$4,$5,$6) as te
454 INNER JOIN ngrams_tree ON te.parent_id = ngrams_tree.id
456 SELECT * from ngrams_tree;
460 select * from tree_ngrams(?,?,?,?,?,?,?,?)
466 type ListIdUser = NodeId
467 type ListIdMaster = NodeId
469 type MapToChildren = Map Text (Set Text)
470 type MapToParent = Map Text Text
472 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
473 getNgramsGroup lu lm = do
474 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
475 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
476 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
477 pure (mapParent, mapChildren)
479 querySelectNgramsGroup :: PGS.Query
480 querySelectNgramsGroup = [sql|
482 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
483 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
484 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
486 nnn.node_id = ? -- User listId
489 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
490 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
491 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
493 nnn.node_id = ? -- Master listId
495 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
496 , COALESCE(gu.t2,gm.t2) AS ngram2_id
497 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1