2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Schema.Ngrams where
28 import Control.Lens (makeLenses, view, over)
29 import Control.Monad (mzero)
30 import Data.ByteString.Internal (ByteString)
31 import Data.Map (Map, fromList, lookup, fromListWith)
32 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
34 import Data.Text (Text, splitOn)
35 import Database.PostgreSQL.Simple ((:.)(..))
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField, ToField)
39 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
40 import Database.PostgreSQL.Simple.ToRow (toRow)
41 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
42 import Debug.Trace (trace)
43 import GHC.Generics (Generic)
44 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
45 import Gargantext.Database.Config (nodeTypeId,userMaster)
46 import Gargantext.Database.Root (getRoot)
47 import Gargantext.Database.Types.Node (NodeType)
48 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
49 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
50 import Gargantext.Prelude
51 import Opaleye hiding (FromField)
52 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
53 import qualified Data.Set as DS
54 import qualified Database.PostgreSQL.Simple as PGS
57 type NgramsTerms = Text
62 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
63 , ngrams_terms :: terms
68 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
72 type NgramsRead = NgramsPoly (Column PGInt4)
76 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
77 (Column (Nullable PGText))
78 (Column (Nullable PGInt4))
81 type NgramsDb = NgramsPoly Int Text Int
83 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
84 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
86 ngramsTable :: Table NgramsWrite NgramsRead
87 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
88 , ngrams_terms = required "terms"
89 , ngrams_n = required "n"
93 queryNgramsTable :: Query NgramsRead
94 queryNgramsTable = queryTable ngramsTable
96 dbGetNgramsDb :: Cmd err [NgramsDb]
97 dbGetNgramsDb = runOpaQuery queryNgramsTable
100 -- | Main Ngrams Types
102 -- Typed Ngrams localize the context of the ngrams
103 -- ngrams in source field of document has Sources Type
104 -- ngrams in authors field of document has Authors Type
105 -- ngrams in text (title or abstract) of documents has Terms Type
106 data NgramsType = Authors | Institutes | Sources | NgramsTerms
107 deriving (Eq, Show, Ord, Enum, Bounded)
109 newtype NgramsTypeId = NgramsTypeId Int
110 deriving (Eq, Show, Ord, Num)
112 instance ToField NgramsTypeId where
113 toField (NgramsTypeId n) = toField n
115 instance FromField NgramsTypeId where
116 fromField fld mdata = do
117 n <- fromField fld mdata
118 if (n :: Int) > 0 then return $ NgramsTypeId n
121 pgNgramsType :: NgramsType -> Column PGInt4
122 pgNgramsType = pgNgramsTypeId . ngramsTypeId
124 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
125 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
127 ngramsTypeId :: NgramsType -> NgramsTypeId
128 ngramsTypeId Authors = 1
129 ngramsTypeId Institutes = 2
130 ngramsTypeId Sources = 3
131 ngramsTypeId NgramsTerms = 4
133 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
134 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
136 ------------------------------------------------------------------------
137 -- | TODO put it in Gargantext.Text.Ngrams
138 data Ngrams = Ngrams { _ngramsTerms :: Text
140 } deriving (Generic, Show, Eq, Ord)
143 instance PGS.ToRow Ngrams where
144 toRow (Ngrams t s) = [toField t, toField s]
146 text2ngrams :: Text -> Ngrams
147 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
149 -------------------------------------------------------------------------
150 -- | TODO put it in Gargantext.Text.Ngrams
151 -- Named entity are typed ngrams of Terms Ngrams
153 NgramsT { _ngramsType :: NgramsType
155 } deriving (Generic, Show, Eq, Ord)
159 instance Functor NgramsT where
161 -----------------------------------------------------------------------
165 , _ngramsId :: NgramsId
166 } deriving (Show, Generic, Eq, Ord)
168 makeLenses ''NgramsIndexed
169 ------------------------------------------------------------------------
174 } deriving (Show, Generic, Eq, Ord)
176 instance PGS.FromRow NgramIds where
177 fromRow = NgramIds <$> field <*> field
179 ----------------------
180 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
181 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
183 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
184 indexNgramsT = fmap . indexNgramsWith . withMap
186 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
187 indexNgrams = indexNgramsWith . withMap
189 -- NP: not sure we need it anymore
190 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
191 indexNgramsTWith = fmap . indexNgramsWith
193 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
194 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
196 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
197 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
198 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
200 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
201 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
202 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
204 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
206 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
207 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
209 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
211 ----------------------
212 queryInsertNgrams :: PGS.Query
213 queryInsertNgrams = [sql|
214 WITH input_rows(terms,n) AS (?)
216 INSERT INTO ngrams (terms,n)
217 SELECT * FROM input_rows
218 ON CONFLICT (terms) DO NOTHING -- unique index created here
227 JOIN ngrams c USING (terms); -- columns of unique index
232 -- TODO: the way we are getting main Master Corpus and List ID is not clean
233 -- TODO: if ids are not present -> create
234 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
235 getNgramsTableDb :: NodeType -> NgramsType
236 -> NgramsTableParamUser
238 -> Cmd err [NgramsTableData]
239 getNgramsTableDb nt ngrt ntp limit_ offset_ = do
242 maybeRoot <- head <$> getRoot userMaster
243 let path = "Garg.Db.Ngrams.getTableNgrams: "
244 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
245 -- let errMess = panic "Error"
247 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
249 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
251 getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
253 data NgramsTableParam =
254 NgramsTableParam { _nt_listId :: NodeId
255 , _nt_corpusId :: NodeId
258 type NgramsTableParamUser = NgramsTableParam
259 type NgramsTableParamMaster = NgramsTableParam
262 data NgramsTableData = NgramsTableData { _ntd_id :: Int
263 , _ntd_parent_id :: Maybe Int
266 , _ntd_listType :: Maybe ListType
267 , _ntd_weight :: Double
272 getNgramsTableData :: NodeType -> NgramsType
273 -> NgramsTableParamUser -> NgramsTableParamMaster
275 -> Cmd err [NgramsTableData]
276 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
277 trace ("Ngrams table params: " <> show params) <$>
278 map (\(i,p,t,n,lt,w) -> NgramsTableData i p t n (fromListTypeId lt) w) <$>
279 runPGSQuery querySelectTableNgramsTrees params
281 nodeTId = nodeTypeId nodeT
282 ngrmTId = ngramsTypeId ngrmT
283 params = (ul,ml,uc,mc,nodeTId,ngrmTId) :. (limit_, offset_)
285 getNgramsTableDataDebug :: PGS.ToRow a => a -> Cmd err ByteString
286 getNgramsTableDataDebug = formatPGSQuery querySelectTableNgramsTrees
289 querySelectTableNgrams :: PGS.Query
290 querySelectTableNgrams = [sql|
293 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
294 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
295 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
296 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
297 JOIN nodes n ON n.id = corp.node_id
299 WHERE list.node_id = ? -- User listId
300 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
301 AND n.typename = ? -- both type of childs (Documents or Contacts)
302 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
303 AND list.parent_id IS NULL
306 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
307 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
308 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
309 JOIN nodes n ON n.id = corp.node_id
310 JOIN nodes_nodes nn ON nn.node2_id = n.id
312 WHERE list.node_id = ? -- Master listId
313 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
314 AND n.typename = ? -- Master childs (Documents or Contacts)
315 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
316 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
317 AND list.parent_id IS NULL
320 SELECT COALESCE(tu.terms,tm.terms) AS terms
321 , COALESCE(tu.n,tm.n) AS n
322 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
323 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
324 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
325 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
333 querySelectTableNgramsTrees :: PGS.Query
334 querySelectTableNgramsTrees = [sql|
336 -- DROP FUNCTION tree_start(integer,integer,integer,integer,integer,integer,integer,integer);
337 -- DROP FUNCTION tree_end(integer,integer,integer,integer,integer,integer);
338 -- DROP FUNCTION tree_ngrams(integer,integer,integer,integer,integer,integer,integer,integer);
340 CREATE OR REPLACE FUNCTION public.tree_start(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
341 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
345 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
346 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
347 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
348 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
349 JOIN nodes n ON n.id = corp.node_id
351 WHERE list.node_id = luid -- User listId
352 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
353 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
354 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
355 AND list.parent_id IS NULL
358 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
359 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
360 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
361 JOIN nodes n ON n.id = corp.node_id
362 JOIN nodes_nodes nn ON nn.node2_id = n.id
364 WHERE list.node_id = lmid -- Master listId
365 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
366 AND n.typename = tdoc -- Master childs (Documents or Contacts)
367 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
368 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
369 AND list.parent_id IS NULL
372 SELECT COALESCE(tu.id,tm.id) AS id
373 , COALESCE(tu.parent_id,tm.parent_id) AS parent_id
374 , COALESCE(tu.terms,tm.terms) AS terms
375 , COALESCE(tu.n,tm.n) AS n
376 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
377 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
378 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
379 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
387 CREATE OR REPLACE FUNCTION public.tree_end(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT)
388 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
392 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
393 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
394 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
395 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
396 JOIN nodes n ON n.id = corp.node_id
398 WHERE list.node_id = luid -- User listId
399 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
400 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
401 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
404 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
405 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
406 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
407 JOIN nodes n ON n.id = corp.node_id
408 JOIN nodes_nodes nn ON nn.node2_id = n.id
410 WHERE list.node_id = lmid -- Master listId
411 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
412 AND n.typename = tdoc -- Master childs (Documents or Contacts)
413 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
414 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
416 SELECT COALESCE(tu.id,tm.id) as id
417 , COALESCE(tu.parent_id,tm.parent_id) as parent_id
418 , COALESCE(tu.terms,tm.terms) AS terms
419 , COALESCE(tu.n,tm.n) AS n
420 , COALESCE(tu.list_type,tm.list_type) AS list_type
421 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
422 FROM tableUser2 tu RIGHT JOIN tableMaster2 tm ON tu.terms = tm.terms
423 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
429 CREATE OR REPLACE FUNCTION public.tree_ngrams(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
430 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
432 RETURN QUERY WITH RECURSIVE
433 ngrams_tree (id,parent_id,terms,n,list_type,weight) AS (
434 SELECT ts.id,ts.parent_id,ts.terms,ts.n,ts.list_type,ts.weight FROM tree_start($1,$2,$3,$4,$5,$6,$7,$8) ts
436 SELECT te.id,te.parent_id,te.terms,te.n,te.list_type,te.weight FROM tree_end($1,$2,$3,$4,$5,$6) as te
437 INNER JOIN ngrams_tree ON te.parent_id = ngrams_tree.id
439 SELECT * from ngrams_tree;
443 select * from tree_ngrams(?,?,?,?,?,?,?,?)
449 type ListIdUser = NodeId
450 type ListIdMaster = NodeId
452 type MapToChildren = Map Text (Set Text)
453 type MapToParent = Map Text Text
455 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
456 getNgramsGroup lu lm = do
457 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
458 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
459 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
460 pure (mapParent, mapChildren)
462 querySelectNgramsGroup :: PGS.Query
463 querySelectNgramsGroup = [sql|
465 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
466 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
467 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
469 nnn.node_id = ? -- User listId
472 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
473 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
474 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
476 nnn.node_id = ? -- Master listId
478 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
479 , COALESCE(gu.t2,gm.t2) AS ngram2_id
480 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1