2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Schema.Ngrams where
28 import Control.Lens (makeLenses, view, over)
29 import Control.Monad (mzero)
30 import Data.ByteString.Internal (ByteString)
31 import Data.Map (Map, fromList, lookup, fromListWith)
32 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
34 import Data.Text (Text, splitOn)
35 import Database.PostgreSQL.Simple ((:.)(..))
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField, ToField)
39 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
40 import Database.PostgreSQL.Simple.ToRow (toRow)
41 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
42 import Debug.Trace (trace)
43 import GHC.Generics (Generic)
44 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
45 import Gargantext.Database.Config (nodeTypeId,userMaster)
46 import Gargantext.Database.Root (getRoot)
47 import Gargantext.Database.Types.Node (NodeType)
48 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
49 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
50 import Gargantext.Prelude
51 import Opaleye hiding (FromField)
52 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
53 import qualified Data.Set as DS
54 import qualified Database.PostgreSQL.Simple as PGS
57 type NgramsTerms = Text
61 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
62 , ngrams_terms :: terms
66 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
70 type NgramsRead = NgramsPoly (Column PGInt4)
74 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
75 (Column (Nullable PGText))
76 (Column (Nullable PGInt4))
78 type NgramsDb = NgramsPoly Int Text Int
80 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
81 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
83 ngramsTable :: Table NgramsWrite NgramsRead
84 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
85 , ngrams_terms = required "terms"
86 , ngrams_n = required "n"
90 queryNgramsTable :: Query NgramsRead
91 queryNgramsTable = queryTable ngramsTable
93 dbGetNgramsDb :: Cmd err [NgramsDb]
94 dbGetNgramsDb = runOpaQuery queryNgramsTable
96 -- | Main Ngrams Types
98 -- Typed Ngrams localize the context of the ngrams
99 -- ngrams in source field of document has Sources Type
100 -- ngrams in authors field of document has Authors Type
101 -- ngrams in text (title or abstract) of documents has Terms Type
102 data NgramsType = Authors | Institutes | Sources | NgramsTerms
103 deriving (Eq, Show, Ord, Enum, Bounded)
105 newtype NgramsTypeId = NgramsTypeId Int
106 deriving (Eq, Show, Ord, Num)
108 instance ToField NgramsTypeId where
109 toField (NgramsTypeId n) = toField n
111 instance FromField NgramsTypeId where
112 fromField fld mdata = do
113 n <- fromField fld mdata
114 if (n :: Int) > 0 then return $ NgramsTypeId n
117 pgNgramsType :: NgramsType -> Column PGInt4
118 pgNgramsType = pgNgramsTypeId . ngramsTypeId
120 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
121 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
123 ngramsTypeId :: NgramsType -> NgramsTypeId
124 ngramsTypeId Authors = 1
125 ngramsTypeId Institutes = 2
126 ngramsTypeId Sources = 3
127 ngramsTypeId NgramsTerms = 4
129 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
130 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
132 ------------------------------------------------------------------------
133 -- | TODO put it in Gargantext.Text.Ngrams
134 data Ngrams = Ngrams { _ngramsTerms :: Text
136 } deriving (Generic, Show, Eq, Ord)
139 instance PGS.ToRow Ngrams where
140 toRow (Ngrams t s) = [toField t, toField s]
142 text2ngrams :: Text -> Ngrams
143 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
145 -------------------------------------------------------------------------
146 -- | TODO put it in Gargantext.Text.Ngrams
147 -- Named entity are typed ngrams of Terms Ngrams
149 NgramsT { _ngramsType :: NgramsType
151 } deriving (Generic, Show, Eq, Ord)
155 instance Functor NgramsT where
157 -----------------------------------------------------------------------
161 , _ngramsId :: NgramsId
162 } deriving (Show, Generic, Eq, Ord)
164 makeLenses ''NgramsIndexed
165 ------------------------------------------------------------------------
170 } deriving (Show, Generic, Eq, Ord)
172 instance PGS.FromRow NgramIds where
173 fromRow = NgramIds <$> field <*> field
175 ----------------------
176 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
177 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
179 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
180 indexNgramsT = fmap . indexNgramsWith . withMap
182 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
183 indexNgrams = indexNgramsWith . withMap
185 -- NP: not sure we need it anymore
186 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
187 indexNgramsTWith = fmap . indexNgramsWith
189 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
190 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
192 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
193 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
194 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
196 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
197 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
198 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
200 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
202 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
203 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
205 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
207 ----------------------
208 queryInsertNgrams :: PGS.Query
209 queryInsertNgrams = [sql|
210 WITH input_rows(terms,n) AS (?)
212 INSERT INTO ngrams (terms,n)
213 SELECT * FROM input_rows
214 ON CONFLICT (terms) DO NOTHING -- unique index created here
223 JOIN ngrams c USING (terms); -- columns of unique index
228 -- TODO: the way we are getting main Master Corpus and List ID is not clean
229 -- TODO: if ids are not present -> create
230 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
231 getNgramsTableDb :: NodeType -> NgramsType
232 -> NgramsTableParamUser
234 -> Cmd err [NgramsTableData]
235 getNgramsTableDb nt ngrt ntp limit_ offset_ = do
238 maybeRoot <- head <$> getRoot userMaster
239 let path = "Garg.Db.Ngrams.getTableNgrams: "
240 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
241 -- let errMess = panic "Error"
243 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
245 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
247 getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
249 data NgramsTableParam =
250 NgramsTableParam { _nt_listId :: NodeId
251 , _nt_corpusId :: NodeId
254 type NgramsTableParamUser = NgramsTableParam
255 type NgramsTableParamMaster = NgramsTableParam
258 data NgramsTableData = NgramsTableData { _ntd_id :: Int
259 , _ntd_parent_id :: Maybe Int
262 , _ntd_listType :: Maybe ListType
263 , _ntd_weight :: Double
268 getNgramsTableData :: NodeType -> NgramsType
269 -> NgramsTableParamUser -> NgramsTableParamMaster
271 -> Cmd err [NgramsTableData]
272 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
273 trace ("Ngrams table params: " <> show params) <$>
274 map (\(i,p,t,n,lt,w) -> NgramsTableData i p t n (fromListTypeId lt) w) <$>
275 runPGSQuery querySelectTableNgramsTrees params
277 nodeTId = nodeTypeId nodeT
278 ngrmTId = ngramsTypeId ngrmT
279 params = (ul,ml,uc,mc,nodeTId,ngrmTId) :. (limit_, offset_)
281 getNgramsTableDataDebug :: PGS.ToRow a => a -> Cmd err ByteString
282 getNgramsTableDataDebug = formatPGSQuery querySelectTableNgramsTrees
285 querySelectTableNgrams :: PGS.Query
286 querySelectTableNgrams = [sql|
289 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
290 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
291 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
292 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
293 JOIN nodes n ON n.id = corp.node_id
295 WHERE list.node_id = ? -- User listId
296 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
297 AND n.typename = ? -- both type of childs (Documents or Contacts)
298 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
299 AND list.parent_id IS NULL
302 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
303 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
304 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
305 JOIN nodes n ON n.id = corp.node_id
306 JOIN nodes_nodes nn ON nn.node2_id = n.id
308 WHERE list.node_id = ? -- Master listId
309 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
310 AND n.typename = ? -- Master childs (Documents or Contacts)
311 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
312 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
313 AND list.parent_id IS NULL
316 SELECT COALESCE(tu.terms,tm.terms) AS terms
317 , COALESCE(tu.n,tm.n) AS n
318 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
319 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
320 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
321 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
329 querySelectTableNgramsTrees :: PGS.Query
330 querySelectTableNgramsTrees = [sql|
332 -- DROP FUNCTION tree_start(integer,integer,integer,integer,integer,integer,integer,integer);
333 -- DROP FUNCTION tree_end(integer,integer,integer,integer,integer,integer);
334 -- DROP FUNCTION tree_ngrams(integer,integer,integer,integer,integer,integer,integer,integer);
336 CREATE OR REPLACE FUNCTION public.tree_start(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
337 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
341 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
342 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
343 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
344 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
345 JOIN nodes n ON n.id = corp.node_id
347 WHERE list.node_id = luid -- User listId
348 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
349 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
350 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
351 AND list.parent_id IS NULL
354 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
355 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
356 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
357 JOIN nodes n ON n.id = corp.node_id
358 JOIN nodes_nodes nn ON nn.node2_id = n.id
360 WHERE list.node_id = lmid -- Master listId
361 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
362 AND n.typename = tdoc -- Master childs (Documents or Contacts)
363 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
364 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
365 AND list.parent_id IS NULL
368 SELECT COALESCE(tu.id,tm.id) AS id
369 , COALESCE(tu.parent_id,tm.parent_id) AS parent_id
370 , COALESCE(tu.terms,tm.terms) AS terms
371 , COALESCE(tu.n,tm.n) AS n
372 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
373 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
374 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
375 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
383 CREATE OR REPLACE FUNCTION public.tree_end(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT)
384 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
388 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
389 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
390 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
391 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
392 JOIN nodes n ON n.id = corp.node_id
394 WHERE list.node_id = luid -- User listId
395 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
396 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
397 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
400 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
401 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
402 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
403 JOIN nodes n ON n.id = corp.node_id
404 JOIN nodes_nodes nn ON nn.node2_id = n.id
406 WHERE list.node_id = lmid -- Master listId
407 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
408 AND n.typename = tdoc -- Master childs (Documents or Contacts)
409 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
410 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
412 SELECT COALESCE(tu.id,tm.id) as id
413 , COALESCE(tu.parent_id,tm.parent_id) as parent_id
414 , COALESCE(tu.terms,tm.terms) AS terms
415 , COALESCE(tu.n,tm.n) AS n
416 , COALESCE(tu.list_type,tm.list_type) AS list_type
417 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
418 FROM tableUser2 tu RIGHT JOIN tableMaster2 tm ON tu.terms = tm.terms
419 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
425 CREATE OR REPLACE FUNCTION public.tree_ngrams(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
426 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
428 RETURN QUERY WITH RECURSIVE
429 ngrams_tree (id,parent_id,terms,n,list_type,weight) AS (
430 SELECT ts.id,ts.parent_id,ts.terms,ts.n,ts.list_type,ts.weight FROM tree_start($1,$2,$3,$4,$5,$6,$7,$8) ts
432 SELECT te.id,te.parent_id,te.terms,te.n,te.list_type,te.weight FROM tree_end($1,$2,$3,$4,$5,$6) as te
433 INNER JOIN ngrams_tree ON te.parent_id = ngrams_tree.id
435 SELECT * from ngrams_tree;
439 select * from tree_ngrams(?,?,?,?,?,?,?,?)
445 type ListIdUser = NodeId
446 type ListIdMaster = NodeId
448 type MapToChildren = Map Text (Set Text)
449 type MapToParent = Map Text Text
451 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
452 getNgramsGroup lu lm = do
453 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
454 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
455 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
456 pure (mapParent, mapChildren)
458 querySelectNgramsGroup :: PGS.Query
459 querySelectNgramsGroup = [sql|
461 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
462 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
463 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
465 nnn.node_id = ? -- User listId
468 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
469 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
470 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
472 nnn.node_id = ? -- Master listId
474 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
475 , COALESCE(gu.t2,gm.t2) AS ngram2_id
476 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1