2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Schema.Ngrams where
28 import Control.Lens (makeLenses, view, over)
29 import Control.Monad (mzero)
31 import Data.Aeson.Types (toJSONKeyText)
32 import Data.ByteString.Internal (ByteString)
33 import Data.Map (Map, fromList, lookup, fromListWith)
34 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
36 import Data.Text (Text, splitOn, pack)
37 import Database.PostgreSQL.Simple ((:.)(..))
38 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
39 import Database.PostgreSQL.Simple.SqlQQ (sql)
40 import Database.PostgreSQL.Simple.ToField (toField, ToField)
41 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
42 import Database.PostgreSQL.Simple.ToRow (toRow)
43 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
44 import Debug.Trace (trace)
45 import GHC.Generics (Generic)
46 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
47 import Gargantext.Database.Config (nodeTypeId,userMaster)
48 import Gargantext.Database.Root (getRoot)
49 import Gargantext.Database.Types.Node (NodeType)
50 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
51 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
52 import Gargantext.Prelude
53 import Opaleye hiding (FromField)
54 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
55 import qualified Data.Set as DS
56 import qualified Database.PostgreSQL.Simple as PGS
60 type NgramsTerms = Text
63 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
64 , ngrams_terms :: terms
68 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
72 type NgramsRead = NgramsPoly (Column PGInt4)
76 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
77 (Column (Nullable PGText))
78 (Column (Nullable PGInt4))
80 type NgramsDb = NgramsPoly Int Text Int
82 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
83 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
85 ngramsTable :: Table NgramsWrite NgramsRead
86 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
87 , ngrams_terms = required "terms"
88 , ngrams_n = required "n"
92 queryNgramsTable :: Query NgramsRead
93 queryNgramsTable = queryTable ngramsTable
95 dbGetNgramsDb :: Cmd err [NgramsDb]
96 dbGetNgramsDb = runOpaQuery queryNgramsTable
98 -- | Main Ngrams Types
100 -- Typed Ngrams localize the context of the ngrams
101 -- ngrams in source field of document has Sources Type
102 -- ngrams in authors field of document has Authors Type
103 -- ngrams in text (title or abstract) of documents has Terms Type
104 data NgramsType = Authors | Institutes | Sources | NgramsTerms
105 deriving (Eq, Show, Ord, Enum, Bounded, Generic)
107 instance FromJSON NgramsType
108 instance FromJSONKey NgramsType where
109 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
110 instance ToJSON NgramsType
111 instance ToJSONKey NgramsType where
112 toJSONKey = toJSONKeyText (pack . show)
114 newtype NgramsTypeId = NgramsTypeId Int
115 deriving (Eq, Show, Ord, Num)
117 instance ToField NgramsTypeId where
118 toField (NgramsTypeId n) = toField n
120 instance FromField NgramsTypeId where
121 fromField fld mdata = do
122 n <- fromField fld mdata
123 if (n :: Int) > 0 then return $ NgramsTypeId n
126 instance QueryRunnerColumnDefault (Nullable PGInt4) NgramsTypeId
128 queryRunnerColumnDefault = fieldQueryRunnerColumn
130 pgNgramsType :: NgramsType -> Column PGInt4
131 pgNgramsType = pgNgramsTypeId . ngramsTypeId
133 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
134 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
136 ngramsTypeId :: NgramsType -> NgramsTypeId
137 ngramsTypeId Authors = 1
138 ngramsTypeId Institutes = 2
139 ngramsTypeId Sources = 3
140 ngramsTypeId NgramsTerms = 4
142 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
143 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
145 ------------------------------------------------------------------------
146 -- | TODO put it in Gargantext.Text.Ngrams
147 data Ngrams = Ngrams { _ngramsTerms :: Text
149 } deriving (Generic, Show, Eq, Ord)
152 instance PGS.ToRow Ngrams where
153 toRow (Ngrams t s) = [toField t, toField s]
155 text2ngrams :: Text -> Ngrams
156 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
158 -------------------------------------------------------------------------
159 -- | TODO put it in Gargantext.Text.Ngrams
160 -- Named entity are typed ngrams of Terms Ngrams
162 NgramsT { _ngramsType :: NgramsType
164 } deriving (Generic, Show, Eq, Ord)
168 instance Functor NgramsT where
170 -----------------------------------------------------------------------
174 , _ngramsId :: NgramsId
175 } deriving (Show, Generic, Eq, Ord)
177 makeLenses ''NgramsIndexed
178 ------------------------------------------------------------------------
183 } deriving (Show, Generic, Eq, Ord)
185 instance PGS.FromRow NgramIds where
186 fromRow = NgramIds <$> field <*> field
188 ----------------------
189 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
190 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
192 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
193 indexNgramsT = fmap . indexNgramsWith . withMap
195 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
196 indexNgrams = indexNgramsWith . withMap
198 -- NP: not sure we need it anymore
199 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
200 indexNgramsTWith = fmap . indexNgramsWith
202 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
203 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
205 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
206 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
207 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
209 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
210 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
211 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
213 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
215 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
216 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
218 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
220 ----------------------
221 queryInsertNgrams :: PGS.Query
222 queryInsertNgrams = [sql|
223 WITH input_rows(terms,n) AS (?)
225 INSERT INTO ngrams (terms,n)
226 SELECT * FROM input_rows
227 ON CONFLICT (terms) DO NOTHING -- unique index created here
236 JOIN ngrams c USING (terms); -- columns of unique index
241 -- TODO: the way we are getting main Master Corpus and List ID is not clean
242 -- TODO: if ids are not present -> create
243 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
244 getNgramsTableDb :: NodeType -> NgramsType
245 -> NgramsTableParamUser
247 -> Cmd err [NgramsTableData]
248 getNgramsTableDb nt ngrt ntp limit_ offset_ = do
251 maybeRoot <- head <$> getRoot userMaster
252 let path = "Garg.Db.Ngrams.getTableNgrams: "
253 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
254 -- let errMess = panic "Error"
256 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
258 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
260 getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
262 data NgramsTableParam =
263 NgramsTableParam { _nt_listId :: NodeId
264 , _nt_corpusId :: NodeId
267 type NgramsTableParamUser = NgramsTableParam
268 type NgramsTableParamMaster = NgramsTableParam
271 data NgramsTableData = NgramsTableData { _ntd_id :: Int
272 , _ntd_parent_id :: Maybe Int
275 , _ntd_listType :: Maybe ListType
276 , _ntd_weight :: Double
281 getNgramsTableData :: NodeType -> NgramsType
282 -> NgramsTableParamUser -> NgramsTableParamMaster
284 -> Cmd err [NgramsTableData]
285 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
286 trace ("Ngrams table params: " <> show params) <$>
287 map (\(i,p,t,n,lt,w) -> NgramsTableData i p t n (fromListTypeId lt) w) <$>
288 runPGSQuery querySelectTableNgramsTrees params
290 nodeTId = nodeTypeId nodeT
291 ngrmTId = ngramsTypeId ngrmT
292 params = (ul,ml,uc,mc,nodeTId,ngrmTId) :. (limit_, offset_)
294 getNgramsTableDataDebug :: PGS.ToRow a => a -> Cmd err ByteString
295 getNgramsTableDataDebug = formatPGSQuery querySelectTableNgramsTrees
298 querySelectTableNgrams :: PGS.Query
299 querySelectTableNgrams = [sql|
302 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
303 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
304 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
305 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
306 JOIN nodes n ON n.id = corp.node_id
308 WHERE list.node_id = ? -- User listId
309 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
310 AND n.typename = ? -- both type of childs (Documents or Contacts)
311 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
312 AND list.parent_id IS NULL
315 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
316 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
317 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
318 JOIN nodes n ON n.id = corp.node_id
319 JOIN nodes_nodes nn ON nn.node2_id = n.id
321 WHERE list.node_id = ? -- Master listId
322 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
323 AND n.typename = ? -- Master childs (Documents or Contacts)
324 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
325 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
326 AND list.parent_id IS NULL
329 SELECT COALESCE(tu.terms,tm.terms) AS terms
330 , COALESCE(tu.n,tm.n) AS n
331 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
332 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
333 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
334 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
342 querySelectTableNgramsTrees :: PGS.Query
343 querySelectTableNgramsTrees = [sql|
345 -- DROP FUNCTION tree_start(integer,integer,integer,integer,integer,integer,integer,integer);
346 -- DROP FUNCTION tree_end(integer,integer,integer,integer,integer,integer);
347 -- DROP FUNCTION tree_ngrams(integer,integer,integer,integer,integer,integer,integer,integer);
349 CREATE OR REPLACE FUNCTION public.tree_start(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
350 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
354 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
355 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
356 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
357 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
358 JOIN nodes n ON n.id = corp.node_id
360 WHERE list.node_id = luid -- User listId
361 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
362 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
363 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
364 AND list.parent_id IS NULL
367 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
368 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
369 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
370 JOIN nodes n ON n.id = corp.node_id
371 JOIN nodes_nodes nn ON nn.node2_id = n.id
373 WHERE list.node_id = lmid -- Master listId
374 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
375 AND n.typename = tdoc -- Master childs (Documents or Contacts)
376 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
377 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
378 AND list.parent_id IS NULL
381 SELECT COALESCE(tu.id,tm.id) AS id
382 , COALESCE(tu.parent_id,tm.parent_id) AS parent_id
383 , COALESCE(tu.terms,tm.terms) AS terms
384 , COALESCE(tu.n,tm.n) AS n
385 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
386 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
387 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
388 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
396 CREATE OR REPLACE FUNCTION public.tree_end(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT)
397 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
401 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
402 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
403 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
404 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
405 JOIN nodes n ON n.id = corp.node_id
407 WHERE list.node_id = luid -- User listId
408 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
409 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
410 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
413 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
414 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
415 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
416 JOIN nodes n ON n.id = corp.node_id
417 JOIN nodes_nodes nn ON nn.node2_id = n.id
419 WHERE list.node_id = lmid -- Master listId
420 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
421 AND n.typename = tdoc -- Master childs (Documents or Contacts)
422 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
423 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
425 SELECT COALESCE(tu.id,tm.id) as id
426 , COALESCE(tu.parent_id,tm.parent_id) as parent_id
427 , COALESCE(tu.terms,tm.terms) AS terms
428 , COALESCE(tu.n,tm.n) AS n
429 , COALESCE(tu.list_type,tm.list_type) AS list_type
430 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
431 FROM tableUser2 tu RIGHT JOIN tableMaster2 tm ON tu.terms = tm.terms
432 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
438 CREATE OR REPLACE FUNCTION public.tree_ngrams(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
439 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
441 RETURN QUERY WITH RECURSIVE
442 ngrams_tree (id,parent_id,terms,n,list_type,weight) AS (
443 SELECT ts.id,ts.parent_id,ts.terms,ts.n,ts.list_type,ts.weight FROM tree_start($1,$2,$3,$4,$5,$6,$7,$8) ts
445 SELECT te.id,te.parent_id,te.terms,te.n,te.list_type,te.weight FROM tree_end($1,$2,$3,$4,$5,$6) as te
446 INNER JOIN ngrams_tree ON te.parent_id = ngrams_tree.id
448 SELECT * from ngrams_tree;
452 select * from tree_ngrams(?,?,?,?,?,?,?,?)
458 type ListIdUser = NodeId
459 type ListIdMaster = NodeId
461 type MapToChildren = Map Text (Set Text)
462 type MapToParent = Map Text Text
464 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
465 getNgramsGroup lu lm = do
466 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
467 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
468 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
469 pure (mapParent, mapChildren)
471 querySelectNgramsGroup :: PGS.Query
472 querySelectNgramsGroup = [sql|
474 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
475 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
476 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
478 nnn.node_id = ? -- User listId
481 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
482 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
483 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
485 nnn.node_id = ? -- Master listId
487 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
488 , COALESCE(gu.t2,gm.t2) AS ngram2_id
489 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1