2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Schema.Ngrams where
28 import Data.Aeson (FromJSON, FromJSONKey)
29 import Control.Lens (makeLenses, view, over)
30 import Control.Monad (mzero)
32 import Data.ByteString.Internal (ByteString)
33 import Data.Map (Map, fromList, lookup, fromListWith)
34 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
36 import Data.Text (Text, splitOn)
37 import Database.PostgreSQL.Simple ((:.)(..))
38 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
39 import Database.PostgreSQL.Simple.SqlQQ (sql)
40 import Database.PostgreSQL.Simple.ToField (toField, ToField)
41 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
42 import Database.PostgreSQL.Simple.ToRow (toRow)
43 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
44 import Debug.Trace (trace)
45 import GHC.Generics (Generic)
46 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
47 import Gargantext.Database.Config (nodeTypeId,userMaster)
48 import Gargantext.Database.Root (getRoot)
49 import Gargantext.Database.Types.Node (NodeType)
50 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
51 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
52 import Gargantext.Prelude
53 import Opaleye hiding (FromField)
54 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
55 import qualified Data.Set as DS
56 import qualified Database.PostgreSQL.Simple as PGS
59 type NgramsTerms = Text
63 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
64 , ngrams_terms :: terms
68 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
72 type NgramsRead = NgramsPoly (Column PGInt4)
76 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
77 (Column (Nullable PGText))
78 (Column (Nullable PGInt4))
80 type NgramsDb = NgramsPoly Int Text Int
82 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
83 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
85 ngramsTable :: Table NgramsWrite NgramsRead
86 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
87 , ngrams_terms = required "terms"
88 , ngrams_n = required "n"
92 queryNgramsTable :: Query NgramsRead
93 queryNgramsTable = queryTable ngramsTable
95 dbGetNgramsDb :: Cmd err [NgramsDb]
96 dbGetNgramsDb = runOpaQuery queryNgramsTable
98 -- | Main Ngrams Types
100 -- Typed Ngrams localize the context of the ngrams
101 -- ngrams in source field of document has Sources Type
102 -- ngrams in authors field of document has Authors Type
103 -- ngrams in text (title or abstract) of documents has Terms Type
104 data NgramsType = Authors | Institutes | Sources | NgramsTerms
105 deriving (Eq, Show, Ord, Enum, Bounded, Generic)
107 instance FromJSON NgramsType
108 instance FromJSONKey NgramsType
109 instance ToJSON NgramsType
110 instance ToJSONKey NgramsType
112 newtype NgramsTypeId = NgramsTypeId Int
113 deriving (Eq, Show, Ord, Num)
115 instance ToField NgramsTypeId where
116 toField (NgramsTypeId n) = toField n
118 instance FromField NgramsTypeId where
119 fromField fld mdata = do
120 n <- fromField fld mdata
121 if (n :: Int) > 0 then return $ NgramsTypeId n
124 pgNgramsType :: NgramsType -> Column PGInt4
125 pgNgramsType = pgNgramsTypeId . ngramsTypeId
127 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
128 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
130 ngramsTypeId :: NgramsType -> NgramsTypeId
131 ngramsTypeId Authors = 1
132 ngramsTypeId Institutes = 2
133 ngramsTypeId Sources = 3
134 ngramsTypeId NgramsTerms = 4
136 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
137 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
139 ------------------------------------------------------------------------
140 -- | TODO put it in Gargantext.Text.Ngrams
141 data Ngrams = Ngrams { _ngramsTerms :: Text
143 } deriving (Generic, Show, Eq, Ord)
146 instance PGS.ToRow Ngrams where
147 toRow (Ngrams t s) = [toField t, toField s]
149 text2ngrams :: Text -> Ngrams
150 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
152 -------------------------------------------------------------------------
153 -- | TODO put it in Gargantext.Text.Ngrams
154 -- Named entity are typed ngrams of Terms Ngrams
156 NgramsT { _ngramsType :: NgramsType
158 } deriving (Generic, Show, Eq, Ord)
162 instance Functor NgramsT where
164 -----------------------------------------------------------------------
168 , _ngramsId :: NgramsId
169 } deriving (Show, Generic, Eq, Ord)
171 makeLenses ''NgramsIndexed
172 ------------------------------------------------------------------------
177 } deriving (Show, Generic, Eq, Ord)
179 instance PGS.FromRow NgramIds where
180 fromRow = NgramIds <$> field <*> field
182 ----------------------
183 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
184 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
186 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
187 indexNgramsT = fmap . indexNgramsWith . withMap
189 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
190 indexNgrams = indexNgramsWith . withMap
192 -- NP: not sure we need it anymore
193 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
194 indexNgramsTWith = fmap . indexNgramsWith
196 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
197 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
199 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
200 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
201 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
203 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
204 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
205 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
207 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
209 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
210 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
212 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
214 ----------------------
215 queryInsertNgrams :: PGS.Query
216 queryInsertNgrams = [sql|
217 WITH input_rows(terms,n) AS (?)
219 INSERT INTO ngrams (terms,n)
220 SELECT * FROM input_rows
221 ON CONFLICT (terms) DO NOTHING -- unique index created here
230 JOIN ngrams c USING (terms); -- columns of unique index
235 -- TODO: the way we are getting main Master Corpus and List ID is not clean
236 -- TODO: if ids are not present -> create
237 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
238 getNgramsTableDb :: NodeType -> NgramsType
239 -> NgramsTableParamUser
241 -> Cmd err [NgramsTableData]
242 getNgramsTableDb nt ngrt ntp limit_ offset_ = do
245 maybeRoot <- head <$> getRoot userMaster
246 let path = "Garg.Db.Ngrams.getTableNgrams: "
247 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
248 -- let errMess = panic "Error"
250 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
252 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
254 getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
256 data NgramsTableParam =
257 NgramsTableParam { _nt_listId :: NodeId
258 , _nt_corpusId :: NodeId
261 type NgramsTableParamUser = NgramsTableParam
262 type NgramsTableParamMaster = NgramsTableParam
265 data NgramsTableData = NgramsTableData { _ntd_id :: Int
266 , _ntd_parent_id :: Maybe Int
269 , _ntd_listType :: Maybe ListType
270 , _ntd_weight :: Double
275 getNgramsTableData :: NodeType -> NgramsType
276 -> NgramsTableParamUser -> NgramsTableParamMaster
278 -> Cmd err [NgramsTableData]
279 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
280 trace ("Ngrams table params: " <> show params) <$>
281 map (\(i,p,t,n,lt,w) -> NgramsTableData i p t n (fromListTypeId lt) w) <$>
282 runPGSQuery querySelectTableNgramsTrees params
284 nodeTId = nodeTypeId nodeT
285 ngrmTId = ngramsTypeId ngrmT
286 params = (ul,ml,uc,mc,nodeTId,ngrmTId) :. (limit_, offset_)
288 getNgramsTableDataDebug :: PGS.ToRow a => a -> Cmd err ByteString
289 getNgramsTableDataDebug = formatPGSQuery querySelectTableNgramsTrees
292 querySelectTableNgrams :: PGS.Query
293 querySelectTableNgrams = [sql|
296 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
297 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
298 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
299 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
300 JOIN nodes n ON n.id = corp.node_id
302 WHERE list.node_id = ? -- User listId
303 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
304 AND n.typename = ? -- both type of childs (Documents or Contacts)
305 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
306 AND list.parent_id IS NULL
309 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
310 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
311 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
312 JOIN nodes n ON n.id = corp.node_id
313 JOIN nodes_nodes nn ON nn.node2_id = n.id
315 WHERE list.node_id = ? -- Master listId
316 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
317 AND n.typename = ? -- Master childs (Documents or Contacts)
318 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
319 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
320 AND list.parent_id IS NULL
323 SELECT COALESCE(tu.terms,tm.terms) AS terms
324 , COALESCE(tu.n,tm.n) AS n
325 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
326 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
327 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
328 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
336 querySelectTableNgramsTrees :: PGS.Query
337 querySelectTableNgramsTrees = [sql|
339 -- DROP FUNCTION tree_start(integer,integer,integer,integer,integer,integer,integer,integer);
340 -- DROP FUNCTION tree_end(integer,integer,integer,integer,integer,integer);
341 -- DROP FUNCTION tree_ngrams(integer,integer,integer,integer,integer,integer,integer,integer);
343 CREATE OR REPLACE FUNCTION public.tree_start(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
344 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
348 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
349 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
350 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
351 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
352 JOIN nodes n ON n.id = corp.node_id
354 WHERE list.node_id = luid -- User listId
355 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
356 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
357 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
358 AND list.parent_id IS NULL
361 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
362 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
363 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
364 JOIN nodes n ON n.id = corp.node_id
365 JOIN nodes_nodes nn ON nn.node2_id = n.id
367 WHERE list.node_id = lmid -- Master listId
368 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
369 AND n.typename = tdoc -- Master childs (Documents or Contacts)
370 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
371 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
372 AND list.parent_id IS NULL
375 SELECT COALESCE(tu.id,tm.id) AS id
376 , COALESCE(tu.parent_id,tm.parent_id) AS parent_id
377 , COALESCE(tu.terms,tm.terms) AS terms
378 , COALESCE(tu.n,tm.n) AS n
379 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
380 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
381 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
382 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
390 CREATE OR REPLACE FUNCTION public.tree_end(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT)
391 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
395 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
396 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
397 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
398 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
399 JOIN nodes n ON n.id = corp.node_id
401 WHERE list.node_id = luid -- User listId
402 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
403 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
404 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
407 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
408 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
409 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
410 JOIN nodes n ON n.id = corp.node_id
411 JOIN nodes_nodes nn ON nn.node2_id = n.id
413 WHERE list.node_id = lmid -- Master listId
414 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
415 AND n.typename = tdoc -- Master childs (Documents or Contacts)
416 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
417 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
419 SELECT COALESCE(tu.id,tm.id) as id
420 , COALESCE(tu.parent_id,tm.parent_id) as parent_id
421 , COALESCE(tu.terms,tm.terms) AS terms
422 , COALESCE(tu.n,tm.n) AS n
423 , COALESCE(tu.list_type,tm.list_type) AS list_type
424 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
425 FROM tableUser2 tu RIGHT JOIN tableMaster2 tm ON tu.terms = tm.terms
426 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
432 CREATE OR REPLACE FUNCTION public.tree_ngrams(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
433 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
435 RETURN QUERY WITH RECURSIVE
436 ngrams_tree (id,parent_id,terms,n,list_type,weight) AS (
437 SELECT ts.id,ts.parent_id,ts.terms,ts.n,ts.list_type,ts.weight FROM tree_start($1,$2,$3,$4,$5,$6,$7,$8) ts
439 SELECT te.id,te.parent_id,te.terms,te.n,te.list_type,te.weight FROM tree_end($1,$2,$3,$4,$5,$6) as te
440 INNER JOIN ngrams_tree ON te.parent_id = ngrams_tree.id
442 SELECT * from ngrams_tree;
446 select * from tree_ngrams(?,?,?,?,?,?,?,?)
452 type ListIdUser = NodeId
453 type ListIdMaster = NodeId
455 type MapToChildren = Map Text (Set Text)
456 type MapToParent = Map Text Text
458 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
459 getNgramsGroup lu lm = do
460 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
461 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
462 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
463 pure (mapParent, mapChildren)
465 querySelectNgramsGroup :: PGS.Query
466 querySelectNgramsGroup = [sql|
468 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
469 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
470 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
472 nnn.node_id = ? -- User listId
475 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
476 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
477 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
479 nnn.node_id = ? -- Master listId
481 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
482 , COALESCE(gu.t2,gm.t2) AS ngram2_id
483 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1