2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Schema.Ngrams where
28 import Control.Lens (makeLenses, view, over)
29 import Control.Monad (mzero)
30 import Data.ByteString.Internal (ByteString)
31 import Data.Map (Map, fromList, lookup, fromListWith)
32 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
34 import Data.Text (Text, splitOn)
35 import Database.PostgreSQL.Simple ((:.)(..))
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField, ToField)
39 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
40 import Database.PostgreSQL.Simple.ToRow (toRow)
41 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
42 import Debug.Trace (trace)
43 import GHC.Generics (Generic)
44 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
45 import Gargantext.Database.Config (nodeTypeId,userMaster)
46 import Gargantext.Database.Root (getRoot)
47 import Gargantext.Database.Types.Node (NodeType)
48 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
49 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
50 import Gargantext.Prelude
51 import Opaleye hiding (FromField)
52 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
53 import qualified Data.Set as DS
54 import qualified Database.PostgreSQL.Simple as PGS
57 type NgramsTerms = Text
62 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
63 , ngrams_terms :: terms
68 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
72 type NgramsRead = NgramsPoly (Column PGInt4)
76 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
77 (Column (Nullable PGText))
78 (Column (Nullable PGInt4))
81 type NgramsDb = NgramsPoly Int Text Int
83 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
84 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
86 ngramsTable :: Table NgramsWrite NgramsRead
87 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
88 , ngrams_terms = required "terms"
89 , ngrams_n = required "n"
93 queryNgramsTable :: Query NgramsRead
94 queryNgramsTable = queryTable ngramsTable
96 dbGetNgramsDb :: Cmd err [NgramsDb]
97 dbGetNgramsDb = runOpaQuery queryNgramsTable
100 -- | Main Ngrams Types
102 -- Typed Ngrams localize the context of the ngrams
103 -- ngrams in source field of document has Sources Type
104 -- ngrams in authors field of document has Authors Type
105 -- ngrams in text (title or abstract) of documents has Terms Type
106 data NgramsType = Authors | Institutes | Sources | NgramsTerms
107 deriving (Eq, Show, Ord, Enum, Bounded)
109 newtype NgramsTypeId = NgramsTypeId Int
110 deriving (Eq, Show, Ord, Num)
112 instance ToField NgramsTypeId where
113 toField (NgramsTypeId n) = toField n
115 instance FromField NgramsTypeId where
116 fromField fld mdata = do
117 n <- fromField fld mdata
118 if (n :: Int) > 0 then return $ NgramsTypeId n
121 pgNgramsType :: NgramsType -> Column PGInt4
122 pgNgramsType = pgNgramsTypeId . ngramsTypeId
124 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
125 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
127 ngramsTypeId :: NgramsType -> NgramsTypeId
128 ngramsTypeId Authors = 1
129 ngramsTypeId Institutes = 2
130 ngramsTypeId Sources = 3
131 ngramsTypeId NgramsTerms = 4
133 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
134 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
136 ------------------------------------------------------------------------
137 -- | TODO put it in Gargantext.Text.Ngrams
138 data Ngrams = Ngrams { _ngramsTerms :: Text
140 } deriving (Generic, Show, Eq, Ord)
143 instance PGS.ToRow Ngrams where
144 toRow (Ngrams t s) = [toField t, toField s]
146 text2ngrams :: Text -> Ngrams
147 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
149 -------------------------------------------------------------------------
150 -- | TODO put it in Gargantext.Text.Ngrams
151 -- Named entity are typed ngrams of Terms Ngrams
153 NgramsT { _ngramsType :: NgramsType
155 } deriving (Generic, Show, Eq, Ord)
159 instance Functor NgramsT where
161 -----------------------------------------------------------------------
165 , _ngramsId :: NgramsId
166 } deriving (Show, Generic, Eq, Ord)
168 makeLenses ''NgramsIndexed
169 ------------------------------------------------------------------------
174 } deriving (Show, Generic, Eq, Ord)
176 instance PGS.FromRow NgramIds where
177 fromRow = NgramIds <$> field <*> field
179 ----------------------
180 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
181 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
183 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
184 indexNgramsT = fmap . indexNgramsWith . withMap
186 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
187 indexNgrams = indexNgramsWith . withMap
189 -- NP: not sure we need it anymore
190 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
191 indexNgramsTWith = fmap . indexNgramsWith
193 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
194 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
196 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
197 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
199 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
200 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
202 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
204 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
205 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
207 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
209 ----------------------
210 queryInsertNgrams :: PGS.Query
211 queryInsertNgrams = [sql|
212 WITH input_rows(terms,n) AS (?)
214 INSERT INTO ngrams (terms,n)
215 SELECT * FROM input_rows
216 ON CONFLICT (terms) DO NOTHING -- unique index created here
225 JOIN ngrams c USING (terms); -- columns of unique index
230 -- TODO: the way we are getting main Master Corpus and List ID is not clean
231 -- TODO: if ids are not present -> create
232 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
233 getNgramsTableDb :: NodeType -> NgramsType
234 -> NgramsTableParamUser
236 -> Cmd err [NgramsTableData]
237 getNgramsTableDb nt ngrt ntp limit_ offset_ = do
240 maybeRoot <- head <$> getRoot userMaster
241 let path = "Garg.Db.Ngrams.getTableNgrams: "
242 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
243 -- let errMess = panic "Error"
245 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
247 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
249 getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
251 data NgramsTableParam =
252 NgramsTableParam { _nt_listId :: NodeId
253 , _nt_corpusId :: NodeId
256 type NgramsTableParamUser = NgramsTableParam
257 type NgramsTableParamMaster = NgramsTableParam
260 data NgramsTableData = NgramsTableData { _ntd_id :: Int
261 , _ntd_parent_id :: Maybe Int
264 , _ntd_listType :: Maybe ListType
265 , _ntd_weight :: Double
270 getNgramsTableData :: NodeType -> NgramsType
271 -> NgramsTableParamUser -> NgramsTableParamMaster
273 -> Cmd err [NgramsTableData]
274 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
275 trace ("Ngrams table params: " <> show params) <$>
276 map (\(i,p,t,n,lt,w) -> NgramsTableData i p t n (fromListTypeId lt) w) <$>
277 runPGSQuery querySelectTableNgramsTrees params
279 nodeTId = nodeTypeId nodeT
280 ngrmTId = ngramsTypeId ngrmT
281 params = (ul,ml,uc,mc,nodeTId,ngrmTId) :. (limit_, offset_)
283 getNgramsTableDataDebug :: PGS.ToRow a => a -> Cmd err ByteString
284 getNgramsTableDataDebug = formatPGSQuery querySelectTableNgramsTrees
287 querySelectTableNgrams :: PGS.Query
288 querySelectTableNgrams = [sql|
291 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
292 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
293 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
294 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
295 JOIN nodes n ON n.id = corp.node_id
297 WHERE list.node_id = ? -- User listId
298 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
299 AND n.typename = ? -- both type of childs (Documents or Contacts)
300 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
301 AND list.parent_id IS NULL
304 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
305 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
306 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
307 JOIN nodes n ON n.id = corp.node_id
308 JOIN nodes_nodes nn ON nn.node2_id = n.id
310 WHERE list.node_id = ? -- Master listId
311 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
312 AND n.typename = ? -- Master childs (Documents or Contacts)
313 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
314 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
315 AND list.parent_id IS NULL
318 SELECT COALESCE(tu.terms,tm.terms) AS terms
319 , COALESCE(tu.n,tm.n) AS n
320 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
321 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
322 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
323 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
331 querySelectTableNgramsTrees :: PGS.Query
332 querySelectTableNgramsTrees = [sql|
334 -- DROP FUNCTION tree_start(integer,integer,integer,integer,integer,integer,integer,integer);
335 -- DROP FUNCTION tree_end(integer,integer,integer,integer,integer,integer);
336 -- DROP FUNCTION tree_ngrams(integer,integer,integer,integer,integer,integer,integer,integer);
338 CREATE OR REPLACE FUNCTION public.tree_start(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
339 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
343 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
344 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
345 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
346 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
347 JOIN nodes n ON n.id = corp.node_id
349 WHERE list.node_id = luid -- User listId
350 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
351 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
352 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
353 AND list.parent_id IS NULL
356 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
357 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
358 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
359 JOIN nodes n ON n.id = corp.node_id
360 JOIN nodes_nodes nn ON nn.node2_id = n.id
362 WHERE list.node_id = lmid -- Master listId
363 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
364 AND n.typename = tdoc -- Master childs (Documents or Contacts)
365 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
366 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
367 AND list.parent_id IS NULL
370 SELECT COALESCE(tu.id,tm.id) AS id
371 , COALESCE(tu.parent_id,tm.parent_id) AS parent_id
372 , COALESCE(tu.terms,tm.terms) AS terms
373 , COALESCE(tu.n,tm.n) AS n
374 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
375 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
376 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
377 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
385 CREATE OR REPLACE FUNCTION public.tree_end(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT)
386 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
390 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
391 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
392 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
393 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
394 JOIN nodes n ON n.id = corp.node_id
396 WHERE list.node_id = luid -- User listId
397 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
398 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
399 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
402 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
403 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
404 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
405 JOIN nodes n ON n.id = corp.node_id
406 JOIN nodes_nodes nn ON nn.node2_id = n.id
408 WHERE list.node_id = lmid -- Master listId
409 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
410 AND n.typename = tdoc -- Master childs (Documents or Contacts)
411 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
412 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
414 SELECT COALESCE(tu.id,tm.id) as id
415 , COALESCE(tu.parent_id,tm.parent_id) as parent_id
416 , COALESCE(tu.terms,tm.terms) AS terms
417 , COALESCE(tu.n,tm.n) AS n
418 , COALESCE(tu.list_type,tm.list_type) AS list_type
419 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
420 FROM tableUser2 tu RIGHT JOIN tableMaster2 tm ON tu.terms = tm.terms
421 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
427 CREATE OR REPLACE FUNCTION public.tree_ngrams(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
428 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
430 RETURN QUERY WITH RECURSIVE
431 ngrams_tree (id,parent_id,terms,n,list_type,weight) AS (
432 SELECT ts.id,ts.parent_id,ts.terms,ts.n,ts.list_type,ts.weight FROM tree_start($1,$2,$3,$4,$5,$6,$7,$8) ts
434 SELECT te.id,te.parent_id,te.terms,te.n,te.list_type,te.weight FROM tree_end($1,$2,$3,$4,$5,$6) as te
435 INNER JOIN ngrams_tree ON te.parent_id = ngrams_tree.id
437 SELECT * from ngrams_tree;
441 select * from tree_ngrams(?,?,?,?,?,?,?,?)
447 type ListIdUser = NodeId
448 type ListIdMaster = NodeId
450 type MapToChildren = Map Text (Set Text)
451 type MapToParent = Map Text Text
453 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
454 getNgramsGroup lu lm = do
455 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
456 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
457 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
458 pure (mapParent, mapChildren)
460 querySelectNgramsGroup :: PGS.Query
461 querySelectNgramsGroup = [sql|
463 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
464 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
465 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
467 nnn.node_id = ? -- User listId
470 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
471 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
472 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
474 nnn.node_id = ? -- Master listId
476 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
477 , COALESCE(gu.t2,gm.t2) AS ngram2_id
478 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1