]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
Merge branch 'dev-ngrams-repo' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell...
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Schema.Ngrams where
27
28 import Control.Lens (makeLenses, view, over)
29 import Control.Monad (mzero)
30 import Data.ByteString.Internal (ByteString)
31 import Data.Map (Map, fromList, lookup, fromListWith)
32 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
33 import Data.Set (Set)
34 import Data.Text (Text, splitOn)
35 import Database.PostgreSQL.Simple ((:.)(..))
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField, ToField)
39 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
40 import Database.PostgreSQL.Simple.ToRow (toRow)
41 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
42 import Debug.Trace (trace)
43 import GHC.Generics (Generic)
44 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
45 import Gargantext.Database.Config (nodeTypeId,userMaster)
46 import Gargantext.Database.Root (getRoot)
47 import Gargantext.Database.Types.Node (NodeType)
48 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
49 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
50 import Gargantext.Prelude
51 import Opaleye hiding (FromField)
52 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
53 import qualified Data.Set as DS
54 import qualified Database.PostgreSQL.Simple as PGS
55
56
57 type NgramsTerms = Text
58 type NgramsId = Int
59 type Size = Int
60
61 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
62 , ngrams_terms :: terms
63 , ngrams_n :: n
64 } deriving (Show)
65
66 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
67 (Column PGText)
68 (Column PGInt4)
69
70 type NgramsRead = NgramsPoly (Column PGInt4)
71 (Column PGText)
72 (Column PGInt4)
73
74 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
75 (Column (Nullable PGText))
76 (Column (Nullable PGInt4))
77
78 type NgramsDb = NgramsPoly Int Text Int
79
80 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
81 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
82
83 ngramsTable :: Table NgramsWrite NgramsRead
84 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
85 , ngrams_terms = required "terms"
86 , ngrams_n = required "n"
87 }
88 )
89
90 queryNgramsTable :: Query NgramsRead
91 queryNgramsTable = queryTable ngramsTable
92
93 dbGetNgramsDb :: Cmd err [NgramsDb]
94 dbGetNgramsDb = runOpaQuery queryNgramsTable
95
96 -- | Main Ngrams Types
97 -- | Typed Ngrams
98 -- Typed Ngrams localize the context of the ngrams
99 -- ngrams in source field of document has Sources Type
100 -- ngrams in authors field of document has Authors Type
101 -- ngrams in text (title or abstract) of documents has Terms Type
102 data NgramsType = Authors | Institutes | Sources | NgramsTerms
103 deriving (Eq, Show, Ord, Enum, Bounded)
104
105 newtype NgramsTypeId = NgramsTypeId Int
106 deriving (Eq, Show, Ord, Num)
107
108 instance ToField NgramsTypeId where
109 toField (NgramsTypeId n) = toField n
110
111 instance FromField NgramsTypeId where
112 fromField fld mdata = do
113 n <- fromField fld mdata
114 if (n :: Int) > 0 then return $ NgramsTypeId n
115 else mzero
116
117 pgNgramsType :: NgramsType -> Column PGInt4
118 pgNgramsType = pgNgramsTypeId . ngramsTypeId
119
120 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
121 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
122
123 ngramsTypeId :: NgramsType -> NgramsTypeId
124 ngramsTypeId Authors = 1
125 ngramsTypeId Institutes = 2
126 ngramsTypeId Sources = 3
127 ngramsTypeId NgramsTerms = 4
128
129 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
130 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
131
132 ------------------------------------------------------------------------
133 -- | TODO put it in Gargantext.Text.Ngrams
134 data Ngrams = Ngrams { _ngramsTerms :: Text
135 , _ngramsSize :: Int
136 } deriving (Generic, Show, Eq, Ord)
137
138 makeLenses ''Ngrams
139 instance PGS.ToRow Ngrams where
140 toRow (Ngrams t s) = [toField t, toField s]
141
142 text2ngrams :: Text -> Ngrams
143 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
144
145 -------------------------------------------------------------------------
146 -- | TODO put it in Gargantext.Text.Ngrams
147 -- Named entity are typed ngrams of Terms Ngrams
148 data NgramsT a =
149 NgramsT { _ngramsType :: NgramsType
150 , _ngramsT :: a
151 } deriving (Generic, Show, Eq, Ord)
152
153 makeLenses ''NgramsT
154
155 instance Functor NgramsT where
156 fmap = over ngramsT
157 -----------------------------------------------------------------------
158 data NgramsIndexed =
159 NgramsIndexed
160 { _ngrams :: Ngrams
161 , _ngramsId :: NgramsId
162 } deriving (Show, Generic, Eq, Ord)
163
164 makeLenses ''NgramsIndexed
165 ------------------------------------------------------------------------
166 data NgramIds =
167 NgramIds
168 { ngramId :: Int
169 , ngramTerms :: Text
170 } deriving (Show, Generic, Eq, Ord)
171
172 instance PGS.FromRow NgramIds where
173 fromRow = NgramIds <$> field <*> field
174
175 ----------------------
176 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
177 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
178
179 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
180 indexNgramsT = fmap . indexNgramsWith . withMap
181
182 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
183 indexNgrams = indexNgramsWith . withMap
184
185 -- NP: not sure we need it anymore
186 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
187 indexNgramsTWith = fmap . indexNgramsWith
188
189 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
190 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
191
192 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
193 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
194 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
195
196 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
197 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
198 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
199 where
200 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
201
202 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
203 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
204 where
205 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
206
207 ----------------------
208 queryInsertNgrams :: PGS.Query
209 queryInsertNgrams = [sql|
210 WITH input_rows(terms,n) AS (?)
211 , ins AS (
212 INSERT INTO ngrams (terms,n)
213 SELECT * FROM input_rows
214 ON CONFLICT (terms) DO NOTHING -- unique index created here
215 RETURNING id,terms
216 )
217
218 SELECT id, terms
219 FROM ins
220 UNION ALL
221 SELECT c.id, terms
222 FROM input_rows
223 JOIN ngrams c USING (terms); -- columns of unique index
224 |]
225
226
227 -- | Ngrams Table
228 -- TODO: the way we are getting main Master Corpus and List ID is not clean
229 -- TODO: if ids are not present -> create
230 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
231 getNgramsTableDb :: NodeType -> NgramsType
232 -> NgramsTableParamUser
233 -> Limit -> Offset
234 -> Cmd err [NgramsTableData]
235 getNgramsTableDb nt ngrt ntp limit_ offset_ = do
236
237
238 maybeRoot <- head <$> getRoot userMaster
239 let path = "Garg.Db.Ngrams.getTableNgrams: "
240 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
241 -- let errMess = panic "Error"
242
243 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
244
245 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
246
247 getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
248
249 data NgramsTableParam =
250 NgramsTableParam { _nt_listId :: NodeId
251 , _nt_corpusId :: NodeId
252 }
253
254 type NgramsTableParamUser = NgramsTableParam
255 type NgramsTableParamMaster = NgramsTableParam
256
257
258 data NgramsTableData = NgramsTableData { _ntd_id :: Int
259 , _ntd_parent_id :: Maybe Int
260 , _ntd_terms :: Text
261 , _ntd_n :: Int
262 , _ntd_listType :: Maybe ListType
263 , _ntd_weight :: Double
264 } deriving (Show)
265
266
267
268 getNgramsTableData :: NodeType -> NgramsType
269 -> NgramsTableParamUser -> NgramsTableParamMaster
270 -> Limit -> Offset
271 -> Cmd err [NgramsTableData]
272 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
273 trace ("Ngrams table params: " <> show params) <$>
274 map (\(i,p,t,n,lt,w) -> NgramsTableData i p t n (fromListTypeId lt) w) <$>
275 runPGSQuery querySelectTableNgramsTrees params
276 where
277 nodeTId = nodeTypeId nodeT
278 ngrmTId = ngramsTypeId ngrmT
279 params = (ul,ml,uc,mc,nodeTId,ngrmTId) :. (limit_, offset_)
280
281 getNgramsTableDataDebug :: PGS.ToRow a => a -> Cmd err ByteString
282 getNgramsTableDataDebug = formatPGSQuery querySelectTableNgramsTrees
283
284
285 querySelectTableNgrams :: PGS.Query
286 querySelectTableNgrams = [sql|
287
288 WITH tableUser AS (
289 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
290 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
291 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
292 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
293 JOIN nodes n ON n.id = corp.node_id
294
295 WHERE list.node_id = ? -- User listId
296 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
297 AND n.typename = ? -- both type of childs (Documents or Contacts)
298 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
299 AND list.parent_id IS NULL
300 )
301 , tableMaster AS (
302 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
303 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
304 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
305 JOIN nodes n ON n.id = corp.node_id
306 JOIN nodes_nodes nn ON nn.node2_id = n.id
307
308 WHERE list.node_id = ? -- Master listId
309 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
310 AND n.typename = ? -- Master childs (Documents or Contacts)
311 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
312 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
313 AND list.parent_id IS NULL
314 )
315
316 SELECT COALESCE(tu.terms,tm.terms) AS terms
317 , COALESCE(tu.n,tm.n) AS n
318 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
319 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
320 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
321 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
322 ORDER BY 1,2
323 LIMIT ?
324 OFFSET ?;
325
326 |]
327
328
329 querySelectTableNgramsTrees :: PGS.Query
330 querySelectTableNgramsTrees = [sql|
331
332 -- DROP FUNCTION tree_start(integer,integer,integer,integer,integer,integer,integer,integer);
333 -- DROP FUNCTION tree_end(integer,integer,integer,integer,integer,integer);
334 -- DROP FUNCTION tree_ngrams(integer,integer,integer,integer,integer,integer,integer,integer);
335
336 CREATE OR REPLACE FUNCTION public.tree_start(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
337 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
338 BEGIN
339 RETURN QUERY
340 WITH tableUser AS (
341 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
342 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
343 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
344 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
345 JOIN nodes n ON n.id = corp.node_id
346
347 WHERE list.node_id = luid -- User listId
348 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
349 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
350 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
351 AND list.parent_id IS NULL
352 ),
353 tableMaster AS (
354 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
355 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
356 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
357 JOIN nodes n ON n.id = corp.node_id
358 JOIN nodes_nodes nn ON nn.node2_id = n.id
359
360 WHERE list.node_id = lmid -- Master listId
361 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
362 AND n.typename = tdoc -- Master childs (Documents or Contacts)
363 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
364 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
365 AND list.parent_id IS NULL
366 )
367
368 SELECT COALESCE(tu.id,tm.id) AS id
369 , COALESCE(tu.parent_id,tm.parent_id) AS parent_id
370 , COALESCE(tu.terms,tm.terms) AS terms
371 , COALESCE(tu.n,tm.n) AS n
372 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
373 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
374 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
375 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
376 ORDER BY 3
377 LIMIT lmt
378 OFFSET ofst
379 ;
380 END $$
381 LANGUAGE plpgsql ;
382
383 CREATE OR REPLACE FUNCTION public.tree_end(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT)
384 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
385 BEGIN
386 RETURN QUERY
387 WITH tableUser2 AS (
388 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
389 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
390 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
391 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
392 JOIN nodes n ON n.id = corp.node_id
393
394 WHERE list.node_id = luid -- User listId
395 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
396 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
397 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
398 )
399 , tableMaster2 AS (
400 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
401 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
402 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
403 JOIN nodes n ON n.id = corp.node_id
404 JOIN nodes_nodes nn ON nn.node2_id = n.id
405
406 WHERE list.node_id = lmid -- Master listId
407 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
408 AND n.typename = tdoc -- Master childs (Documents or Contacts)
409 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
410 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
411 )
412 SELECT COALESCE(tu.id,tm.id) as id
413 , COALESCE(tu.parent_id,tm.parent_id) as parent_id
414 , COALESCE(tu.terms,tm.terms) AS terms
415 , COALESCE(tu.n,tm.n) AS n
416 , COALESCE(tu.list_type,tm.list_type) AS list_type
417 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
418 FROM tableUser2 tu RIGHT JOIN tableMaster2 tm ON tu.terms = tm.terms
419 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
420 ;
421 END $$
422 LANGUAGE plpgsql ;
423
424
425 CREATE OR REPLACE FUNCTION public.tree_ngrams(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
426 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
427 BEGIN
428 RETURN QUERY WITH RECURSIVE
429 ngrams_tree (id,parent_id,terms,n,list_type,weight) AS (
430 SELECT ts.id,ts.parent_id,ts.terms,ts.n,ts.list_type,ts.weight FROM tree_start($1,$2,$3,$4,$5,$6,$7,$8) ts
431 UNION
432 SELECT te.id,te.parent_id,te.terms,te.n,te.list_type,te.weight FROM tree_end($1,$2,$3,$4,$5,$6) as te
433 INNER JOIN ngrams_tree ON te.parent_id = ngrams_tree.id
434 )
435 SELECT * from ngrams_tree;
436 END $$
437 LANGUAGE plpgsql ;
438
439 select * from tree_ngrams(?,?,?,?,?,?,?,?)
440
441 |]
442
443
444
445 type ListIdUser = NodeId
446 type ListIdMaster = NodeId
447
448 type MapToChildren = Map Text (Set Text)
449 type MapToParent = Map Text Text
450
451 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
452 getNgramsGroup lu lm = do
453 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
454 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
455 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
456 pure (mapParent, mapChildren)
457
458 querySelectNgramsGroup :: PGS.Query
459 querySelectNgramsGroup = [sql|
460 WITH groupUser AS (
461 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
462 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
463 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
464 WHERE
465 nnn.node_id = ? -- User listId
466 ),
467 groupMaster AS (
468 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
469 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
470 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
471 WHERE
472 nnn.node_id = ? -- Master listId
473 )
474 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
475 , COALESCE(gu.t2,gm.t2) AS ngram2_id
476 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1
477 |]
478