]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
Merge remote-tracking branch 'origin/dev-comments' into dev
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Schema.Ngrams where
27
28 import Control.Lens (makeLenses, view, over)
29 import Control.Monad (mzero)
30 import Data.ByteString.Internal (ByteString)
31 import Data.Map (Map, fromList, lookup, fromListWith)
32 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
33 import Data.Set (Set)
34 import Data.Text (Text, splitOn)
35 import Database.PostgreSQL.Simple ((:.)(..))
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField, ToField)
39 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
40 import Database.PostgreSQL.Simple.ToRow (toRow)
41 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
42 import Debug.Trace (trace)
43 import GHC.Generics (Generic)
44 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
45 import Gargantext.Database.Config (nodeTypeId,userMaster)
46 import Gargantext.Database.Root (getRoot)
47 import Gargantext.Database.Types.Node (NodeType)
48 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
49 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
50 import Gargantext.Prelude
51 import Opaleye hiding (FromField)
52 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
53 import qualified Data.Set as DS
54 import qualified Database.PostgreSQL.Simple as PGS
55
56
57 type NgramsTerms = Text
58 type NgramsId = Int
59 type Size = Int
60
61 --{-
62 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
63 , ngrams_terms :: terms
64 , ngrams_n :: n
65 } deriving (Show)
66
67 --}
68 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
69 (Column PGText)
70 (Column PGInt4)
71
72 type NgramsRead = NgramsPoly (Column PGInt4)
73 (Column PGText)
74 (Column PGInt4)
75
76 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
77 (Column (Nullable PGText))
78 (Column (Nullable PGInt4))
79
80 --{-
81 type NgramsDb = NgramsPoly Int Text Int
82
83 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
84 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
85
86 ngramsTable :: Table NgramsWrite NgramsRead
87 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
88 , ngrams_terms = required "terms"
89 , ngrams_n = required "n"
90 }
91 )
92 --{-
93 queryNgramsTable :: Query NgramsRead
94 queryNgramsTable = queryTable ngramsTable
95
96 dbGetNgramsDb :: Cmd err [NgramsDb]
97 dbGetNgramsDb = runOpaQuery queryNgramsTable
98 --}
99
100 -- | Main Ngrams Types
101 -- | Typed Ngrams
102 -- Typed Ngrams localize the context of the ngrams
103 -- ngrams in source field of document has Sources Type
104 -- ngrams in authors field of document has Authors Type
105 -- ngrams in text (title or abstract) of documents has Terms Type
106 data NgramsType = Authors | Institutes | Sources | NgramsTerms
107 deriving (Eq, Show, Ord, Enum, Bounded)
108
109 newtype NgramsTypeId = NgramsTypeId Int
110 deriving (Eq, Show, Ord, Num)
111
112 instance ToField NgramsTypeId where
113 toField (NgramsTypeId n) = toField n
114
115 instance FromField NgramsTypeId where
116 fromField fld mdata = do
117 n <- fromField fld mdata
118 if (n :: Int) > 0 then return $ NgramsTypeId n
119 else mzero
120
121 pgNgramsType :: NgramsType -> Column PGInt4
122 pgNgramsType = pgNgramsTypeId . ngramsTypeId
123
124 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
125 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
126
127 ngramsTypeId :: NgramsType -> NgramsTypeId
128 ngramsTypeId Authors = 1
129 ngramsTypeId Institutes = 2
130 ngramsTypeId Sources = 3
131 ngramsTypeId NgramsTerms = 4
132
133 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
134 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
135
136 ------------------------------------------------------------------------
137 -- | TODO put it in Gargantext.Text.Ngrams
138 data Ngrams = Ngrams { _ngramsTerms :: Text
139 , _ngramsSize :: Int
140 } deriving (Generic, Show, Eq, Ord)
141
142 makeLenses ''Ngrams
143 instance PGS.ToRow Ngrams where
144 toRow (Ngrams t s) = [toField t, toField s]
145
146 text2ngrams :: Text -> Ngrams
147 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
148
149 -------------------------------------------------------------------------
150 -- | TODO put it in Gargantext.Text.Ngrams
151 -- Named entity are typed ngrams of Terms Ngrams
152 data NgramsT a =
153 NgramsT { _ngramsType :: NgramsType
154 , _ngramsT :: a
155 } deriving (Generic, Show, Eq, Ord)
156
157 makeLenses ''NgramsT
158
159 instance Functor NgramsT where
160 fmap = over ngramsT
161 -----------------------------------------------------------------------
162 data NgramsIndexed =
163 NgramsIndexed
164 { _ngrams :: Ngrams
165 , _ngramsId :: NgramsId
166 } deriving (Show, Generic, Eq, Ord)
167
168 makeLenses ''NgramsIndexed
169 ------------------------------------------------------------------------
170 data NgramIds =
171 NgramIds
172 { ngramId :: Int
173 , ngramTerms :: Text
174 } deriving (Show, Generic, Eq, Ord)
175
176 instance PGS.FromRow NgramIds where
177 fromRow = NgramIds <$> field <*> field
178
179 ----------------------
180 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
181 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
182
183 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
184 indexNgramsT = fmap . indexNgramsWith . withMap
185
186 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
187 indexNgrams = indexNgramsWith . withMap
188
189 -- NP: not sure we need it anymore
190 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
191 indexNgramsTWith = fmap . indexNgramsWith
192
193 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
194 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
195
196 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
197 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
198 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
199
200 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
201 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
202 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
203 where
204 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
205
206 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
207 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
208 where
209 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
210
211 ----------------------
212 queryInsertNgrams :: PGS.Query
213 queryInsertNgrams = [sql|
214 WITH input_rows(terms,n) AS (?)
215 , ins AS (
216 INSERT INTO ngrams (terms,n)
217 SELECT * FROM input_rows
218 ON CONFLICT (terms) DO NOTHING -- unique index created here
219 RETURNING id,terms
220 )
221
222 SELECT id, terms
223 FROM ins
224 UNION ALL
225 SELECT c.id, terms
226 FROM input_rows
227 JOIN ngrams c USING (terms); -- columns of unique index
228 |]
229
230
231 -- | Ngrams Table
232 -- TODO: the way we are getting main Master Corpus and List ID is not clean
233 -- TODO: if ids are not present -> create
234 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
235 getNgramsTableDb :: NodeType -> NgramsType
236 -> NgramsTableParamUser
237 -> Limit -> Offset
238 -> Cmd err [NgramsTableData]
239 getNgramsTableDb nt ngrt ntp limit_ offset_ = do
240
241
242 maybeRoot <- head <$> getRoot userMaster
243 let path = "Garg.Db.Ngrams.getTableNgrams: "
244 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
245 -- let errMess = panic "Error"
246
247 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
248
249 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
250
251 getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
252
253 data NgramsTableParam =
254 NgramsTableParam { _nt_listId :: NodeId
255 , _nt_corpusId :: NodeId
256 }
257
258 type NgramsTableParamUser = NgramsTableParam
259 type NgramsTableParamMaster = NgramsTableParam
260
261
262 data NgramsTableData = NgramsTableData { _ntd_id :: Int
263 , _ntd_parent_id :: Maybe Int
264 , _ntd_terms :: Text
265 , _ntd_n :: Int
266 , _ntd_listType :: Maybe ListType
267 , _ntd_weight :: Double
268 } deriving (Show)
269
270
271
272 getNgramsTableData :: NodeType -> NgramsType
273 -> NgramsTableParamUser -> NgramsTableParamMaster
274 -> Limit -> Offset
275 -> Cmd err [NgramsTableData]
276 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
277 trace ("Ngrams table params: " <> show params) <$>
278 map (\(i,p,t,n,lt,w) -> NgramsTableData i p t n (fromListTypeId lt) w) <$>
279 runPGSQuery querySelectTableNgramsTrees params
280 where
281 nodeTId = nodeTypeId nodeT
282 ngrmTId = ngramsTypeId ngrmT
283 params = (ul,ml,uc,mc,nodeTId,ngrmTId) :. (limit_, offset_)
284
285 getNgramsTableDataDebug :: PGS.ToRow a => a -> Cmd err ByteString
286 getNgramsTableDataDebug = formatPGSQuery querySelectTableNgramsTrees
287
288
289 querySelectTableNgrams :: PGS.Query
290 querySelectTableNgrams = [sql|
291
292 WITH tableUser AS (
293 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
294 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
295 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
296 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
297 JOIN nodes n ON n.id = corp.node_id
298
299 WHERE list.node_id = ? -- User listId
300 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
301 AND n.typename = ? -- both type of childs (Documents or Contacts)
302 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
303 AND list.parent_id IS NULL
304 )
305 , tableMaster AS (
306 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
307 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
308 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
309 JOIN nodes n ON n.id = corp.node_id
310 JOIN nodes_nodes nn ON nn.node2_id = n.id
311
312 WHERE list.node_id = ? -- Master listId
313 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
314 AND n.typename = ? -- Master childs (Documents or Contacts)
315 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
316 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
317 AND list.parent_id IS NULL
318 )
319
320 SELECT COALESCE(tu.terms,tm.terms) AS terms
321 , COALESCE(tu.n,tm.n) AS n
322 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
323 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
324 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
325 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
326 ORDER BY 1,2
327 LIMIT ?
328 OFFSET ?;
329
330 |]
331
332
333 querySelectTableNgramsTrees :: PGS.Query
334 querySelectTableNgramsTrees = [sql|
335
336 -- DROP FUNCTION tree_start(integer,integer,integer,integer,integer,integer,integer,integer);
337 -- DROP FUNCTION tree_end(integer,integer,integer,integer,integer,integer);
338 -- DROP FUNCTION tree_ngrams(integer,integer,integer,integer,integer,integer,integer,integer);
339
340 CREATE OR REPLACE FUNCTION public.tree_start(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
341 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
342 BEGIN
343 RETURN QUERY
344 WITH tableUser AS (
345 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
346 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
347 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
348 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
349 JOIN nodes n ON n.id = corp.node_id
350
351 WHERE list.node_id = luid -- User listId
352 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
353 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
354 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
355 AND list.parent_id IS NULL
356 ),
357 tableMaster AS (
358 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
359 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
360 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
361 JOIN nodes n ON n.id = corp.node_id
362 JOIN nodes_nodes nn ON nn.node2_id = n.id
363
364 WHERE list.node_id = lmid -- Master listId
365 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
366 AND n.typename = tdoc -- Master childs (Documents or Contacts)
367 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
368 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
369 AND list.parent_id IS NULL
370 )
371
372 SELECT COALESCE(tu.id,tm.id) AS id
373 , COALESCE(tu.parent_id,tm.parent_id) AS parent_id
374 , COALESCE(tu.terms,tm.terms) AS terms
375 , COALESCE(tu.n,tm.n) AS n
376 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
377 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
378 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
379 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
380 ORDER BY 3
381 LIMIT lmt
382 OFFSET ofst
383 ;
384 END $$
385 LANGUAGE plpgsql ;
386
387 CREATE OR REPLACE FUNCTION public.tree_end(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT)
388 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
389 BEGIN
390 RETURN QUERY
391 WITH tableUser2 AS (
392 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
393 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
394 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
395 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
396 JOIN nodes n ON n.id = corp.node_id
397
398 WHERE list.node_id = luid -- User listId
399 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
400 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
401 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
402 )
403 , tableMaster2 AS (
404 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
405 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
406 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
407 JOIN nodes n ON n.id = corp.node_id
408 JOIN nodes_nodes nn ON nn.node2_id = n.id
409
410 WHERE list.node_id = lmid -- Master listId
411 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
412 AND n.typename = tdoc -- Master childs (Documents or Contacts)
413 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
414 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
415 )
416 SELECT COALESCE(tu.id,tm.id) as id
417 , COALESCE(tu.parent_id,tm.parent_id) as parent_id
418 , COALESCE(tu.terms,tm.terms) AS terms
419 , COALESCE(tu.n,tm.n) AS n
420 , COALESCE(tu.list_type,tm.list_type) AS list_type
421 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
422 FROM tableUser2 tu RIGHT JOIN tableMaster2 tm ON tu.terms = tm.terms
423 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
424 ;
425 END $$
426 LANGUAGE plpgsql ;
427
428
429 CREATE OR REPLACE FUNCTION public.tree_ngrams(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
430 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
431 BEGIN
432 RETURN QUERY WITH RECURSIVE
433 ngrams_tree (id,parent_id,terms,n,list_type,weight) AS (
434 SELECT ts.id,ts.parent_id,ts.terms,ts.n,ts.list_type,ts.weight FROM tree_start($1,$2,$3,$4,$5,$6,$7,$8) ts
435 UNION
436 SELECT te.id,te.parent_id,te.terms,te.n,te.list_type,te.weight FROM tree_end($1,$2,$3,$4,$5,$6) as te
437 INNER JOIN ngrams_tree ON te.parent_id = ngrams_tree.id
438 )
439 SELECT * from ngrams_tree;
440 END $$
441 LANGUAGE plpgsql ;
442
443 select * from tree_ngrams(?,?,?,?,?,?,?,?)
444
445 |]
446
447
448
449 type ListIdUser = NodeId
450 type ListIdMaster = NodeId
451
452 type MapToChildren = Map Text (Set Text)
453 type MapToParent = Map Text Text
454
455 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
456 getNgramsGroup lu lm = do
457 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
458 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
459 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
460 pure (mapParent, mapChildren)
461
462 querySelectNgramsGroup :: PGS.Query
463 querySelectNgramsGroup = [sql|
464 WITH groupUser AS (
465 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
466 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
467 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
468 WHERE
469 nnn.node_id = ? -- User listId
470 ),
471 groupMaster AS (
472 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
473 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
474 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
475 WHERE
476 nnn.node_id = ? -- Master listId
477 )
478 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
479 , COALESCE(gu.t2,gm.t2) AS ngram2_id
480 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1
481 |]
482