]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
Merge branch 'dev-phylo' of https://gitlab.iscpif.fr/gargantext/haskell-gargantext...
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Schema.Ngrams where
27
28 import Control.Lens (makeLenses, view, over)
29 import Control.Monad (mzero)
30 import Data.Aeson
31 import Data.Aeson.Types (toJSONKeyText)
32 import Data.ByteString.Internal (ByteString)
33 import Data.Map (Map, fromList, lookup, fromListWith)
34 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
35 import Data.Set (Set)
36 import Data.Text (Text, splitOn, pack)
37 import Database.PostgreSQL.Simple ((:.)(..))
38 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
39 import Database.PostgreSQL.Simple.SqlQQ (sql)
40 import Database.PostgreSQL.Simple.ToField (toField, ToField)
41 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
42 import Database.PostgreSQL.Simple.ToRow (toRow)
43 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
44 import Debug.Trace (trace)
45 import GHC.Generics (Generic)
46 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
47 import Gargantext.Database.Config (nodeTypeId,userMaster)
48 import Gargantext.Database.Root (getRoot)
49 import Gargantext.Database.Types.Node (NodeType)
50 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
51 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
52 import Gargantext.Prelude
53 import Opaleye hiding (FromField)
54 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
55 import qualified Data.Set as DS
56 import qualified Database.PostgreSQL.Simple as PGS
57
58
59 type NgramsId = Int
60 type NgramsTerms = Text
61 type Size = Int
62
63 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
64 , ngrams_terms :: terms
65 , ngrams_n :: n
66 } deriving (Show)
67
68 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
69 (Column PGText)
70 (Column PGInt4)
71
72 type NgramsRead = NgramsPoly (Column PGInt4)
73 (Column PGText)
74 (Column PGInt4)
75
76 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
77 (Column (Nullable PGText))
78 (Column (Nullable PGInt4))
79
80 type NgramsDb = NgramsPoly Int Text Int
81
82 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
83 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
84
85 ngramsTable :: Table NgramsWrite NgramsRead
86 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
87 , ngrams_terms = required "terms"
88 , ngrams_n = required "n"
89 }
90 )
91
92 queryNgramsTable :: Query NgramsRead
93 queryNgramsTable = queryTable ngramsTable
94
95 dbGetNgramsDb :: Cmd err [NgramsDb]
96 dbGetNgramsDb = runOpaQuery queryNgramsTable
97
98 -- | Main Ngrams Types
99 -- | Typed Ngrams
100 -- Typed Ngrams localize the context of the ngrams
101 -- ngrams in source field of document has Sources Type
102 -- ngrams in authors field of document has Authors Type
103 -- ngrams in text (title or abstract) of documents has Terms Type
104 data NgramsType = Authors | Institutes | Sources | NgramsTerms
105 deriving (Eq, Show, Ord, Enum, Bounded, Generic)
106
107 instance FromJSON NgramsType
108 instance FromJSONKey NgramsType where
109 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
110 instance ToJSON NgramsType
111 instance ToJSONKey NgramsType where
112 toJSONKey = toJSONKeyText (pack . show)
113
114 newtype NgramsTypeId = NgramsTypeId Int
115 deriving (Eq, Show, Ord, Num)
116
117 instance ToField NgramsTypeId where
118 toField (NgramsTypeId n) = toField n
119
120 instance FromField NgramsTypeId where
121 fromField fld mdata = do
122 n <- fromField fld mdata
123 if (n :: Int) > 0 then return $ NgramsTypeId n
124 else mzero
125
126 instance QueryRunnerColumnDefault (Nullable PGInt4) NgramsTypeId
127 where
128 queryRunnerColumnDefault = fieldQueryRunnerColumn
129
130 pgNgramsType :: NgramsType -> Column PGInt4
131 pgNgramsType = pgNgramsTypeId . ngramsTypeId
132
133 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
134 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
135
136 ngramsTypeId :: NgramsType -> NgramsTypeId
137 ngramsTypeId Authors = 1
138 ngramsTypeId Institutes = 2
139 ngramsTypeId Sources = 3
140 ngramsTypeId NgramsTerms = 4
141
142 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
143 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
144
145 ------------------------------------------------------------------------
146 -- | TODO put it in Gargantext.Text.Ngrams
147 data Ngrams = Ngrams { _ngramsTerms :: Text
148 , _ngramsSize :: Int
149 } deriving (Generic, Show, Eq, Ord)
150
151 makeLenses ''Ngrams
152 instance PGS.ToRow Ngrams where
153 toRow (Ngrams t s) = [toField t, toField s]
154
155 text2ngrams :: Text -> Ngrams
156 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
157
158 -------------------------------------------------------------------------
159 -- | TODO put it in Gargantext.Text.Ngrams
160 -- Named entity are typed ngrams of Terms Ngrams
161 data NgramsT a =
162 NgramsT { _ngramsType :: NgramsType
163 , _ngramsT :: a
164 } deriving (Generic, Show, Eq, Ord)
165
166 makeLenses ''NgramsT
167
168 instance Functor NgramsT where
169 fmap = over ngramsT
170 -----------------------------------------------------------------------
171 data NgramsIndexed =
172 NgramsIndexed
173 { _ngrams :: Ngrams
174 , _ngramsId :: NgramsId
175 } deriving (Show, Generic, Eq, Ord)
176
177 makeLenses ''NgramsIndexed
178 ------------------------------------------------------------------------
179 data NgramIds =
180 NgramIds
181 { ngramId :: Int
182 , ngramTerms :: Text
183 } deriving (Show, Generic, Eq, Ord)
184
185 instance PGS.FromRow NgramIds where
186 fromRow = NgramIds <$> field <*> field
187
188 ----------------------
189 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
190 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
191
192 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
193 indexNgramsT = fmap . indexNgramsWith . withMap
194
195 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
196 indexNgrams = indexNgramsWith . withMap
197
198 -- NP: not sure we need it anymore
199 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
200 indexNgramsTWith = fmap . indexNgramsWith
201
202 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
203 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
204
205 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
206 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
207 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
208
209 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
210 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
211 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
212 where
213 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
214
215 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
216 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
217 where
218 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
219
220 ----------------------
221 queryInsertNgrams :: PGS.Query
222 queryInsertNgrams = [sql|
223 WITH input_rows(terms,n) AS (?)
224 , ins AS (
225 INSERT INTO ngrams (terms,n)
226 SELECT * FROM input_rows
227 ON CONFLICT (terms) DO NOTHING -- unique index created here
228 RETURNING id,terms
229 )
230
231 SELECT id, terms
232 FROM ins
233 UNION ALL
234 SELECT c.id, terms
235 FROM input_rows
236 JOIN ngrams c USING (terms); -- columns of unique index
237 |]
238
239
240 -- | Ngrams Table
241 -- TODO: the way we are getting main Master Corpus and List ID is not clean
242 -- TODO: if ids are not present -> create
243 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
244 getNgramsTableDb :: NodeType -> NgramsType
245 -> NgramsTableParamUser
246 -> Limit -> Offset
247 -> Cmd err [NgramsTableData]
248 getNgramsTableDb nt ngrt ntp limit_ offset_ = do
249
250
251 maybeRoot <- head <$> getRoot userMaster
252 let path = "Garg.Db.Ngrams.getTableNgrams: "
253 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
254 -- let errMess = panic "Error"
255
256 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
257
258 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
259
260 getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
261
262 data NgramsTableParam =
263 NgramsTableParam { _nt_listId :: NodeId
264 , _nt_corpusId :: NodeId
265 }
266
267 type NgramsTableParamUser = NgramsTableParam
268 type NgramsTableParamMaster = NgramsTableParam
269
270
271 data NgramsTableData = NgramsTableData { _ntd_id :: Int
272 , _ntd_parent_id :: Maybe Int
273 , _ntd_terms :: Text
274 , _ntd_n :: Int
275 , _ntd_listType :: Maybe ListType
276 , _ntd_weight :: Double
277 } deriving (Show)
278
279
280
281 getNgramsTableData :: NodeType -> NgramsType
282 -> NgramsTableParamUser -> NgramsTableParamMaster
283 -> Limit -> Offset
284 -> Cmd err [NgramsTableData]
285 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
286 trace ("Ngrams table params: " <> show params) <$>
287 map (\(i,p,t,n,lt,w) -> NgramsTableData i p t n (fromListTypeId lt) w) <$>
288 runPGSQuery querySelectTableNgramsTrees params
289 where
290 nodeTId = nodeTypeId nodeT
291 ngrmTId = ngramsTypeId ngrmT
292 params = (ul,ml,uc,mc,nodeTId,ngrmTId) :. (limit_, offset_)
293
294 getNgramsTableDataDebug :: PGS.ToRow a => a -> Cmd err ByteString
295 getNgramsTableDataDebug = formatPGSQuery querySelectTableNgramsTrees
296
297
298 querySelectTableNgrams :: PGS.Query
299 querySelectTableNgrams = [sql|
300
301 WITH tableUser AS (
302 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
303 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
304 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
305 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
306 JOIN nodes n ON n.id = corp.node_id
307
308 WHERE list.node_id = ? -- User listId
309 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
310 AND n.typename = ? -- both type of childs (Documents or Contacts)
311 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
312 AND list.parent_id IS NULL
313 )
314 , tableMaster AS (
315 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
316 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
317 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
318 JOIN nodes n ON n.id = corp.node_id
319 JOIN nodes_nodes nn ON nn.node2_id = n.id
320
321 WHERE list.node_id = ? -- Master listId
322 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
323 AND n.typename = ? -- Master childs (Documents or Contacts)
324 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
325 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
326 AND list.parent_id IS NULL
327 )
328
329 SELECT COALESCE(tu.terms,tm.terms) AS terms
330 , COALESCE(tu.n,tm.n) AS n
331 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
332 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
333 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
334 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
335 ORDER BY 1,2
336 LIMIT ?
337 OFFSET ?;
338
339 |]
340
341
342 querySelectTableNgramsTrees :: PGS.Query
343 querySelectTableNgramsTrees = [sql|
344
345 -- DROP FUNCTION tree_start(integer,integer,integer,integer,integer,integer,integer,integer);
346 -- DROP FUNCTION tree_end(integer,integer,integer,integer,integer,integer);
347 -- DROP FUNCTION tree_ngrams(integer,integer,integer,integer,integer,integer,integer,integer);
348
349 CREATE OR REPLACE FUNCTION public.tree_start(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
350 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
351 BEGIN
352 RETURN QUERY
353 WITH tableUser AS (
354 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
355 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
356 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
357 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
358 JOIN nodes n ON n.id = corp.node_id
359
360 WHERE list.node_id = luid -- User listId
361 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
362 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
363 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
364 AND list.parent_id IS NULL
365 ),
366 tableMaster AS (
367 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
368 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
369 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
370 JOIN nodes n ON n.id = corp.node_id
371 JOIN nodes_nodes nn ON nn.node2_id = n.id
372
373 WHERE list.node_id = lmid -- Master listId
374 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
375 AND n.typename = tdoc -- Master childs (Documents or Contacts)
376 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
377 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
378 AND list.parent_id IS NULL
379 )
380
381 SELECT COALESCE(tu.id,tm.id) AS id
382 , COALESCE(tu.parent_id,tm.parent_id) AS parent_id
383 , COALESCE(tu.terms,tm.terms) AS terms
384 , COALESCE(tu.n,tm.n) AS n
385 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
386 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
387 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
388 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
389 ORDER BY 3
390 LIMIT lmt
391 OFFSET ofst
392 ;
393 END $$
394 LANGUAGE plpgsql ;
395
396 CREATE OR REPLACE FUNCTION public.tree_end(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT)
397 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
398 BEGIN
399 RETURN QUERY
400 WITH tableUser2 AS (
401 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
402 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
403 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
404 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
405 JOIN nodes n ON n.id = corp.node_id
406
407 WHERE list.node_id = luid -- User listId
408 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
409 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
410 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
411 )
412 , tableMaster2 AS (
413 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
414 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
415 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
416 JOIN nodes n ON n.id = corp.node_id
417 JOIN nodes_nodes nn ON nn.node2_id = n.id
418
419 WHERE list.node_id = lmid -- Master listId
420 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
421 AND n.typename = tdoc -- Master childs (Documents or Contacts)
422 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
423 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
424 )
425 SELECT COALESCE(tu.id,tm.id) as id
426 , COALESCE(tu.parent_id,tm.parent_id) as parent_id
427 , COALESCE(tu.terms,tm.terms) AS terms
428 , COALESCE(tu.n,tm.n) AS n
429 , COALESCE(tu.list_type,tm.list_type) AS list_type
430 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
431 FROM tableUser2 tu RIGHT JOIN tableMaster2 tm ON tu.terms = tm.terms
432 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
433 ;
434 END $$
435 LANGUAGE plpgsql ;
436
437
438 CREATE OR REPLACE FUNCTION public.tree_ngrams(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
439 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
440 BEGIN
441 RETURN QUERY WITH RECURSIVE
442 ngrams_tree (id,parent_id,terms,n,list_type,weight) AS (
443 SELECT ts.id,ts.parent_id,ts.terms,ts.n,ts.list_type,ts.weight FROM tree_start($1,$2,$3,$4,$5,$6,$7,$8) ts
444 UNION
445 SELECT te.id,te.parent_id,te.terms,te.n,te.list_type,te.weight FROM tree_end($1,$2,$3,$4,$5,$6) as te
446 INNER JOIN ngrams_tree ON te.parent_id = ngrams_tree.id
447 )
448 SELECT * from ngrams_tree;
449 END $$
450 LANGUAGE plpgsql ;
451
452 select * from tree_ngrams(?,?,?,?,?,?,?,?)
453
454 |]
455
456
457
458 type ListIdUser = NodeId
459 type ListIdMaster = NodeId
460
461 type MapToChildren = Map Text (Set Text)
462 type MapToParent = Map Text Text
463
464 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
465 getNgramsGroup lu lm = do
466 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
467 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
468 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
469 pure (mapParent, mapChildren)
470
471 querySelectNgramsGroup :: PGS.Query
472 querySelectNgramsGroup = [sql|
473 WITH groupUser AS (
474 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
475 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
476 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
477 WHERE
478 nnn.node_id = ? -- User listId
479 ),
480 groupMaster AS (
481 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
482 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
483 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
484 WHERE
485 nnn.node_id = ? -- Master listId
486 )
487 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
488 , COALESCE(gu.t2,gm.t2) AS ngram2_id
489 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1
490 |]
491