]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
Add the weightedLogJaccard function
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Schema.Ngrams where
27
28 import Data.Aeson (FromJSON, FromJSONKey)
29 import Control.Lens (makeLenses, view, over)
30 import Control.Monad (mzero)
31 import Data.Aeson
32 import Data.ByteString.Internal (ByteString)
33 import Data.Map (Map, fromList, lookup, fromListWith)
34 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
35 import Data.Set (Set)
36 import Data.Text (Text, splitOn)
37 import Database.PostgreSQL.Simple ((:.)(..))
38 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
39 import Database.PostgreSQL.Simple.SqlQQ (sql)
40 import Database.PostgreSQL.Simple.ToField (toField, ToField)
41 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
42 import Database.PostgreSQL.Simple.ToRow (toRow)
43 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
44 import Debug.Trace (trace)
45 import GHC.Generics (Generic)
46 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
47 import Gargantext.Database.Config (nodeTypeId,userMaster)
48 import Gargantext.Database.Root (getRoot)
49 import Gargantext.Database.Types.Node (NodeType)
50 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
51 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
52 import Gargantext.Prelude
53 import Opaleye hiding (FromField)
54 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
55 import qualified Data.Set as DS
56 import qualified Database.PostgreSQL.Simple as PGS
57
58
59 type NgramsTerms = Text
60 type NgramsId = Int
61 type Size = Int
62
63 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
64 , ngrams_terms :: terms
65 , ngrams_n :: n
66 } deriving (Show)
67
68 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
69 (Column PGText)
70 (Column PGInt4)
71
72 type NgramsRead = NgramsPoly (Column PGInt4)
73 (Column PGText)
74 (Column PGInt4)
75
76 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
77 (Column (Nullable PGText))
78 (Column (Nullable PGInt4))
79
80 type NgramsDb = NgramsPoly Int Text Int
81
82 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
83 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
84
85 ngramsTable :: Table NgramsWrite NgramsRead
86 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
87 , ngrams_terms = required "terms"
88 , ngrams_n = required "n"
89 }
90 )
91
92 queryNgramsTable :: Query NgramsRead
93 queryNgramsTable = queryTable ngramsTable
94
95 dbGetNgramsDb :: Cmd err [NgramsDb]
96 dbGetNgramsDb = runOpaQuery queryNgramsTable
97
98 -- | Main Ngrams Types
99 -- | Typed Ngrams
100 -- Typed Ngrams localize the context of the ngrams
101 -- ngrams in source field of document has Sources Type
102 -- ngrams in authors field of document has Authors Type
103 -- ngrams in text (title or abstract) of documents has Terms Type
104 data NgramsType = Authors | Institutes | Sources | NgramsTerms
105 deriving (Eq, Show, Ord, Enum, Bounded, Generic)
106
107 instance FromJSON NgramsType
108 instance FromJSONKey NgramsType
109 instance ToJSON NgramsType
110 instance ToJSONKey NgramsType
111
112 newtype NgramsTypeId = NgramsTypeId Int
113 deriving (Eq, Show, Ord, Num)
114
115 instance ToField NgramsTypeId where
116 toField (NgramsTypeId n) = toField n
117
118 instance FromField NgramsTypeId where
119 fromField fld mdata = do
120 n <- fromField fld mdata
121 if (n :: Int) > 0 then return $ NgramsTypeId n
122 else mzero
123
124 pgNgramsType :: NgramsType -> Column PGInt4
125 pgNgramsType = pgNgramsTypeId . ngramsTypeId
126
127 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
128 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
129
130 ngramsTypeId :: NgramsType -> NgramsTypeId
131 ngramsTypeId Authors = 1
132 ngramsTypeId Institutes = 2
133 ngramsTypeId Sources = 3
134 ngramsTypeId NgramsTerms = 4
135
136 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
137 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
138
139 ------------------------------------------------------------------------
140 -- | TODO put it in Gargantext.Text.Ngrams
141 data Ngrams = Ngrams { _ngramsTerms :: Text
142 , _ngramsSize :: Int
143 } deriving (Generic, Show, Eq, Ord)
144
145 makeLenses ''Ngrams
146 instance PGS.ToRow Ngrams where
147 toRow (Ngrams t s) = [toField t, toField s]
148
149 text2ngrams :: Text -> Ngrams
150 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
151
152 -------------------------------------------------------------------------
153 -- | TODO put it in Gargantext.Text.Ngrams
154 -- Named entity are typed ngrams of Terms Ngrams
155 data NgramsT a =
156 NgramsT { _ngramsType :: NgramsType
157 , _ngramsT :: a
158 } deriving (Generic, Show, Eq, Ord)
159
160 makeLenses ''NgramsT
161
162 instance Functor NgramsT where
163 fmap = over ngramsT
164 -----------------------------------------------------------------------
165 data NgramsIndexed =
166 NgramsIndexed
167 { _ngrams :: Ngrams
168 , _ngramsId :: NgramsId
169 } deriving (Show, Generic, Eq, Ord)
170
171 makeLenses ''NgramsIndexed
172 ------------------------------------------------------------------------
173 data NgramIds =
174 NgramIds
175 { ngramId :: Int
176 , ngramTerms :: Text
177 } deriving (Show, Generic, Eq, Ord)
178
179 instance PGS.FromRow NgramIds where
180 fromRow = NgramIds <$> field <*> field
181
182 ----------------------
183 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
184 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
185
186 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
187 indexNgramsT = fmap . indexNgramsWith . withMap
188
189 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
190 indexNgrams = indexNgramsWith . withMap
191
192 -- NP: not sure we need it anymore
193 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
194 indexNgramsTWith = fmap . indexNgramsWith
195
196 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
197 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
198
199 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
200 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
201 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
202
203 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
204 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
205 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
206 where
207 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
208
209 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
210 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
211 where
212 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
213
214 ----------------------
215 queryInsertNgrams :: PGS.Query
216 queryInsertNgrams = [sql|
217 WITH input_rows(terms,n) AS (?)
218 , ins AS (
219 INSERT INTO ngrams (terms,n)
220 SELECT * FROM input_rows
221 ON CONFLICT (terms) DO NOTHING -- unique index created here
222 RETURNING id,terms
223 )
224
225 SELECT id, terms
226 FROM ins
227 UNION ALL
228 SELECT c.id, terms
229 FROM input_rows
230 JOIN ngrams c USING (terms); -- columns of unique index
231 |]
232
233
234 -- | Ngrams Table
235 -- TODO: the way we are getting main Master Corpus and List ID is not clean
236 -- TODO: if ids are not present -> create
237 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
238 getNgramsTableDb :: NodeType -> NgramsType
239 -> NgramsTableParamUser
240 -> Limit -> Offset
241 -> Cmd err [NgramsTableData]
242 getNgramsTableDb nt ngrt ntp limit_ offset_ = do
243
244
245 maybeRoot <- head <$> getRoot userMaster
246 let path = "Garg.Db.Ngrams.getTableNgrams: "
247 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
248 -- let errMess = panic "Error"
249
250 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
251
252 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
253
254 getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
255
256 data NgramsTableParam =
257 NgramsTableParam { _nt_listId :: NodeId
258 , _nt_corpusId :: NodeId
259 }
260
261 type NgramsTableParamUser = NgramsTableParam
262 type NgramsTableParamMaster = NgramsTableParam
263
264
265 data NgramsTableData = NgramsTableData { _ntd_id :: Int
266 , _ntd_parent_id :: Maybe Int
267 , _ntd_terms :: Text
268 , _ntd_n :: Int
269 , _ntd_listType :: Maybe ListType
270 , _ntd_weight :: Double
271 } deriving (Show)
272
273
274
275 getNgramsTableData :: NodeType -> NgramsType
276 -> NgramsTableParamUser -> NgramsTableParamMaster
277 -> Limit -> Offset
278 -> Cmd err [NgramsTableData]
279 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
280 trace ("Ngrams table params: " <> show params) <$>
281 map (\(i,p,t,n,lt,w) -> NgramsTableData i p t n (fromListTypeId lt) w) <$>
282 runPGSQuery querySelectTableNgramsTrees params
283 where
284 nodeTId = nodeTypeId nodeT
285 ngrmTId = ngramsTypeId ngrmT
286 params = (ul,ml,uc,mc,nodeTId,ngrmTId) :. (limit_, offset_)
287
288 getNgramsTableDataDebug :: PGS.ToRow a => a -> Cmd err ByteString
289 getNgramsTableDataDebug = formatPGSQuery querySelectTableNgramsTrees
290
291
292 querySelectTableNgrams :: PGS.Query
293 querySelectTableNgrams = [sql|
294
295 WITH tableUser AS (
296 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
297 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
298 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
299 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
300 JOIN nodes n ON n.id = corp.node_id
301
302 WHERE list.node_id = ? -- User listId
303 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
304 AND n.typename = ? -- both type of childs (Documents or Contacts)
305 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
306 AND list.parent_id IS NULL
307 )
308 , tableMaster AS (
309 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
310 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
311 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
312 JOIN nodes n ON n.id = corp.node_id
313 JOIN nodes_nodes nn ON nn.node2_id = n.id
314
315 WHERE list.node_id = ? -- Master listId
316 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
317 AND n.typename = ? -- Master childs (Documents or Contacts)
318 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
319 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
320 AND list.parent_id IS NULL
321 )
322
323 SELECT COALESCE(tu.terms,tm.terms) AS terms
324 , COALESCE(tu.n,tm.n) AS n
325 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
326 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
327 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
328 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
329 ORDER BY 1,2
330 LIMIT ?
331 OFFSET ?;
332
333 |]
334
335
336 querySelectTableNgramsTrees :: PGS.Query
337 querySelectTableNgramsTrees = [sql|
338
339 -- DROP FUNCTION tree_start(integer,integer,integer,integer,integer,integer,integer,integer);
340 -- DROP FUNCTION tree_end(integer,integer,integer,integer,integer,integer);
341 -- DROP FUNCTION tree_ngrams(integer,integer,integer,integer,integer,integer,integer,integer);
342
343 CREATE OR REPLACE FUNCTION public.tree_start(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
344 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
345 BEGIN
346 RETURN QUERY
347 WITH tableUser AS (
348 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
349 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
350 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
351 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
352 JOIN nodes n ON n.id = corp.node_id
353
354 WHERE list.node_id = luid -- User listId
355 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
356 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
357 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
358 AND list.parent_id IS NULL
359 ),
360 tableMaster AS (
361 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
362 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
363 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
364 JOIN nodes n ON n.id = corp.node_id
365 JOIN nodes_nodes nn ON nn.node2_id = n.id
366
367 WHERE list.node_id = lmid -- Master listId
368 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
369 AND n.typename = tdoc -- Master childs (Documents or Contacts)
370 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
371 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
372 AND list.parent_id IS NULL
373 )
374
375 SELECT COALESCE(tu.id,tm.id) AS id
376 , COALESCE(tu.parent_id,tm.parent_id) AS parent_id
377 , COALESCE(tu.terms,tm.terms) AS terms
378 , COALESCE(tu.n,tm.n) AS n
379 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
380 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
381 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
382 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
383 ORDER BY 3
384 LIMIT lmt
385 OFFSET ofst
386 ;
387 END $$
388 LANGUAGE plpgsql ;
389
390 CREATE OR REPLACE FUNCTION public.tree_end(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT)
391 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
392 BEGIN
393 RETURN QUERY
394 WITH tableUser2 AS (
395 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
396 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
397 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
398 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
399 JOIN nodes n ON n.id = corp.node_id
400
401 WHERE list.node_id = luid -- User listId
402 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
403 AND n.typename = tdoc -- both type of childs (Documents or Contacts)
404 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms or...)
405 )
406 , tableMaster2 AS (
407 SELECT list.id, list.parent_id, ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
408 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
409 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
410 JOIN nodes n ON n.id = corp.node_id
411 JOIN nodes_nodes nn ON nn.node2_id = n.id
412
413 WHERE list.node_id = lmid -- Master listId
414 AND n.parent_id = cmid -- Master CorpusId or AnnuaireId
415 AND n.typename = tdoc -- Master childs (Documents or Contacts)
416 AND corp.ngrams_type = tngrams -- both type of ngrams (Authors or Terms1)
417 AND nn.node1_id = cuid -- User CorpusId or AnnuaireId
418 )
419 SELECT COALESCE(tu.id,tm.id) as id
420 , COALESCE(tu.parent_id,tm.parent_id) as parent_id
421 , COALESCE(tu.terms,tm.terms) AS terms
422 , COALESCE(tu.n,tm.n) AS n
423 , COALESCE(tu.list_type,tm.list_type) AS list_type
424 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
425 FROM tableUser2 tu RIGHT JOIN tableMaster2 tm ON tu.terms = tm.terms
426 GROUP BY tu.id,tm.id,tu.parent_id,tm.parent_id,tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
427 ;
428 END $$
429 LANGUAGE plpgsql ;
430
431
432 CREATE OR REPLACE FUNCTION public.tree_ngrams(luid INT, lmid INT,cuid INT, cmid INT, tdoc INT, tngrams INT, lmt INT, ofst INT)
433 RETURNS TABLE (id INT, parent_id INT, terms VARCHAR(255), n int, list_type int, weight float8) AS $$
434 BEGIN
435 RETURN QUERY WITH RECURSIVE
436 ngrams_tree (id,parent_id,terms,n,list_type,weight) AS (
437 SELECT ts.id,ts.parent_id,ts.terms,ts.n,ts.list_type,ts.weight FROM tree_start($1,$2,$3,$4,$5,$6,$7,$8) ts
438 UNION
439 SELECT te.id,te.parent_id,te.terms,te.n,te.list_type,te.weight FROM tree_end($1,$2,$3,$4,$5,$6) as te
440 INNER JOIN ngrams_tree ON te.parent_id = ngrams_tree.id
441 )
442 SELECT * from ngrams_tree;
443 END $$
444 LANGUAGE plpgsql ;
445
446 select * from tree_ngrams(?,?,?,?,?,?,?,?)
447
448 |]
449
450
451
452 type ListIdUser = NodeId
453 type ListIdMaster = NodeId
454
455 type MapToChildren = Map Text (Set Text)
456 type MapToParent = Map Text Text
457
458 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
459 getNgramsGroup lu lm = do
460 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
461 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
462 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
463 pure (mapParent, mapChildren)
464
465 querySelectNgramsGroup :: PGS.Query
466 querySelectNgramsGroup = [sql|
467 WITH groupUser AS (
468 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
469 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
470 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
471 WHERE
472 nnn.node_id = ? -- User listId
473 ),
474 groupMaster AS (
475 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
476 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
477 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
478 WHERE
479 nnn.node_id = ? -- Master listId
480 )
481 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
482 , COALESCE(gu.t2,gm.t2) AS ngram2_id
483 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1
484 |]
485