]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[Ngrams Table] Front and Back implemented (but need next step for optimization and...
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Schema.Ngrams where
27
28
29 import Control.Lens (makeLenses, view)
30 import Control.Monad (mzero)
31 import Data.ByteString.Internal (ByteString)
32 import Data.Map (Map, fromList, lookup, fromListWith)
33 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
34 import Data.Set (Set)
35 import Data.Text (Text, splitOn)
36 import Database.PostgreSQL.Simple ((:.)(..))
37 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
38 import Database.PostgreSQL.Simple.SqlQQ (sql)
39 import Database.PostgreSQL.Simple.ToField (toField, ToField)
40 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
41 import Database.PostgreSQL.Simple.ToRow (toRow)
42 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
43 --import Debug.Trace (trace)
44 import GHC.Generics (Generic)
45 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
46 import Gargantext.Database.Config (nodeTypeId,userMaster)
47 import Gargantext.Database.Root (getRoot)
48 import Gargantext.Database.Types.Node (NodeType)
49 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
50 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
51 import Gargantext.Prelude
52 import Opaleye hiding (FromField)
53 import Prelude (Enum, Bounded, minBound, maxBound)
54 import qualified Data.Set as DS
55 import qualified Database.PostgreSQL.Simple as PGS
56
57 --{-
58 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
59 , ngrams_terms :: terms
60 , ngrams_n :: n
61 } deriving (Show)
62
63 --}
64 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
65 (Column PGText)
66 (Column PGInt4)
67
68 type NgramsRead = NgramsPoly (Column PGInt4)
69 (Column PGText)
70 (Column PGInt4)
71
72 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
73 (Column (Nullable PGText))
74 (Column (Nullable PGInt4))
75
76 --{-
77 type NgramsDb = NgramsPoly Int Text Int
78
79 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
80 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
81
82 ngramsTable :: Table NgramsWrite NgramsRead
83 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
84 , ngrams_terms = required "terms"
85 , ngrams_n = required "n"
86 }
87 )
88 --{-
89 queryNgramsTable :: Query NgramsRead
90 queryNgramsTable = queryTable ngramsTable
91
92 dbGetNgramsDb :: Cmd err [NgramsDb]
93 dbGetNgramsDb = runOpaQuery queryNgramsTable
94 --}
95
96 -- | Main Ngrams Types
97 -- | Typed Ngrams
98 -- Typed Ngrams localize the context of the ngrams
99 -- ngrams in source field of document has Sources Type
100 -- ngrams in authors field of document has Authors Type
101 -- ngrams in text (title or abstract) of documents has Terms Type
102 data NgramsType = Authors | Institutes | Sources | NgramsTerms
103 deriving (Eq, Show, Ord, Enum, Bounded)
104
105 newtype NgramsTypeId = NgramsTypeId Int
106 deriving (Eq, Show, Ord, Num)
107
108 instance ToField NgramsTypeId where
109 toField (NgramsTypeId n) = toField n
110
111 instance FromField NgramsTypeId where
112 fromField fld mdata = do
113 n <- fromField fld mdata
114 if (n :: Int) > 0 then return $ NgramsTypeId n
115 else mzero
116
117 pgNgramsType :: NgramsType -> Column PGInt4
118 pgNgramsType = pgNgramsTypeId . ngramsTypeId
119
120 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
121 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
122
123 ngramsTypeId :: NgramsType -> NgramsTypeId
124 ngramsTypeId Authors = 1
125 ngramsTypeId Institutes = 2
126 ngramsTypeId Sources = 3
127 ngramsTypeId NgramsTerms = 4
128
129 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
130 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
131
132 type NgramsTerms = Text
133 type NgramsId = Int
134 type Size = Int
135
136 ------------------------------------------------------------------------
137 -- | TODO put it in Gargantext.Text.Ngrams
138 data Ngrams = Ngrams { _ngramsTerms :: Text
139 , _ngramsSize :: Int
140 } deriving (Generic, Show, Eq, Ord)
141
142 makeLenses ''Ngrams
143 instance PGS.ToRow Ngrams where
144 toRow (Ngrams t s) = [toField t, toField s]
145
146 text2ngrams :: Text -> Ngrams
147 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
148
149 -------------------------------------------------------------------------
150 -- | TODO put it in Gargantext.Text.Ngrams
151 -- Named entity are typed ngrams of Terms Ngrams
152 data NgramsT a =
153 NgramsT { _ngramsType :: NgramsType
154 , _ngramsT :: a
155 } deriving (Generic, Show, Eq, Ord)
156
157 makeLenses ''NgramsT
158 -----------------------------------------------------------------------
159 data NgramsIndexed =
160 NgramsIndexed
161 { _ngrams :: Ngrams
162 , _ngramsId :: NgramsId
163 } deriving (Show, Generic, Eq, Ord)
164
165 makeLenses ''NgramsIndexed
166 ------------------------------------------------------------------------
167 data NgramIds =
168 NgramIds
169 { ngramId :: Int
170 , ngramTerms :: Text
171 } deriving (Show, Generic, Eq, Ord)
172
173 instance PGS.FromRow NgramIds where
174 fromRow = NgramIds <$> field <*> field
175
176 ----------------------
177 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
178 indexNgramsT m ngrId = indexNgramsTWith f ngrId
179 where
180 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
181
182 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
183 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
184
185 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
186 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
187
188 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
189 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
190 where
191 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
192
193 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
194 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
195 where
196 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
197
198 ----------------------
199 queryInsertNgrams :: PGS.Query
200 queryInsertNgrams = [sql|
201 WITH input_rows(terms,n) AS (?)
202 , ins AS (
203 INSERT INTO ngrams (terms,n)
204 SELECT * FROM input_rows
205 ON CONFLICT (terms) DO NOTHING -- unique index created here
206 RETURNING id,terms
207 )
208
209 SELECT id, terms
210 FROM ins
211 UNION ALL
212 SELECT c.id, terms
213 FROM input_rows
214 JOIN ngrams c USING (terms); -- columns of unique index
215 |]
216
217
218 -- | Ngrams Table
219 -- TODO: the way we are getting main Master Corpus and List ID is not clean
220 -- TODO: if ids are not present -> create
221 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
222 getNgramsTableDb :: NodeType -> NgramsType
223 -> NgramsTableParamUser
224 -> Limit -> Offset
225 -> Cmd err ([NgramsTableData], MapToParent, MapToChildren)
226 getNgramsTableDb nt ngrt ntp@(NgramsTableParam listIdUser _) limit_ offset_ = do
227
228
229 maybeRoot <- head <$> getRoot userMaster
230 let path = "Garg.Db.Ngrams.getTableNgrams: "
231 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
232 -- let errMess = panic "Error"
233
234 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
235
236 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
237
238 ngramsTableData <- getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
239
240 (mapToParent,mapToChildren) <- getNgramsGroup listIdUser listMasterId
241 pure (ngramsTableData, mapToParent,mapToChildren)
242
243
244 data NgramsTableParam =
245 NgramsTableParam { _nt_listId :: NodeId
246 , _nt_corpusId :: NodeId
247 }
248
249 type NgramsTableParamUser = NgramsTableParam
250 type NgramsTableParamMaster = NgramsTableParam
251
252 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
253 , _ntd_n :: Int
254 , _ntd_listType :: Maybe ListType
255 , _ntd_weight :: Double
256 } deriving (Show)
257
258 getNgramsTableData :: NodeType -> NgramsType
259 -> NgramsTableParamUser -> NgramsTableParamMaster
260 -> Limit -> Offset
261 -> Cmd err [NgramsTableData]
262 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
263 -- trace ("Ngrams table params" <> show params) <$>
264 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
265 runPGSQuery querySelectTableNgrams params
266 where
267 nodeTId = nodeTypeId nodeT
268 ngrmTId = ngramsTypeId ngrmT
269 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc) :.
270 (limit_, offset_)
271
272
273
274 querySelectTableNgrams :: PGS.Query
275 querySelectTableNgrams = [sql|
276
277 WITH tableUser AS (
278 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
279 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
280 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
281 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
282 JOIN nodes n ON n.id = corp.node_id
283
284 WHERE list.node_id = ? -- User listId
285 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
286 AND n.typename = ? -- both type of childs (Documents or Contacts)
287 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
288 )
289 , tableMaster AS (
290 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
291 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
292 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
293 JOIN nodes n ON n.id = corp.node_id
294 JOIN nodes_nodes nn ON nn.node2_id = n.id
295
296 WHERE list.node_id = ? -- Master listId
297 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
298 AND n.typename = ? -- Master childs (Documents or Contacts)
299 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
300 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
301 )
302
303 SELECT COALESCE(tu.terms,tm.terms) AS terms
304 , COALESCE(tu.n,tm.n) AS n
305 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
306 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
307 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
308 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
309 ORDER BY 1,2
310 LIMIT ?
311 OFFSET ?;
312
313 |]
314
315 type ListIdUser = NodeId
316 type ListIdMaster = NodeId
317
318 type MapToChildren = Map Text (Set Text)
319 type MapToParent = Map Text Text
320
321 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
322 getNgramsGroup lu lm = do
323 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
324 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
325 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
326 pure (mapParent, mapChildren)
327
328 querySelectNgramsGroup :: PGS.Query
329 querySelectNgramsGroup = [sql|
330 WITH groupUser AS (
331 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
332 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
333 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
334 WHERE
335 nnn.node_id = ? -- User listId
336 ),
337 groupMaster AS (
338 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
339 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
340 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
341 WHERE
342 nnn.node_id = ? -- Master listId
343 )
344 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
345 , COALESCE(gu.t2,gm.t2) AS ngram2_id
346 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1
347 |]
348