]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[NgramsTable] WIP adding NgramsTypeId newtype.
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Schema.Ngrams where
27
28
29 import Control.Lens (makeLenses, view)
30 import Control.Monad (mzero)
31 import Data.ByteString.Internal (ByteString)
32 import Data.Map (Map, fromList, lookup, fromListWith)
33 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
34 import Data.Set (Set)
35 import Data.Text (Text, splitOn)
36 import Database.PostgreSQL.Simple ((:.)(..))
37 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
38 import Database.PostgreSQL.Simple.SqlQQ (sql)
39 import Database.PostgreSQL.Simple.ToField (toField, ToField)
40 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
41 import Database.PostgreSQL.Simple.ToRow (toRow)
42 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
43 --import Debug.Trace (trace)
44 import GHC.Generics (Generic)
45 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
46 import Gargantext.Database.Config (nodeTypeId,userMaster)
47 import Gargantext.Database.Root (getRoot)
48 import Gargantext.Database.Types.Node (NodeType)
49 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
50 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
51 import Gargantext.Prelude
52 import Opaleye hiding (FromField)
53 import Prelude (Enum, Bounded, minBound, maxBound)
54 import qualified Data.Set as DS
55 import qualified Database.PostgreSQL.Simple as PGS
56
57 --{-
58 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
59 , ngrams_terms :: terms
60 , ngrams_n :: n
61 } deriving (Show)
62
63 --}
64 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
65 (Column PGText)
66 (Column PGInt4)
67
68 type NgramsRead = NgramsPoly (Column PGInt4)
69 (Column PGText)
70 (Column PGInt4)
71
72 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
73 (Column (Nullable PGText))
74 (Column (Nullable PGInt4))
75
76 --{-
77 type NgramsDb = NgramsPoly Int Text Int
78
79 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
80 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
81
82 ngramsTable :: Table NgramsWrite NgramsRead
83 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
84 , ngrams_terms = required "terms"
85 , ngrams_n = required "n"
86 }
87 )
88 --{-
89 queryNgramsTable :: Query NgramsRead
90 queryNgramsTable = queryTable ngramsTable
91
92 dbGetNgramsDb :: Cmd err [NgramsDb]
93 dbGetNgramsDb = runOpaQuery queryNgramsTable
94 --}
95
96 -- | Main Ngrams Types
97 -- | Typed Ngrams
98 -- Typed Ngrams localize the context of the ngrams
99 -- ngrams in source field of document has Sources Type
100 -- ngrams in authors field of document has Authors Type
101 -- ngrams in text (title or abstract) of documents has Terms Type
102 data NgramsType = Authors | Institutes | Sources | NgramsTerms
103 deriving (Eq, Show, Ord, Enum, Bounded)
104
105 newtype NgramsTypeId = NgramsTypeId Int
106 deriving (Eq, Show, Ord, Num)
107
108 instance ToField NgramsTypeId where
109 toField (NgramsTypeId n) = toField n
110
111 instance FromField NgramsTypeId where
112 fromField field mdata = do
113 n <- fromField field mdata
114 if (n :: Int) > 0 then return $ NgramsTypeId n
115 else mzero
116
117 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
118 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
119
120
121 ngramsTypeId :: NgramsType -> NgramsTypeId
122 ngramsTypeId Authors = 1
123 ngramsTypeId Institutes = 2
124 ngramsTypeId Sources = 3
125 ngramsTypeId NgramsTerms = 4
126
127 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
128 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
129
130 type NgramsTerms = Text
131 type NgramsId = Int
132 type Size = Int
133
134 ------------------------------------------------------------------------
135 -- | TODO put it in Gargantext.Text.Ngrams
136 data Ngrams = Ngrams { _ngramsTerms :: Text
137 , _ngramsSize :: Int
138 } deriving (Generic, Show, Eq, Ord)
139
140 makeLenses ''Ngrams
141 instance PGS.ToRow Ngrams where
142 toRow (Ngrams t s) = [toField t, toField s]
143
144 text2ngrams :: Text -> Ngrams
145 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
146
147 -------------------------------------------------------------------------
148 -- | TODO put it in Gargantext.Text.Ngrams
149 -- Named entity are typed ngrams of Terms Ngrams
150 data NgramsT a =
151 NgramsT { _ngramsType :: NgramsType
152 , _ngramsT :: a
153 } deriving (Generic, Show, Eq, Ord)
154
155 makeLenses ''NgramsT
156 -----------------------------------------------------------------------
157 data NgramsIndexed =
158 NgramsIndexed
159 { _ngrams :: Ngrams
160 , _ngramsId :: NgramsId
161 } deriving (Show, Generic, Eq, Ord)
162
163 makeLenses ''NgramsIndexed
164 ------------------------------------------------------------------------
165 data NgramIds =
166 NgramIds
167 { ngramId :: Int
168 , ngramTerms :: Text
169 } deriving (Show, Generic, Eq, Ord)
170
171 instance PGS.FromRow NgramIds where
172 fromRow = NgramIds <$> field <*> field
173
174 ----------------------
175 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
176 indexNgramsT m ngrId = indexNgramsTWith f ngrId
177 where
178 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
179
180 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
181 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
182
183 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
184 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
185
186 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
187 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
188 where
189 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
190
191 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
192 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
193 where
194 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
195
196 ----------------------
197 queryInsertNgrams :: PGS.Query
198 queryInsertNgrams = [sql|
199 WITH input_rows(terms,n) AS (?)
200 , ins AS (
201 INSERT INTO ngrams (terms,n)
202 SELECT * FROM input_rows
203 ON CONFLICT (terms) DO NOTHING -- unique index created here
204 RETURNING id,terms
205 )
206
207 SELECT id, terms
208 FROM ins
209 UNION ALL
210 SELECT c.id, terms
211 FROM input_rows
212 JOIN ngrams c USING (terms); -- columns of unique index
213 |]
214
215
216 -- | Ngrams Table
217 -- TODO: the way we are getting main Master Corpus and List ID is not clean
218 -- TODO: if ids are not present -> create
219 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
220 getNgramsTableDb :: NodeType -> NgramsType
221 -> NgramsTableParamUser
222 -> Limit -> Offset
223 -> Cmd err ([NgramsTableData], MapToParent, MapToChildren)
224 getNgramsTableDb nt ngrt ntp@(NgramsTableParam listIdUser _) limit_ offset_ = do
225
226
227 maybeRoot <- head <$> getRoot userMaster
228 let path = "Garg.Db.Ngrams.getTableNgrams: "
229 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
230 -- let errMess = panic "Error"
231
232 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
233
234 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
235
236 ngramsTableData <- getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
237
238 (mapToParent,mapToChildren) <- getNgramsGroup listIdUser listMasterId
239 pure (ngramsTableData, mapToParent,mapToChildren)
240
241
242 data NgramsTableParam =
243 NgramsTableParam { _nt_listId :: NodeId
244 , _nt_corpusId :: NodeId
245 }
246
247 type NgramsTableParamUser = NgramsTableParam
248 type NgramsTableParamMaster = NgramsTableParam
249
250 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
251 , _ntd_n :: Int
252 , _ntd_listType :: Maybe ListType
253 , _ntd_weight :: Double
254 } deriving (Show)
255
256 getNgramsTableData :: NodeType -> NgramsType
257 -> NgramsTableParamUser -> NgramsTableParamMaster
258 -> Limit -> Offset
259 -> Cmd err [NgramsTableData]
260 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
261 -- trace ("Ngrams table params" <> show params) <$>
262 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
263 runPGSQuery querySelectTableNgrams params
264 where
265 nodeTId = nodeTypeId nodeT
266 ngrmTId = ngramsTypeId ngrmT
267 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc) :.
268 (limit_, offset_)
269
270
271
272 querySelectTableNgrams :: PGS.Query
273 querySelectTableNgrams = [sql|
274
275 WITH tableUser AS (
276 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
277 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
278 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
279 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
280 JOIN nodes n ON n.id = corp.node_id
281
282 WHERE list.node_id = ? -- User listId
283 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
284 AND n.typename = ? -- both type of childs (Documents or Contacts)
285 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
286 )
287 , tableMaster AS (
288 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
289 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
290 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
291 JOIN nodes n ON n.id = corp.node_id
292 JOIN nodes_nodes nn ON nn.node2_id = n.id
293
294 WHERE list.node_id = ? -- Master listId
295 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
296 AND n.typename = ? -- Master childs (Documents or Contacts)
297 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
298 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
299 )
300
301 SELECT COALESCE(tu.terms,tm.terms) AS terms
302 , COALESCE(tu.n,tm.n) AS n
303 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
304 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
305 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
306 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
307 ORDER BY 1,2
308 LIMIT ?
309 OFFSET ?;
310
311 |]
312
313 type ListIdUser = NodeId
314 type ListIdMaster = NodeId
315
316 type MapToChildren = Map Text (Set Text)
317 type MapToParent = Map Text Text
318
319 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
320 getNgramsGroup lu lm = do
321 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
322 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
323 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
324 pure (mapParent, mapChildren)
325
326 querySelectNgramsGroup :: PGS.Query
327 querySelectNgramsGroup = [sql|
328 WITH groupUser AS (
329 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
330 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
331 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
332 WHERE
333 nnn.node_id = ? -- User listId
334 ),
335 groupMaster AS (
336 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
337 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
338 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
339 WHERE
340 nnn.node_id = ? -- Master listId
341 )
342 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
343 , COALESCE(gu.t2,gm.t2) AS ngram2_id
344 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1
345 |]
346