]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[IMPORT] Fix build issues
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Schema.Ngrams where
27
28
29 import Control.Lens (makeLenses, view, over)
30 import Control.Monad (mzero)
31 import Data.ByteString.Internal (ByteString)
32 import Data.Map (Map, fromList, lookup, fromListWith)
33 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
34 import Data.Set (Set)
35 import Data.Text (Text, splitOn)
36 import Database.PostgreSQL.Simple ((:.)(..))
37 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
38 import Database.PostgreSQL.Simple.SqlQQ (sql)
39 import Database.PostgreSQL.Simple.ToField (toField, ToField)
40 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
41 import Database.PostgreSQL.Simple.ToRow (toRow)
42 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
43 --import Debug.Trace (trace)
44 import GHC.Generics (Generic)
45 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
46 import Gargantext.Database.Config (nodeTypeId,userMaster)
47 import Gargantext.Database.Root (getRoot)
48 import Gargantext.Database.Types.Node (NodeType)
49 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
50 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
51 import Gargantext.Prelude
52 import Opaleye hiding (FromField)
53 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
54 import qualified Data.Set as DS
55 import qualified Database.PostgreSQL.Simple as PGS
56
57 --{-
58 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
59 , ngrams_terms :: terms
60 , ngrams_n :: n
61 } deriving (Show)
62
63 --}
64 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
65 (Column PGText)
66 (Column PGInt4)
67
68 type NgramsRead = NgramsPoly (Column PGInt4)
69 (Column PGText)
70 (Column PGInt4)
71
72 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
73 (Column (Nullable PGText))
74 (Column (Nullable PGInt4))
75
76 --{-
77 type NgramsDb = NgramsPoly Int Text Int
78
79 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
80 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
81
82 ngramsTable :: Table NgramsWrite NgramsRead
83 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
84 , ngrams_terms = required "terms"
85 , ngrams_n = required "n"
86 }
87 )
88 --{-
89 queryNgramsTable :: Query NgramsRead
90 queryNgramsTable = queryTable ngramsTable
91
92 dbGetNgramsDb :: Cmd err [NgramsDb]
93 dbGetNgramsDb = runOpaQuery queryNgramsTable
94 --}
95
96 -- | Main Ngrams Types
97 -- | Typed Ngrams
98 -- Typed Ngrams localize the context of the ngrams
99 -- ngrams in source field of document has Sources Type
100 -- ngrams in authors field of document has Authors Type
101 -- ngrams in text (title or abstract) of documents has Terms Type
102 data NgramsType = Authors | Institutes | Sources | NgramsTerms
103 deriving (Eq, Show, Ord, Enum, Bounded)
104
105 newtype NgramsTypeId = NgramsTypeId Int
106 deriving (Eq, Show, Ord, Num)
107
108 instance ToField NgramsTypeId where
109 toField (NgramsTypeId n) = toField n
110
111 instance FromField NgramsTypeId where
112 fromField fld mdata = do
113 n <- fromField fld mdata
114 if (n :: Int) > 0 then return $ NgramsTypeId n
115 else mzero
116
117 pgNgramsType :: NgramsType -> Column PGInt4
118 pgNgramsType = pgNgramsTypeId . ngramsTypeId
119
120 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
121 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
122
123 ngramsTypeId :: NgramsType -> NgramsTypeId
124 ngramsTypeId Authors = 1
125 ngramsTypeId Institutes = 2
126 ngramsTypeId Sources = 3
127 ngramsTypeId NgramsTerms = 4
128
129 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
130 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
131
132 type NgramsTerms = Text
133 type NgramsId = Int
134 type Size = Int
135
136 ------------------------------------------------------------------------
137 -- | TODO put it in Gargantext.Text.Ngrams
138 data Ngrams = Ngrams { _ngramsTerms :: Text
139 , _ngramsSize :: Int
140 } deriving (Generic, Show, Eq, Ord)
141
142 makeLenses ''Ngrams
143 instance PGS.ToRow Ngrams where
144 toRow (Ngrams t s) = [toField t, toField s]
145
146 text2ngrams :: Text -> Ngrams
147 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
148
149 -------------------------------------------------------------------------
150 -- | TODO put it in Gargantext.Text.Ngrams
151 -- Named entity are typed ngrams of Terms Ngrams
152 data NgramsT a =
153 NgramsT { _ngramsType :: NgramsType
154 , _ngramsT :: a
155 } deriving (Generic, Show, Eq, Ord)
156
157 makeLenses ''NgramsT
158
159 instance Functor NgramsT where
160 fmap = over ngramsT
161 -----------------------------------------------------------------------
162 data NgramsIndexed =
163 NgramsIndexed
164 { _ngrams :: Ngrams
165 , _ngramsId :: NgramsId
166 } deriving (Show, Generic, Eq, Ord)
167
168 makeLenses ''NgramsIndexed
169 ------------------------------------------------------------------------
170 data NgramIds =
171 NgramIds
172 { ngramId :: Int
173 , ngramTerms :: Text
174 } deriving (Show, Generic, Eq, Ord)
175
176 instance PGS.FromRow NgramIds where
177 fromRow = NgramIds <$> field <*> field
178
179 ----------------------
180 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
181 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
182
183 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
184 indexNgramsT = fmap . indexNgramsWith . withMap
185
186 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
187 indexNgrams = indexNgramsWith . withMap
188
189 -- NP: not sure we need it anymore
190 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
191 indexNgramsTWith = fmap . indexNgramsWith
192
193 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
194 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
195
196 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
197 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
198
199 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
200 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
201 where
202 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
203
204 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
205 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
206 where
207 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
208
209 ----------------------
210 queryInsertNgrams :: PGS.Query
211 queryInsertNgrams = [sql|
212 WITH input_rows(terms,n) AS (?)
213 , ins AS (
214 INSERT INTO ngrams (terms,n)
215 SELECT * FROM input_rows
216 ON CONFLICT (terms) DO NOTHING -- unique index created here
217 RETURNING id,terms
218 )
219
220 SELECT id, terms
221 FROM ins
222 UNION ALL
223 SELECT c.id, terms
224 FROM input_rows
225 JOIN ngrams c USING (terms); -- columns of unique index
226 |]
227
228
229 -- | Ngrams Table
230 -- TODO: the way we are getting main Master Corpus and List ID is not clean
231 -- TODO: if ids are not present -> create
232 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
233 getNgramsTableDb :: NodeType -> NgramsType
234 -> NgramsTableParamUser
235 -> Limit -> Offset
236 -> Cmd err ([NgramsTableData], MapToParent, MapToChildren)
237 getNgramsTableDb nt ngrt ntp@(NgramsTableParam listIdUser _) limit_ offset_ = do
238
239
240 maybeRoot <- head <$> getRoot userMaster
241 let path = "Garg.Db.Ngrams.getTableNgrams: "
242 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
243 -- let errMess = panic "Error"
244
245 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
246
247 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
248
249 ngramsTableData <- getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
250
251 (mapToParent,mapToChildren) <- getNgramsGroup listIdUser listMasterId
252 pure (ngramsTableData, mapToParent,mapToChildren)
253
254
255 data NgramsTableParam =
256 NgramsTableParam { _nt_listId :: NodeId
257 , _nt_corpusId :: NodeId
258 }
259
260 type NgramsTableParamUser = NgramsTableParam
261 type NgramsTableParamMaster = NgramsTableParam
262
263 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
264 , _ntd_n :: Int
265 , _ntd_listType :: Maybe ListType
266 , _ntd_weight :: Double
267 } deriving (Show)
268
269 getNgramsTableData :: NodeType -> NgramsType
270 -> NgramsTableParamUser -> NgramsTableParamMaster
271 -> Limit -> Offset
272 -> Cmd err [NgramsTableData]
273 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
274 -- trace ("Ngrams table params" <> show params) <$>
275 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
276 runPGSQuery querySelectTableNgrams params
277 where
278 nodeTId = nodeTypeId nodeT
279 ngrmTId = ngramsTypeId ngrmT
280 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc) :.
281 (limit_, offset_)
282
283
284
285 querySelectTableNgrams :: PGS.Query
286 querySelectTableNgrams = [sql|
287
288 WITH tableUser AS (
289 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
290 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
291 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
292 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
293 JOIN nodes n ON n.id = corp.node_id
294
295 WHERE list.node_id = ? -- User listId
296 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
297 AND n.typename = ? -- both type of childs (Documents or Contacts)
298 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
299 )
300 , tableMaster AS (
301 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
302 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
303 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
304 JOIN nodes n ON n.id = corp.node_id
305 JOIN nodes_nodes nn ON nn.node2_id = n.id
306
307 WHERE list.node_id = ? -- Master listId
308 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
309 AND n.typename = ? -- Master childs (Documents or Contacts)
310 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
311 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
312 )
313
314 SELECT COALESCE(tu.terms,tm.terms) AS terms
315 , COALESCE(tu.n,tm.n) AS n
316 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
317 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
318 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
319 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
320 ORDER BY 1,2
321 LIMIT ?
322 OFFSET ?;
323
324 |]
325
326 type ListIdUser = NodeId
327 type ListIdMaster = NodeId
328
329 type MapToChildren = Map Text (Set Text)
330 type MapToParent = Map Text Text
331
332 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
333 getNgramsGroup lu lm = do
334 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
335 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
336 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
337 pure (mapParent, mapChildren)
338
339 querySelectNgramsGroup :: PGS.Query
340 querySelectNgramsGroup = [sql|
341 WITH groupUser AS (
342 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
343 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
344 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
345 WHERE
346 nnn.node_id = ? -- User listId
347 ),
348 groupMaster AS (
349 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
350 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
351 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
352 WHERE
353 nnn.node_id = ? -- Master listId
354 )
355 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
356 , COALESCE(gu.t2,gm.t2) AS ngram2_id
357 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1
358 |]
359