]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[FLOW][Database] ngrams extraction and insertion.
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23
24 module Gargantext.Database.Schema.Ngrams where
25
26
27 import Control.Lens (makeLenses, view)
28 import Data.ByteString.Internal (ByteString)
29 import Data.Map (Map, fromList, lookup, fromListWith)
30 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
31 import Data.Set (Set)
32 import Data.Text (Text, splitOn)
33 import Database.PostgreSQL.Simple ((:.)(..))
34 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
35 import Database.PostgreSQL.Simple.SqlQQ (sql)
36 import Database.PostgreSQL.Simple.ToField (toField)
37 import Database.PostgreSQL.Simple.ToRow (toRow)
38 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
39 import Debug.Trace (trace)
40 import GHC.Generics (Generic)
41 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
42 import Gargantext.Database.Config (nodeTypeId,userMaster)
43 import Gargantext.Database.Root (getRoot)
44 import Gargantext.Database.Types.Node (NodeType)
45 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
46 import Gargantext.Database.Utils (mkCmd, Cmd(..))
47 import Gargantext.Prelude
48 import Opaleye
49 import Prelude (Enum, Bounded, minBound, maxBound)
50 import qualified Data.Set as DS
51 import qualified Database.PostgreSQL.Simple as DPS
52
53 --{-
54 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
55 , ngrams_terms :: terms
56 , ngrams_n :: n
57 } deriving (Show)
58
59 --}
60 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
61 (Column PGText)
62 (Column PGInt4)
63
64 type NgramsRead = NgramsPoly (Column PGInt4)
65 (Column PGText)
66 (Column PGInt4)
67
68 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
69 (Column (Nullable PGText))
70 (Column (Nullable PGInt4))
71
72 --{-
73 type NgramsDb = NgramsPoly Int Text Int
74
75 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
76 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
77
78 ngramsTable :: Table NgramsWrite NgramsRead
79 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
80 , ngrams_terms = required "terms"
81 , ngrams_n = required "n"
82 }
83 )
84 --{-
85 queryNgramsTable :: Query NgramsRead
86 queryNgramsTable = queryTable ngramsTable
87
88 dbGetNgramsDb :: DPS.Connection -> IO [NgramsDb]
89 dbGetNgramsDb conn = runQuery conn queryNgramsTable
90 --}
91
92 -- | Main Ngrams Types
93 -- | Typed Ngrams
94 -- Typed Ngrams localize the context of the ngrams
95 -- ngrams in source field of document has Sources Type
96 -- ngrams in authors field of document has Authors Type
97 -- ngrams in text (title or abstract) of documents has Terms Type
98 data NgramsType = Authors | Institutes | Sources | NgramsTerms
99 deriving (Eq, Show, Ord, Enum, Bounded)
100
101 ngramsTypeId :: NgramsType -> Int
102 ngramsTypeId Authors = 1
103 ngramsTypeId Institutes = 2
104 ngramsTypeId Sources = 3
105 ngramsTypeId NgramsTerms = 4
106
107 fromNgramsTypeId :: Int -> Maybe NgramsType
108 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
109
110 type NgramsTerms = Text
111 type NgramsId = Int
112 type Size = Int
113
114 ------------------------------------------------------------------------
115 -- | TODO put it in Gargantext.Text.Ngrams
116 data Ngrams = Ngrams { _ngramsTerms :: Text
117 , _ngramsSize :: Int
118 } deriving (Generic, Show, Eq, Ord)
119
120 makeLenses ''Ngrams
121 instance DPS.ToRow Ngrams where
122 toRow (Ngrams t s) = [toField t, toField s]
123
124 text2ngrams :: Text -> Ngrams
125 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
126
127 -------------------------------------------------------------------------
128 -- | TODO put it in Gargantext.Text.Ngrams
129 -- Named entity are typed ngrams of Terms Ngrams
130 data NgramsT a =
131 NgramsT { _ngramsType :: NgramsType
132 , _ngramsT :: a
133 } deriving (Generic, Show, Eq, Ord)
134
135 makeLenses ''NgramsT
136 -----------------------------------------------------------------------
137 data NgramsIndexed =
138 NgramsIndexed
139 { _ngrams :: Ngrams
140 , _ngramsId :: NgramsId
141 } deriving (Show, Generic, Eq, Ord)
142
143 makeLenses ''NgramsIndexed
144 ------------------------------------------------------------------------
145 data NgramIds =
146 NgramIds
147 { ngramId :: Int
148 , ngramTerms :: Text
149 } deriving (Show, Generic, Eq, Ord)
150
151 instance DPS.FromRow NgramIds where
152 fromRow = NgramIds <$> field <*> field
153
154 ----------------------
155 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
156 indexNgramsT m ngrId = indexNgramsTWith f ngrId
157 where
158 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
159
160 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
161 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
162
163 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
164 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
165
166 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
167 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
168 where
169 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
170
171 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
172 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
173 where
174 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
175
176 ----------------------
177 queryInsertNgrams :: DPS.Query
178 queryInsertNgrams = [sql|
179 WITH input_rows(terms,n) AS (?)
180 , ins AS (
181 INSERT INTO ngrams (terms,n)
182 SELECT * FROM input_rows
183 ON CONFLICT (terms) DO NOTHING -- unique index created here
184 RETURNING id,terms
185 )
186
187 SELECT id, terms
188 FROM ins
189 UNION ALL
190 SELECT c.id, terms
191 FROM input_rows
192 JOIN ngrams c USING (terms); -- columns of unique index
193 |]
194
195
196 -- | Ngrams Table
197 -- TODO: the way we are getting main Master Corpus and List ID is not clean
198 -- TODO: if ids are not present -> create
199 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
200 getNgramsTableDb :: DPS.Connection
201 -> NodeType -> NgramsType
202 -> NgramsTableParamUser
203 -> Limit -> Offset
204 -> IO ([NgramsTableData], MapToParent, MapToChildren)
205 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) limit_ offset_ = do
206
207
208 maybeRoot <- head <$> getRoot userMaster c
209 let path = "Garg.Db.Ngrams.getTableNgrams: "
210 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
211 -- let errMess = panic "Error"
212
213 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId c masterRootId
214
215 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId c corpusMasterId
216
217 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
218
219 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
220 pure (ngramsTableData, mapToParent,mapToChildren)
221
222
223 data NgramsTableParam =
224 NgramsTableParam { _nt_listId :: Int
225 , _nt_corpusId :: Int
226 }
227
228 type NgramsTableParamUser = NgramsTableParam
229 type NgramsTableParamMaster = NgramsTableParam
230
231 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
232 , _ntd_n :: Int
233 , _ntd_listType :: Maybe ListType
234 , _ntd_weight :: Double
235 } deriving (Show)
236
237 getNgramsTableData :: DPS.Connection
238 -> NodeType -> NgramsType
239 -> NgramsTableParamUser -> NgramsTableParamMaster
240 -> Limit -> Offset
241 -> IO [NgramsTableData]
242 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
243 trace ("Ngrams table params" <> show params) <$>
244 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
245 DPS.query conn querySelectTableNgrams params
246 where
247 nodeTId = nodeTypeId nodeT
248 ngrmTId = ngramsTypeId ngrmT
249 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc) :.
250 (limit_, offset_)
251
252
253
254 querySelectTableNgrams :: DPS.Query
255 querySelectTableNgrams = [sql|
256
257 WITH tableUser AS (
258 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
259 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
260 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
261 JOIN nodes n ON n.id = corp.node_id
262
263 WHERE list.node_id = ? -- User listId
264 AND n.parent_id = ? -- User CorpusId or AnnuaireId
265 AND n.typename = ? -- both type of childs (Documents or Contacts)
266 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
267 )
268 , tableMaster AS (
269 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
270 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
271 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
272 JOIN nodes n ON n.id = corp.node_id
273 JOIN nodes_nodes nn ON nn.node2_id = n.id
274
275 WHERE list.node_id = ? -- Master listId
276 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
277 AND n.typename = ? -- Master childs (Documents or Contacts)
278 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
279 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
280 )
281
282 SELECT COALESCE(tu.terms,tm.terms) AS terms
283 , COALESCE(tu.n,tm.n) AS n
284 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
285 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
286 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
287 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type
288 LIMIT ?
289 OFFSET ?;
290
291 |]
292
293 type ListIdUser = Int
294 type ListIdMaster = Int
295
296 type MapToChildren = Map Text (Set Text)
297 type MapToParent = Map Text Text
298
299 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
300 getNgramsGroup conn lu lm = do
301 groups <- getNgramsGroup' conn lu lm
302 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
303 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
304 pure (mapParent, mapChildren)
305
306 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
307 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
308
309 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
310 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
311
312 querySelectNgramsGroup :: DPS.Query
313 querySelectNgramsGroup = [sql|
314 WITH groupUser AS (
315 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
316 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
317 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
318 WHERE
319 nnn.node_id = ? -- User listId
320 ),
321 groupMaster AS (
322 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
323 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
324 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
325 WHERE
326 nnn.node_id = ? -- Master listId
327 )
328 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
329 , COALESCE(gu.t2,gm.t2) AS ngram2_id
330 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1
331 |]
332