]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[Text][Parsers][Isidore] acceptable query for next step (typing the result).
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# LANGUAGE TemplateHaskell #-}
24
25 module Gargantext.Database.Schema.Ngrams where
26
27
28 import Control.Lens (makeLenses, view)
29 import Data.ByteString.Internal (ByteString)
30 import Data.Map (Map, fromList, lookup, fromListWith)
31 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
32 import Data.Set (Set)
33 import Data.Text (Text, splitOn)
34 import Database.PostgreSQL.Simple ((:.)(..))
35 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
36 import Database.PostgreSQL.Simple.SqlQQ (sql)
37 import Database.PostgreSQL.Simple.ToField (toField)
38 import Database.PostgreSQL.Simple.ToRow (toRow)
39 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
40 import Debug.Trace (trace)
41 import GHC.Generics (Generic)
42 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
43 import Gargantext.Database.Config (nodeTypeId,userMaster)
44 import Gargantext.Database.Root (getRoot)
45 import Gargantext.Database.Types.Node (NodeType)
46 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
47 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
48 import Gargantext.Prelude
49 import Opaleye
50 import Prelude (Enum, Bounded, minBound, maxBound)
51 import qualified Data.Set as DS
52 import qualified Database.PostgreSQL.Simple as PGS
53
54 --{-
55 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
56 , ngrams_terms :: terms
57 , ngrams_n :: n
58 } deriving (Show)
59
60 --}
61 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
62 (Column PGText)
63 (Column PGInt4)
64
65 type NgramsRead = NgramsPoly (Column PGInt4)
66 (Column PGText)
67 (Column PGInt4)
68
69 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
70 (Column (Nullable PGText))
71 (Column (Nullable PGInt4))
72
73 --{-
74 type NgramsDb = NgramsPoly Int Text Int
75
76 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
77 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
78
79 ngramsTable :: Table NgramsWrite NgramsRead
80 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
81 , ngrams_terms = required "terms"
82 , ngrams_n = required "n"
83 }
84 )
85 --{-
86 queryNgramsTable :: Query NgramsRead
87 queryNgramsTable = queryTable ngramsTable
88
89 dbGetNgramsDb :: Cmd err [NgramsDb]
90 dbGetNgramsDb = runOpaQuery queryNgramsTable
91 --}
92
93 -- | Main Ngrams Types
94 -- | Typed Ngrams
95 -- Typed Ngrams localize the context of the ngrams
96 -- ngrams in source field of document has Sources Type
97 -- ngrams in authors field of document has Authors Type
98 -- ngrams in text (title or abstract) of documents has Terms Type
99 data NgramsType = Authors | Institutes | Sources | NgramsTerms
100 deriving (Eq, Show, Ord, Enum, Bounded)
101
102 ngramsTypeId :: NgramsType -> Int
103 ngramsTypeId Authors = 1
104 ngramsTypeId Institutes = 2
105 ngramsTypeId Sources = 3
106 ngramsTypeId NgramsTerms = 4
107
108 fromNgramsTypeId :: Int -> Maybe NgramsType
109 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
110
111 type NgramsTerms = Text
112 type NgramsId = Int
113 type Size = Int
114
115 ------------------------------------------------------------------------
116 -- | TODO put it in Gargantext.Text.Ngrams
117 data Ngrams = Ngrams { _ngramsTerms :: Text
118 , _ngramsSize :: Int
119 } deriving (Generic, Show, Eq, Ord)
120
121 makeLenses ''Ngrams
122 instance PGS.ToRow Ngrams where
123 toRow (Ngrams t s) = [toField t, toField s]
124
125 text2ngrams :: Text -> Ngrams
126 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
127
128 -------------------------------------------------------------------------
129 -- | TODO put it in Gargantext.Text.Ngrams
130 -- Named entity are typed ngrams of Terms Ngrams
131 data NgramsT a =
132 NgramsT { _ngramsType :: NgramsType
133 , _ngramsT :: a
134 } deriving (Generic, Show, Eq, Ord)
135
136 makeLenses ''NgramsT
137 -----------------------------------------------------------------------
138 data NgramsIndexed =
139 NgramsIndexed
140 { _ngrams :: Ngrams
141 , _ngramsId :: NgramsId
142 } deriving (Show, Generic, Eq, Ord)
143
144 makeLenses ''NgramsIndexed
145 ------------------------------------------------------------------------
146 data NgramIds =
147 NgramIds
148 { ngramId :: Int
149 , ngramTerms :: Text
150 } deriving (Show, Generic, Eq, Ord)
151
152 instance PGS.FromRow NgramIds where
153 fromRow = NgramIds <$> field <*> field
154
155 ----------------------
156 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
157 indexNgramsT m ngrId = indexNgramsTWith f ngrId
158 where
159 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
160
161 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
162 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
163
164 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
165 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
166
167 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
168 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
169 where
170 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
171
172 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
173 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
174 where
175 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
176
177 ----------------------
178 queryInsertNgrams :: PGS.Query
179 queryInsertNgrams = [sql|
180 WITH input_rows(terms,n) AS (?)
181 , ins AS (
182 INSERT INTO ngrams (terms,n)
183 SELECT * FROM input_rows
184 ON CONFLICT (terms) DO NOTHING -- unique index created here
185 RETURNING id,terms
186 )
187
188 SELECT id, terms
189 FROM ins
190 UNION ALL
191 SELECT c.id, terms
192 FROM input_rows
193 JOIN ngrams c USING (terms); -- columns of unique index
194 |]
195
196
197 -- | Ngrams Table
198 -- TODO: the way we are getting main Master Corpus and List ID is not clean
199 -- TODO: if ids are not present -> create
200 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
201 getNgramsTableDb :: NodeType -> NgramsType
202 -> NgramsTableParamUser
203 -> Limit -> Offset
204 -> Cmd err ([NgramsTableData], MapToParent, MapToChildren)
205 getNgramsTableDb nt ngrt ntp@(NgramsTableParam listIdUser _) limit_ offset_ = do
206
207
208 maybeRoot <- head <$> getRoot userMaster
209 let path = "Garg.Db.Ngrams.getTableNgrams: "
210 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
211 -- let errMess = panic "Error"
212
213 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
214
215 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
216
217 ngramsTableData <- getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
218
219 (mapToParent,mapToChildren) <- getNgramsGroup listIdUser listMasterId
220 pure (ngramsTableData, mapToParent,mapToChildren)
221
222
223 data NgramsTableParam =
224 NgramsTableParam { _nt_listId :: Int
225 , _nt_corpusId :: Int
226 }
227
228 type NgramsTableParamUser = NgramsTableParam
229 type NgramsTableParamMaster = NgramsTableParam
230
231 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
232 , _ntd_n :: Int
233 , _ntd_listType :: Maybe ListType
234 , _ntd_weight :: Double
235 } deriving (Show)
236
237 getNgramsTableData :: NodeType -> NgramsType
238 -> NgramsTableParamUser -> NgramsTableParamMaster
239 -> Limit -> Offset
240 -> Cmd err [NgramsTableData]
241 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
242 trace ("Ngrams table params" <> show params) <$>
243 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
244 runPGSQuery querySelectTableNgrams params
245 where
246 nodeTId = nodeTypeId nodeT
247 ngrmTId = ngramsTypeId ngrmT
248 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc) :.
249 (limit_, offset_)
250
251
252
253 querySelectTableNgrams :: PGS.Query
254 querySelectTableNgrams = [sql|
255
256 WITH tableUser AS (
257 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
258 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
259 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
260 JOIN nodes n ON n.id = corp.node_id
261
262 WHERE list.node_id = ? -- User listId
263 AND n.parent_id = ? -- User CorpusId or AnnuaireId
264 AND n.typename = ? -- both type of childs (Documents or Contacts)
265 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
266 )
267 , tableMaster AS (
268 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
269 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
270 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
271 JOIN nodes n ON n.id = corp.node_id
272 JOIN nodes_nodes nn ON nn.node2_id = n.id
273
274 WHERE list.node_id = ? -- Master listId
275 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
276 AND n.typename = ? -- Master childs (Documents or Contacts)
277 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
278 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
279 )
280
281 SELECT COALESCE(tu.terms,tm.terms) AS terms
282 , COALESCE(tu.n,tm.n) AS n
283 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
284 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
285 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
286 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type
287 LIMIT ?
288 OFFSET ?;
289
290 |]
291
292 type ListIdUser = Int
293 type ListIdMaster = Int
294
295 type MapToChildren = Map Text (Set Text)
296 type MapToParent = Map Text Text
297
298 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
299 getNgramsGroup lu lm = do
300 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
301 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
302 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
303 pure (mapParent, mapChildren)
304
305 querySelectNgramsGroup :: PGS.Query
306 querySelectNgramsGroup = [sql|
307 WITH groupUser AS (
308 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
309 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
310 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
311 WHERE
312 nnn.node_id = ? -- User listId
313 ),
314 groupMaster AS (
315 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
316 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
317 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
318 WHERE
319 nnn.node_id = ? -- Master listId
320 )
321 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
322 , COALESCE(gu.t2,gm.t2) AS ngram2_id
323 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1
324 |]
325