]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[Database] Clean duplicate functions
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23
24 module Gargantext.Database.Schema.Ngrams where
25
26
27 import Control.Lens (makeLenses, view)
28 import Data.ByteString.Internal (ByteString)
29 import Data.Map (Map, fromList, lookup, fromListWith)
30 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
31 import Data.Set (Set)
32 import Data.Text (Text, splitOn)
33 import Database.PostgreSQL.Simple as DPS (Connection)
34 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
35 import Database.PostgreSQL.Simple.SqlQQ (sql)
36 import Database.PostgreSQL.Simple.ToField (toField)
37 import Database.PostgreSQL.Simple.ToRow (toRow)
38 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
39 import Debug.Trace (trace)
40 import GHC.Generics (Generic)
41 import Gargantext.Core.Types (CorpusId)
42 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
43 import Gargantext.Database.Config (nodeTypeId,userMaster)
44 import Gargantext.Database.Root (getRoot)
45 import Gargantext.Database.Types.Node (NodeType)
46 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
47 import Gargantext.Database.Utils (mkCmd, Cmd(..))
48 import Gargantext.Prelude
49 import Opaleye
50 import Prelude (Enum, Bounded, minBound, maxBound)
51 import qualified Data.Set as DS
52 import qualified Database.PostgreSQL.Simple as DPS
53
54 --{-
55 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
56 , ngrams_terms :: terms
57 , ngrams_n :: n
58 } deriving (Show)
59
60 --}
61 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
62 (Column PGText)
63 (Column PGInt4)
64
65 type NgramsRead = NgramsPoly (Column PGInt4)
66 (Column PGText)
67 (Column PGInt4)
68
69 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
70 (Column (Nullable PGText))
71 (Column (Nullable PGInt4))
72
73 --{-
74 type NgramsDb = NgramsPoly Int Text Int
75
76 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
77 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
78
79 ngramsTable :: Table NgramsWrite NgramsRead
80 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
81 , ngrams_terms = required "terms"
82 , ngrams_n = required "n"
83 }
84 )
85 --{-
86 queryNgramsTable :: Query NgramsRead
87 queryNgramsTable = queryTable ngramsTable
88
89 dbGetNgramsDb :: DPS.Connection -> IO [NgramsDb]
90 dbGetNgramsDb conn = runQuery conn queryNgramsTable
91 --}
92
93 -- | Main Ngrams Types
94 -- | Typed Ngrams
95 -- Typed Ngrams localize the context of the ngrams
96 -- ngrams in source field of document has Sources Type
97 -- ngrams in authors field of document has Authors Type
98 -- ngrams in text (title or abstract) of documents has Terms Type
99 data NgramsType = Authors | Institutes | Sources | NgramsTerms
100 deriving (Eq, Show, Ord, Enum, Bounded)
101
102 ngramsTypeId :: NgramsType -> Int
103 ngramsTypeId Authors = 1
104 ngramsTypeId Institutes = 2
105 ngramsTypeId Sources = 3
106 ngramsTypeId NgramsTerms = 4
107
108 fromNgramsTypeId :: Int -> Maybe NgramsType
109 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
110
111 type NgramsTerms = Text
112 type NgramsId = Int
113 type Size = Int
114
115 ------------------------------------------------------------------------
116 -- | TODO put it in Gargantext.Text.Ngrams
117 data Ngrams = Ngrams { _ngramsTerms :: Text
118 , _ngramsSize :: Int
119 } deriving (Generic, Show, Eq, Ord)
120
121 makeLenses ''Ngrams
122 instance DPS.ToRow Ngrams where
123 toRow (Ngrams t s) = [toField t, toField s]
124
125 text2ngrams :: Text -> Ngrams
126 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
127
128 -------------------------------------------------------------------------
129 -- | TODO put it in Gargantext.Text.Ngrams
130 -- Named entity are typed ngrams of Terms Ngrams
131 data NgramsT a =
132 NgramsT { _ngramsType :: NgramsType
133 , _ngramsT :: a
134 } deriving (Generic, Show, Eq, Ord)
135
136 makeLenses ''NgramsT
137 -----------------------------------------------------------------------
138 data NgramsIndexed =
139 NgramsIndexed
140 { _ngrams :: Ngrams
141 , _ngramsId :: NgramsId
142 } deriving (Show, Generic, Eq, Ord)
143
144 makeLenses ''NgramsIndexed
145 ------------------------------------------------------------------------
146 data NgramIds =
147 NgramIds
148 { ngramId :: Int
149 , ngramTerms :: Text
150 } deriving (Show, Generic, Eq, Ord)
151
152 instance DPS.FromRow NgramIds where
153 fromRow = NgramIds <$> field <*> field
154
155 ----------------------
156 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
157 indexNgramsT m ngrId = indexNgramsTWith f ngrId
158 where
159 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
160
161 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
162 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
163
164 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
165 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
166
167 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
168 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
169 where
170 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
171
172 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
173 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
174 where
175 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
176
177 ----------------------
178 queryInsertNgrams :: DPS.Query
179 queryInsertNgrams = [sql|
180 WITH input_rows(terms,n) AS (?)
181 , ins AS (
182 INSERT INTO ngrams (terms,n)
183 SELECT * FROM input_rows
184 ON CONFLICT (terms) DO NOTHING -- unique index created here
185 RETURNING id,terms
186 )
187
188 SELECT id, terms
189 FROM ins
190 UNION ALL
191 SELECT c.id, terms
192 FROM input_rows
193 JOIN ngrams c USING (terms); -- columns of unique index
194 |]
195
196 defaultList :: DPS.Connection -> CorpusId -> IO ListId
197 defaultList c cId = view node_id <$> maybe (panic errMessage) identity
198 <$> head
199 <$> getListsWithParentId c cId
200 where
201 errMessage = "Gargantext.API.Ngrams.defaultList: no list found"
202
203 -- | Ngrams Table
204 -- TODO: the way we are getting main Master Corpus and List ID is not clean
205 -- TODO: if ids are not present -> create
206 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
207 getNgramsTableDb :: DPS.Connection
208 -> NodeType -> NgramsType
209 -> NgramsTableParamUser
210 -> IO ([NgramsTableData], MapToParent, MapToChildren)
211 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) = do
212
213
214 maybeRoot <- head <$> getRoot userMaster c
215 let path = "Garg.Db.Ngrams.getTableNgrams: "
216 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
217 -- let errMess = panic "Error"
218
219 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId c masterRootId
220
221 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId c corpusMasterId
222
223 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId)
224
225 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
226 pure (ngramsTableData, mapToParent,mapToChildren)
227
228
229 data NgramsTableParam =
230 NgramsTableParam { _nt_listId :: Int
231 , _nt_corpusId :: Int
232 }
233
234 type NgramsTableParamUser = NgramsTableParam
235 type NgramsTableParamMaster = NgramsTableParam
236
237 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
238 , _ntd_n :: Int
239 , _ntd_listType :: Maybe ListType
240 , _ntd_weight :: Double
241 } deriving (Show)
242
243 getNgramsTableData :: DPS.Connection
244 -> NodeType -> NgramsType
245 -> NgramsTableParamUser -> NgramsTableParamMaster
246 -> IO [NgramsTableData]
247 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
248 trace ("Ngrams table params" <> show params) <$>
249 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
250 DPS.query conn querySelectTableNgrams params
251 where
252 nodeTId = nodeTypeId nodeT
253 ngrmTId = ngramsTypeId ngrmT
254 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc)
255
256
257
258 querySelectTableNgrams :: DPS.Query
259 querySelectTableNgrams = [sql|
260
261 WITH tableUser AS (
262 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
263 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
264 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
265 JOIN nodes n ON n.id = corp.node_id
266
267 WHERE list.node_id = ? -- User listId
268 AND n.parent_id = ? -- User CorpusId or AnnuaireId
269 AND n.typename = ? -- both type of childs (Documents or Contacts)
270 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
271 )
272 , tableMaster AS (
273 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
274 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
275 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
276 JOIN nodes n ON n.id = corp.node_id
277 JOIN nodes_nodes nn ON nn.node2_id = n.id
278
279 WHERE list.node_id = ? -- Master listId
280 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
281 AND n.typename = ? -- Master childs (Documents or Contacts)
282 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
283 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
284 )
285
286 SELECT COALESCE(tu.terms,tm.terms) AS terms
287 , COALESCE(tu.n,tm.n) AS n
288 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
289 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
290 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
291 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
292
293 |]
294
295 type ListIdUser = Int
296 type ListIdMaster = Int
297
298 type MapToChildren = Map Text (Set Text)
299 type MapToParent = Map Text Text
300
301 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
302 getNgramsGroup conn lu lm = do
303 groups <- getNgramsGroup' conn lu lm
304 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
305 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
306 pure (mapParent, mapChildren)
307
308 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
309 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
310
311 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
312 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
313
314 querySelectNgramsGroup :: DPS.Query
315 querySelectNgramsGroup = [sql|
316 WITH groupUser AS (
317 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
318 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
319 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
320 WHERE
321 nnn.node_id = ? -- User listId
322 ),
323 groupMaster AS (
324 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
325 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
326 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
327 WHERE
328 nnn.node_id = ? -- Master listId
329 )
330 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
331 , COALESCE(gu.t2,gm.t2) AS ngram2_id
332 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1
333 |]
334