]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Ngrams.hs
[DOC] adding comments for todo.
[gargantext.git] / src / Gargantext / Database / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23
24 module Gargantext.Database.Ngrams where
25
26
27 import Database.PostgreSQL.Simple as DPS (Connection)
28 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
29 import Opaleye
30 import Control.Lens (makeLenses, view)
31 import Data.ByteString.Internal (ByteString)
32 import Data.Map (Map, fromList, lookup, fromListWith)
33 import Data.Set (Set)
34 import Data.Text (Text, splitOn)
35 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
36 import Database.PostgreSQL.Simple.SqlQQ (sql)
37 import Database.PostgreSQL.Simple.ToField (toField)
38 import Database.PostgreSQL.Simple.ToRow (toRow)
39 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
40 import Debug.Trace (trace)
41 import GHC.Generics (Generic)
42 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
43 import Gargantext.Database.Config (nodeTypeId,userMaster)
44 import Gargantext.Database.Node (mkCmd, Cmd(..),getListsWithParentId, getCorporaWithParentId)
45 import Gargantext.Database.Root (getRoot)
46 import Gargantext.Core.Types (CorpusId)
47 import Gargantext.Database.Types.Node (NodeType)
48 import Gargantext.Prelude
49 import Prelude (Enum, Bounded, minBound, maxBound)
50 import qualified Data.Set as DS
51 import qualified Database.PostgreSQL.Simple as DPS
52
53 --{-
54 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
55 , ngrams_terms :: terms
56 , ngrams_n :: n
57 } deriving (Show)
58
59 --}
60 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
61 (Column PGText)
62 (Column PGInt4)
63
64 type NgramsRead = NgramsPoly (Column PGInt4)
65 (Column PGText)
66 (Column PGInt4)
67
68 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
69 (Column (Nullable PGText))
70 (Column (Nullable PGInt4))
71
72 --{-
73 type NgramsDb = NgramsPoly Int Text Int
74
75 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
76 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
77
78 ngramsTable :: Table NgramsWrite NgramsRead
79 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
80 , ngrams_terms = required "terms"
81 , ngrams_n = required "n"
82 }
83 )
84 --{-
85 queryNgramsTable :: Query NgramsRead
86 queryNgramsTable = queryTable ngramsTable
87
88 dbGetNgramsDb :: DPS.Connection -> IO [NgramsDb]
89 dbGetNgramsDb conn = runQuery conn queryNgramsTable
90 --}
91
92 -- | Main Ngrams Types
93 -- | Typed Ngrams
94 -- Typed Ngrams localize the context of the ngrams
95 -- ngrams in source field of document has Sources Type
96 -- ngrams in authors field of document has Authors Type
97 -- ngrams in text (title or abstract) of documents has Terms Type
98 data NgramsType = Authors | Institutes | Sources | NgramsTerms
99 deriving (Eq, Show, Ord, Enum, Bounded)
100
101 ngramsTypeId :: NgramsType -> Int
102 ngramsTypeId Authors = 1
103 ngramsTypeId Institutes = 2
104 ngramsTypeId Sources = 3
105 ngramsTypeId NgramsTerms = 4
106
107 fromNgramsTypeId :: Int -> Maybe NgramsType
108 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
109
110 type NgramsTerms = Text
111 type NgramsId = Int
112 type Size = Int
113
114 ------------------------------------------------------------------------
115 -- | TODO put it in Gargantext.Text.Ngrams
116 data Ngrams = Ngrams { _ngramsTerms :: Text
117 , _ngramsSize :: Int
118 } deriving (Generic, Show, Eq, Ord)
119
120 makeLenses ''Ngrams
121 instance DPS.ToRow Ngrams where
122 toRow (Ngrams t s) = [toField t, toField s]
123
124 text2ngrams :: Text -> Ngrams
125 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
126
127 -------------------------------------------------------------------------
128 -- | TODO put it in Gargantext.Text.Ngrams
129 -- Named entity are typed ngrams of Terms Ngrams
130 data NgramsT a =
131 NgramsT { _ngramsType :: NgramsType
132 , _ngramsT :: a
133 } deriving (Generic, Show, Eq, Ord)
134
135 makeLenses ''NgramsT
136 -----------------------------------------------------------------------
137 data NgramsIndexed =
138 NgramsIndexed
139 { _ngrams :: Ngrams
140 , _ngramsId :: NgramsId
141 } deriving (Show, Generic, Eq, Ord)
142
143 makeLenses ''NgramsIndexed
144 ------------------------------------------------------------------------
145 data NgramIds =
146 NgramIds
147 { ngramId :: Int
148 , ngramTerms :: Text
149 } deriving (Show, Generic, Eq, Ord)
150
151 instance DPS.FromRow NgramIds where
152 fromRow = NgramIds <$> field <*> field
153
154 ----------------------
155 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
156 indexNgramsT m ngrId = indexNgramsTWith f ngrId
157 where
158 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
159
160 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
161 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
162
163 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
164 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
165
166 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
167 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
168 where
169 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
170
171 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
172 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
173 where
174 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
175
176 ----------------------
177 queryInsertNgrams :: DPS.Query
178 queryInsertNgrams = [sql|
179 WITH input_rows(terms,n) AS (?)
180 , ins AS (
181 INSERT INTO ngrams (terms,n)
182 SELECT * FROM input_rows
183 ON CONFLICT (terms) DO NOTHING -- unique index created here
184 RETURNING id,terms
185 )
186
187 SELECT id, terms
188 FROM ins
189 UNION ALL
190 SELECT c.id, terms
191 FROM input_rows
192 JOIN ngrams c USING (terms); -- columns of unique index
193 |]
194
195 defaultList :: DPS.Connection -> CorpusId -> IO ListId
196 defaultList c cId = view node_id <$> maybe (panic errMessage) identity
197 <$> head
198 <$> getListsWithParentId c cId
199 where
200 errMessage = "Gargantext.API.Ngrams.defaultList: no list found"
201
202 -- | Ngrams Table
203 -- TODO: the way we are getting main Master Corpus and List ID is not clean
204 -- TODO: if ids are not present -> create
205 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
206 getNgramsTableDb :: DPS.Connection
207 -> NodeType -> NgramsType
208 -> NgramsTableParamUser
209 -> IO ([NgramsTableData], MapToParent, MapToChildren)
210 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) = do
211
212
213 maybeRoot <- head <$> getRoot userMaster c
214 let path = "Garg.Db.Ngrams.getTableNgrams: "
215 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
216 -- let errMess = panic "Error"
217
218 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId c masterRootId
219
220 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId c corpusMasterId
221
222 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId)
223
224 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
225 pure (ngramsTableData, mapToParent,mapToChildren)
226
227
228 data NgramsTableParam =
229 NgramsTableParam { _nt_listId :: Int
230 , _nt_corpusId :: Int
231 }
232
233 type NgramsTableParamUser = NgramsTableParam
234 type NgramsTableParamMaster = NgramsTableParam
235
236 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
237 , _ntd_n :: Int
238 , _ntd_listType :: Maybe ListType
239 , _ntd_weight :: Double
240 } deriving (Show)
241
242 getNgramsTableData :: DPS.Connection
243 -> NodeType -> NgramsType
244 -> NgramsTableParamUser -> NgramsTableParamMaster
245 -> IO [NgramsTableData]
246 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
247 trace ("Ngrams table params" <> show params) <$>
248 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
249 DPS.query conn querySelectTableNgrams params
250 where
251 nodeTId = nodeTypeId nodeT
252 ngrmTId = ngramsTypeId ngrmT
253 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc)
254
255
256
257 querySelectTableNgrams :: DPS.Query
258 querySelectTableNgrams = [sql|
259
260 WITH tableUser AS (
261 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
262 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
263 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
264 JOIN nodes n ON n.id = corp.node_id
265
266 WHERE list.node_id = ? -- User listId
267 AND n.parent_id = ? -- User CorpusId or AnnuaireId
268 AND n.typename = ? -- both type of childs (Documents or Contacts)
269 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
270 )
271 , tableMaster AS (
272 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
273 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
274 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
275 JOIN nodes n ON n.id = corp.node_id
276 JOIN nodes_nodes nn ON nn.node2_id = n.id
277
278 WHERE list.node_id = ? -- Master listId
279 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
280 AND n.typename = ? -- Master childs (Documents or Contacts)
281 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
282 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
283 )
284
285 SELECT COALESCE(tu.terms,tm.terms) AS terms
286 , COALESCE(tu.n,tm.n) AS n
287 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
288 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
289 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
290 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
291
292 |]
293
294 type ListIdUser = Int
295 type ListIdMaster = Int
296
297 type MapToChildren = Map Text (Set Text)
298 type MapToParent = Map Text Text
299
300 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
301 getNgramsGroup conn lu lm = do
302 groups <- getNgramsGroup' conn lu lm
303 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
304 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
305 pure (mapParent, mapChildren)
306
307 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
308 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
309
310 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
311 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
312
313 querySelectNgramsGroup :: DPS.Query
314 querySelectNgramsGroup = [sql|
315 WITH groupUser AS (
316 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
317 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
318 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
319 WHERE
320 nnn.node_id = ? -- User listId
321 ),
322 groupMaster AS (
323 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
324 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
325 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
326 WHERE
327 nnn.node_id = ? -- Master listId
328 )
329 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
330 , COALESCE(gu.t2,gm.t2) AS ngram2_id
331 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1
332 |]
333