]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Ngrams.hs
[Children][TextQueries]
[gargantext.git] / src / Gargantext / Database / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23
24 module Gargantext.Database.Ngrams where
25
26 import Control.Lens (makeLenses, view)
27 import Data.ByteString.Internal (ByteString)
28 import Data.Map (Map, fromList, lookup, fromListWith)
29 import Data.Set (Set)
30 import Data.Text (Text, splitOn)
31 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
32 import Database.PostgreSQL.Simple.SqlQQ (sql)
33 import Database.PostgreSQL.Simple.ToField (toField)
34 import Database.PostgreSQL.Simple.ToRow (toRow)
35 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
36 import Debug.Trace (trace)
37 import GHC.Generics (Generic)
38 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
39 import Gargantext.Database.Config (nodeTypeId,userMaster)
40 import Gargantext.Database.Node (mkCmd, Cmd(..),getListsWithParentId, getCorporaWithParentId)
41 import Gargantext.Database.Root (getRoot)
42 import Gargantext.Core.Types (CorpusId)
43 import Gargantext.Database.Types.Node (NodeType)
44 import Gargantext.Prelude
45 import Prelude (Enum, Bounded, minBound, maxBound)
46 import qualified Data.Set as DS
47 import qualified Database.PostgreSQL.Simple as DPS
48
49 --data NgramPoly id terms n = NgramDb { ngram_id :: id
50 -- , ngram_terms :: terms
51 -- , ngram_n :: n
52 -- } deriving (Show)
53 --
54 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
55 -- (Column PGText)
56 -- (Column PGInt4)
57 --
58 --type NgramRead = NgramPoly (Column PGInt4)
59 -- (Column PGText)
60 -- (Column PGInt4)
61 --
62 ----type Ngram = NgramPoly Int Text Int
63 --
64 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
65 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
66 --
67 --ngramTable :: Table NgramWrite NgramRead
68 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
69 -- , ngram_terms = required "terms"
70 -- , ngram_n = required "n"
71 -- }
72 -- )
73 --
74 --queryNgramTable :: Query NgramRead
75 --queryNgramTable = queryTable ngramTable
76 --
77 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
78 --dbGetNgrams conn = runQuery conn queryNgramTable
79
80 -- | Main Ngrams Types
81 -- | Typed Ngrams
82 -- Typed Ngrams localize the context of the ngrams
83 -- ngrams in source field of document has Sources Type
84 -- ngrams in authors field of document has Authors Type
85 -- ngrams in text (title or abstract) of documents has Terms Type
86 data NgramsType = Authors | Institutes | Sources | NgramsTerms
87 deriving (Eq, Show, Ord, Enum, Bounded)
88
89 ngramsTypeId :: NgramsType -> Int
90 ngramsTypeId Authors = 1
91 ngramsTypeId Institutes = 2
92 ngramsTypeId Sources = 3
93 ngramsTypeId NgramsTerms = 4
94
95 fromNgramsTypeId :: Int -> Maybe NgramsType
96 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
97
98 type NgramsTerms = Text
99 type NgramsId = Int
100 type Size = Int
101
102 ------------------------------------------------------------------------
103 -- | TODO put it in Gargantext.Text.Ngrams
104 data Ngrams = Ngrams { _ngramsTerms :: Text
105 , _ngramsSize :: Int
106 } deriving (Generic, Show, Eq, Ord)
107
108 makeLenses ''Ngrams
109 instance DPS.ToRow Ngrams where
110 toRow (Ngrams t s) = [toField t, toField s]
111
112 text2ngrams :: Text -> Ngrams
113 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
114
115 -------------------------------------------------------------------------
116 -- | TODO put it in Gargantext.Text.Ngrams
117 -- Named entity are typed ngrams of Terms Ngrams
118 data NgramsT a =
119 NgramsT { _ngramsType :: NgramsType
120 , _ngramsT :: a
121 } deriving (Generic, Show, Eq, Ord)
122
123 makeLenses ''NgramsT
124 -----------------------------------------------------------------------
125 data NgramsIndexed =
126 NgramsIndexed
127 { _ngrams :: Ngrams
128 , _ngramsId :: NgramsId
129 } deriving (Show, Generic, Eq, Ord)
130
131 makeLenses ''NgramsIndexed
132 ------------------------------------------------------------------------
133 data NgramIds =
134 NgramIds
135 { ngramId :: Int
136 , ngramTerms :: Text
137 } deriving (Show, Generic, Eq, Ord)
138
139 instance DPS.FromRow NgramIds where
140 fromRow = NgramIds <$> field <*> field
141
142 ----------------------
143 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
144 indexNgramsT m ngrId = indexNgramsTWith f ngrId
145 where
146 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
147
148 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
149 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
150
151 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
152 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
153
154 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
155 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
156 where
157 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
158
159 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
160 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
161 where
162 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
163
164 ----------------------
165 queryInsertNgrams :: DPS.Query
166 queryInsertNgrams = [sql|
167 WITH input_rows(terms,n) AS (?)
168 , ins AS (
169 INSERT INTO ngrams (terms,n)
170 SELECT * FROM input_rows
171 ON CONFLICT (terms) DO NOTHING -- unique index created here
172 RETURNING id,terms
173 )
174
175 SELECT id, terms
176 FROM ins
177 UNION ALL
178 SELECT c.id, terms
179 FROM input_rows
180 JOIN ngrams c USING (terms); -- columns of unique index
181 |]
182
183 defaultList :: DPS.Connection -> CorpusId -> IO ListId
184 defaultList c cId = view node_id <$> maybe (panic errMessage) identity
185 <$> head
186 <$> getListsWithParentId c cId
187 where
188 errMessage = "Gargantext.API.Ngrams.defaultList: no list found"
189
190 -- | Ngrams Table
191 -- TODO: the way we are getting main Master Corpus and List ID is not clean
192 -- TODO: if ids are not present -> create
193 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
194 getNgramsTableDb :: DPS.Connection
195 -> NodeType -> NgramsType
196 -> NgramsTableParamUser
197 -> IO ([NgramsTableData], MapToParent, MapToChildren)
198 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) = do
199
200
201 maybeRoot <- head <$> getRoot userMaster c
202 let path = "Garg.Db.Ngrams.getTableNgrams: "
203 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
204 -- let errMess = panic "Error"
205
206 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId c masterRootId
207
208 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId c corpusMasterId
209
210 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId)
211
212 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
213 pure (ngramsTableData, mapToParent,mapToChildren)
214
215
216 data NgramsTableParam =
217 NgramsTableParam { _nt_listId :: Int
218 , _nt_corpusId :: Int
219 }
220
221 type NgramsTableParamUser = NgramsTableParam
222 type NgramsTableParamMaster = NgramsTableParam
223
224 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
225 , _ntd_n :: Int
226 , _ntd_listType :: Maybe ListType
227 , _ntd_weight :: Double
228 } deriving (Show)
229
230 getNgramsTableData :: DPS.Connection
231 -> NodeType -> NgramsType
232 -> NgramsTableParamUser -> NgramsTableParamMaster
233 -> IO [NgramsTableData]
234 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
235 trace ("Ngrams table params" <> show params) <$>
236 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
237 DPS.query conn querySelectTableNgrams params
238 where
239 nodeTId = nodeTypeId nodeT
240 ngrmTId = ngramsTypeId ngrmT
241 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc)
242
243
244
245 querySelectTableNgrams :: DPS.Query
246 querySelectTableNgrams = [sql|
247
248 WITH tableUser AS (
249 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
250 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
251 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
252 JOIN nodes n ON n.id = corp.node_id
253
254 WHERE list.node_id = ? -- User listId
255 AND n.parent_id = ? -- User CorpusId or AnnuaireId
256 AND n.typename = ? -- both type of childs (Documents or Contacts)
257 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
258 )
259 , tableMaster AS (
260 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
261 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
262 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
263 JOIN nodes n ON n.id = corp.node_id
264 JOIN nodes_nodes nn ON nn.node2_id = n.id
265
266 WHERE list.node_id = ? -- Master listId
267 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
268 AND n.typename = ? -- Master childs (Documents or Contacts)
269 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
270 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
271 )
272
273 SELECT COALESCE(tu.terms,tm.terms) AS terms
274 , COALESCE(tu.n,tm.n) AS n
275 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
276 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
277 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
278 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
279
280 |]
281
282 type ListIdUser = Int
283 type ListIdMaster = Int
284
285 type MapToChildren = Map Text (Set Text)
286 type MapToParent = Map Text Text
287
288 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
289 getNgramsGroup conn lu lm = do
290 groups <- getNgramsGroup' conn lu lm
291 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
292 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
293 pure (mapParent, mapChildren)
294
295 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
296 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
297
298 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
299 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
300
301 querySelectNgramsGroup :: DPS.Query
302 querySelectNgramsGroup = [sql|
303 WITH groupUser AS (
304 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
305 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
306 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
307 WHERE
308 nnn.node_id = ? -- User listId
309 ),
310 groupMaster AS (
311 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
312 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
313 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
314 WHERE
315 nnn.node_id = ? -- Master listId
316 )
317 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
318 , COALESCE(gu.t2,gm.t2) AS ngram2_id
319 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1
320 |]
321