]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Ngrams.hs
[PAIRING] pairing (quite tested but not roughly). Need to add list to the pairing...
[gargantext.git] / src / Gargantext / Database / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23
24 module Gargantext.Database.Ngrams where
25
26
27 import Database.PostgreSQL.Simple as DPS (Connection)
28 --import Control.Lens.TH (makeLensesWith, abbreviatedFields)
29 --import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
30 --import Opaleye
31 import Control.Lens (makeLenses, view)
32 import Data.ByteString.Internal (ByteString)
33 import Data.Map (Map, fromList, lookup, fromListWith)
34 import Data.Set (Set)
35 import Data.Text (Text, splitOn)
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField)
39 import Database.PostgreSQL.Simple.ToRow (toRow)
40 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
41 import Debug.Trace (trace)
42 import GHC.Generics (Generic)
43 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
44 import Gargantext.Database.Config (nodeTypeId,userMaster)
45 import Gargantext.Database.Node (mkCmd, Cmd(..),getListsWithParentId, getCorporaWithParentId)
46 import Gargantext.Database.Root (getRoot)
47 import Gargantext.Core.Types (CorpusId)
48 import Gargantext.Database.Types.Node (NodeType)
49 import Gargantext.Prelude
50 import Prelude (Enum, Bounded, minBound, maxBound)
51 import qualified Data.Set as DS
52 import qualified Database.PostgreSQL.Simple as DPS
53
54 {-
55 data NgramPoly id terms n = NgramDb { ngram_id :: id
56 , ngram_terms :: terms
57 , ngram_n :: n
58 } deriving (Show)
59
60 type NgramWrite = NgramPoly (Maybe (Column PGInt4))
61 (Column PGText)
62 (Column PGInt4)
63
64 type NgramRead = NgramPoly (Column PGInt4)
65 (Column PGText)
66 (Column PGInt4)
67
68 --type Ngram = NgramPoly Int Text Int
69
70 $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
71 $(makeLensesWith abbreviatedFields ''NgramPoly)
72
73 ngramTable :: Table NgramWrite NgramRead
74 ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
75 , ngram_terms = required "terms"
76 , ngram_n = required "n"
77 }
78 )
79
80 queryNgramTable :: Query NgramRead
81 queryNgramTable = queryTable ngramTable
82
83 dbGetNgrams :: DPS.Connection -> IO [NgramDb]
84 dbGetNgrams conn = runQuery conn queryNgramTable
85 -}
86
87 -- | Main Ngrams Types
88 -- | Typed Ngrams
89 -- Typed Ngrams localize the context of the ngrams
90 -- ngrams in source field of document has Sources Type
91 -- ngrams in authors field of document has Authors Type
92 -- ngrams in text (title or abstract) of documents has Terms Type
93 data NgramsType = Authors | Institutes | Sources | NgramsTerms
94 deriving (Eq, Show, Ord, Enum, Bounded)
95
96 ngramsTypeId :: NgramsType -> Int
97 ngramsTypeId Authors = 1
98 ngramsTypeId Institutes = 2
99 ngramsTypeId Sources = 3
100 ngramsTypeId NgramsTerms = 4
101
102 fromNgramsTypeId :: Int -> Maybe NgramsType
103 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
104
105 type NgramsTerms = Text
106 type NgramsId = Int
107 type Size = Int
108
109 ------------------------------------------------------------------------
110 -- | TODO put it in Gargantext.Text.Ngrams
111 data Ngrams = Ngrams { _ngramsTerms :: Text
112 , _ngramsSize :: Int
113 } deriving (Generic, Show, Eq, Ord)
114
115 makeLenses ''Ngrams
116 instance DPS.ToRow Ngrams where
117 toRow (Ngrams t s) = [toField t, toField s]
118
119 text2ngrams :: Text -> Ngrams
120 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
121
122 -------------------------------------------------------------------------
123 -- | TODO put it in Gargantext.Text.Ngrams
124 -- Named entity are typed ngrams of Terms Ngrams
125 data NgramsT a =
126 NgramsT { _ngramsType :: NgramsType
127 , _ngramsT :: a
128 } deriving (Generic, Show, Eq, Ord)
129
130 makeLenses ''NgramsT
131 -----------------------------------------------------------------------
132 data NgramsIndexed =
133 NgramsIndexed
134 { _ngrams :: Ngrams
135 , _ngramsId :: NgramsId
136 } deriving (Show, Generic, Eq, Ord)
137
138 makeLenses ''NgramsIndexed
139 ------------------------------------------------------------------------
140 data NgramIds =
141 NgramIds
142 { ngramId :: Int
143 , ngramTerms :: Text
144 } deriving (Show, Generic, Eq, Ord)
145
146 instance DPS.FromRow NgramIds where
147 fromRow = NgramIds <$> field <*> field
148
149 ----------------------
150 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
151 indexNgramsT m ngrId = indexNgramsTWith f ngrId
152 where
153 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
154
155 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
156 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
157
158 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
159 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
160
161 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
162 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
163 where
164 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
165
166 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
167 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
168 where
169 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
170
171 ----------------------
172 queryInsertNgrams :: DPS.Query
173 queryInsertNgrams = [sql|
174 WITH input_rows(terms,n) AS (?)
175 , ins AS (
176 INSERT INTO ngrams (terms,n)
177 SELECT * FROM input_rows
178 ON CONFLICT (terms) DO NOTHING -- unique index created here
179 RETURNING id,terms
180 )
181
182 SELECT id, terms
183 FROM ins
184 UNION ALL
185 SELECT c.id, terms
186 FROM input_rows
187 JOIN ngrams c USING (terms); -- columns of unique index
188 |]
189
190 defaultList :: DPS.Connection -> CorpusId -> IO ListId
191 defaultList c cId = view node_id <$> maybe (panic errMessage) identity
192 <$> head
193 <$> getListsWithParentId c cId
194 where
195 errMessage = "Gargantext.API.Ngrams.defaultList: no list found"
196
197 -- | Ngrams Table
198 -- TODO: the way we are getting main Master Corpus and List ID is not clean
199 -- TODO: if ids are not present -> create
200 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
201 getNgramsTableDb :: DPS.Connection
202 -> NodeType -> NgramsType
203 -> NgramsTableParamUser
204 -> IO ([NgramsTableData], MapToParent, MapToChildren)
205 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) = do
206
207
208 maybeRoot <- head <$> getRoot userMaster c
209 let path = "Garg.Db.Ngrams.getTableNgrams: "
210 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
211 -- let errMess = panic "Error"
212
213 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId c masterRootId
214
215 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId c corpusMasterId
216
217 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId)
218
219 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
220 pure (ngramsTableData, mapToParent,mapToChildren)
221
222
223 data NgramsTableParam =
224 NgramsTableParam { _nt_listId :: Int
225 , _nt_corpusId :: Int
226 }
227
228 type NgramsTableParamUser = NgramsTableParam
229 type NgramsTableParamMaster = NgramsTableParam
230
231 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
232 , _ntd_n :: Int
233 , _ntd_listType :: Maybe ListType
234 , _ntd_weight :: Double
235 } deriving (Show)
236
237 getNgramsTableData :: DPS.Connection
238 -> NodeType -> NgramsType
239 -> NgramsTableParamUser -> NgramsTableParamMaster
240 -> IO [NgramsTableData]
241 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
242 trace ("Ngrams table params" <> show params) <$>
243 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
244 DPS.query conn querySelectTableNgrams params
245 where
246 nodeTId = nodeTypeId nodeT
247 ngrmTId = ngramsTypeId ngrmT
248 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc)
249
250
251
252 querySelectTableNgrams :: DPS.Query
253 querySelectTableNgrams = [sql|
254
255 WITH tableUser AS (
256 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
257 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
258 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
259 JOIN nodes n ON n.id = corp.node_id
260
261 WHERE list.node_id = ? -- User listId
262 AND n.parent_id = ? -- User CorpusId or AnnuaireId
263 AND n.typename = ? -- both type of childs (Documents or Contacts)
264 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
265 )
266 , tableMaster AS (
267 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
268 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
269 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
270 JOIN nodes n ON n.id = corp.node_id
271 JOIN nodes_nodes nn ON nn.node2_id = n.id
272
273 WHERE list.node_id = ? -- Master listId
274 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
275 AND n.typename = ? -- Master childs (Documents or Contacts)
276 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
277 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
278 )
279
280 SELECT COALESCE(tu.terms,tm.terms) AS terms
281 , COALESCE(tu.n,tm.n) AS n
282 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
283 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
284 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
285 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
286
287 |]
288
289 type ListIdUser = Int
290 type ListIdMaster = Int
291
292 type MapToChildren = Map Text (Set Text)
293 type MapToParent = Map Text Text
294
295 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
296 getNgramsGroup conn lu lm = do
297 groups <- getNgramsGroup' conn lu lm
298 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
299 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
300 pure (mapParent, mapChildren)
301
302 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
303 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
304
305 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
306 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
307
308 querySelectNgramsGroup :: DPS.Query
309 querySelectNgramsGroup = [sql|
310 WITH groupUser AS (
311 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
312 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
313 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
314 WHERE
315 nnn.node_id = ? -- User listId
316 ),
317 groupMaster AS (
318 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
319 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
320 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
321 WHERE
322 nnn.node_id = ? -- Master listId
323 )
324 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
325 , COALESCE(gu.t2,gm.t2) AS ngram2_id
326 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1
327 |]
328