2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
24 module Gargantext.Database.Schema.Ngrams where
27 import Database.PostgreSQL.Simple as DPS (Connection)
28 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
30 import Control.Lens (makeLenses, view)
31 import Data.ByteString.Internal (ByteString)
32 import Data.Map (Map, fromList, lookup, fromListWith)
34 import Data.Text (Text, splitOn)
35 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
36 import Database.PostgreSQL.Simple.SqlQQ (sql)
37 import Database.PostgreSQL.Simple.ToField (toField)
38 import Database.PostgreSQL.Simple.ToRow (toRow)
39 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
40 import Debug.Trace (trace)
41 import GHC.Generics (Generic)
42 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
43 import Gargantext.Database.Config (nodeTypeId,userMaster)
44 import Gargantext.Database.Schema.Node (mkCmd, Cmd(..),getListsWithParentId, getCorporaWithParentId)
45 import Gargantext.Database.Root (getRoot)
46 import Gargantext.Core.Types (CorpusId)
47 import Gargantext.Database.Types.Node (NodeType)
48 import Gargantext.Prelude
49 import Prelude (Enum, Bounded, minBound, maxBound)
50 import qualified Data.Set as DS
51 import qualified Database.PostgreSQL.Simple as DPS
54 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
55 , ngrams_terms :: terms
60 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
64 type NgramsRead = NgramsPoly (Column PGInt4)
68 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
69 (Column (Nullable PGText))
70 (Column (Nullable PGInt4))
73 type NgramsDb = NgramsPoly Int Text Int
75 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
76 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
78 ngramsTable :: Table NgramsWrite NgramsRead
79 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
80 , ngrams_terms = required "terms"
81 , ngrams_n = required "n"
85 queryNgramsTable :: Query NgramsRead
86 queryNgramsTable = queryTable ngramsTable
88 dbGetNgramsDb :: DPS.Connection -> IO [NgramsDb]
89 dbGetNgramsDb conn = runQuery conn queryNgramsTable
92 -- | Main Ngrams Types
94 -- Typed Ngrams localize the context of the ngrams
95 -- ngrams in source field of document has Sources Type
96 -- ngrams in authors field of document has Authors Type
97 -- ngrams in text (title or abstract) of documents has Terms Type
98 data NgramsType = Authors | Institutes | Sources | NgramsTerms
99 deriving (Eq, Show, Ord, Enum, Bounded)
101 ngramsTypeId :: NgramsType -> Int
102 ngramsTypeId Authors = 1
103 ngramsTypeId Institutes = 2
104 ngramsTypeId Sources = 3
105 ngramsTypeId NgramsTerms = 4
107 fromNgramsTypeId :: Int -> Maybe NgramsType
108 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
110 type NgramsTerms = Text
114 ------------------------------------------------------------------------
115 -- | TODO put it in Gargantext.Text.Ngrams
116 data Ngrams = Ngrams { _ngramsTerms :: Text
118 } deriving (Generic, Show, Eq, Ord)
121 instance DPS.ToRow Ngrams where
122 toRow (Ngrams t s) = [toField t, toField s]
124 text2ngrams :: Text -> Ngrams
125 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
127 -------------------------------------------------------------------------
128 -- | TODO put it in Gargantext.Text.Ngrams
129 -- Named entity are typed ngrams of Terms Ngrams
131 NgramsT { _ngramsType :: NgramsType
133 } deriving (Generic, Show, Eq, Ord)
136 -----------------------------------------------------------------------
140 , _ngramsId :: NgramsId
141 } deriving (Show, Generic, Eq, Ord)
143 makeLenses ''NgramsIndexed
144 ------------------------------------------------------------------------
149 } deriving (Show, Generic, Eq, Ord)
151 instance DPS.FromRow NgramIds where
152 fromRow = NgramIds <$> field <*> field
154 ----------------------
155 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
156 indexNgramsT m ngrId = indexNgramsTWith f ngrId
158 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
160 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
161 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
163 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
164 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
166 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
167 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
169 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
171 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
172 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
174 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
176 ----------------------
177 queryInsertNgrams :: DPS.Query
178 queryInsertNgrams = [sql|
179 WITH input_rows(terms,n) AS (?)
181 INSERT INTO ngrams (terms,n)
182 SELECT * FROM input_rows
183 ON CONFLICT (terms) DO NOTHING -- unique index created here
192 JOIN ngrams c USING (terms); -- columns of unique index
195 defaultList :: DPS.Connection -> CorpusId -> IO ListId
196 defaultList c cId = view node_id <$> maybe (panic errMessage) identity
198 <$> getListsWithParentId c cId
200 errMessage = "Gargantext.API.Ngrams.defaultList: no list found"
203 -- TODO: the way we are getting main Master Corpus and List ID is not clean
204 -- TODO: if ids are not present -> create
205 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
206 getNgramsTableDb :: DPS.Connection
207 -> NodeType -> NgramsType
208 -> NgramsTableParamUser
209 -> IO ([NgramsTableData], MapToParent, MapToChildren)
210 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) = do
213 maybeRoot <- head <$> getRoot userMaster c
214 let path = "Garg.Db.Ngrams.getTableNgrams: "
215 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
216 -- let errMess = panic "Error"
218 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId c masterRootId
220 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId c corpusMasterId
222 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId)
224 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
225 pure (ngramsTableData, mapToParent,mapToChildren)
228 data NgramsTableParam =
229 NgramsTableParam { _nt_listId :: Int
230 , _nt_corpusId :: Int
233 type NgramsTableParamUser = NgramsTableParam
234 type NgramsTableParamMaster = NgramsTableParam
236 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
238 , _ntd_listType :: Maybe ListType
239 , _ntd_weight :: Double
242 getNgramsTableData :: DPS.Connection
243 -> NodeType -> NgramsType
244 -> NgramsTableParamUser -> NgramsTableParamMaster
245 -> IO [NgramsTableData]
246 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
247 trace ("Ngrams table params" <> show params) <$>
248 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
249 DPS.query conn querySelectTableNgrams params
251 nodeTId = nodeTypeId nodeT
252 ngrmTId = ngramsTypeId ngrmT
253 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc)
257 querySelectTableNgrams :: DPS.Query
258 querySelectTableNgrams = [sql|
261 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
262 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
263 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
264 JOIN nodes n ON n.id = corp.node_id
266 WHERE list.node_id = ? -- User listId
267 AND n.parent_id = ? -- User CorpusId or AnnuaireId
268 AND n.typename = ? -- both type of childs (Documents or Contacts)
269 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
272 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
273 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
274 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
275 JOIN nodes n ON n.id = corp.node_id
276 JOIN nodes_nodes nn ON nn.node2_id = n.id
278 WHERE list.node_id = ? -- Master listId
279 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
280 AND n.typename = ? -- Master childs (Documents or Contacts)
281 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
282 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
285 SELECT COALESCE(tu.terms,tm.terms) AS terms
286 , COALESCE(tu.n,tm.n) AS n
287 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
288 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
289 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
290 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
294 type ListIdUser = Int
295 type ListIdMaster = Int
297 type MapToChildren = Map Text (Set Text)
298 type MapToParent = Map Text Text
300 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
301 getNgramsGroup conn lu lm = do
302 groups <- getNgramsGroup' conn lu lm
303 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
304 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
305 pure (mapParent, mapChildren)
307 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
308 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
310 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
311 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
313 querySelectNgramsGroup :: DPS.Query
314 querySelectNgramsGroup = [sql|
316 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
317 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
318 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
320 nnn.node_id = ? -- User listId
323 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
324 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
325 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
327 nnn.node_id = ? -- Master listId
329 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
330 , COALESCE(gu.t2,gm.t2) AS ngram2_id
331 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1