2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
24 module Gargantext.Database.Schema.Ngrams where
27 import Control.Lens (makeLenses, view)
28 import Data.ByteString.Internal (ByteString)
29 import Data.Map (Map, fromList, lookup, fromListWith)
30 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
32 import Data.Text (Text, splitOn)
33 import Database.PostgreSQL.Simple ((:.)(..))
34 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
35 import Database.PostgreSQL.Simple.SqlQQ (sql)
36 import Database.PostgreSQL.Simple.ToField (toField)
37 import Database.PostgreSQL.Simple.ToRow (toRow)
38 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
39 import Debug.Trace (trace)
40 import GHC.Generics (Generic)
41 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
42 import Gargantext.Database.Config (nodeTypeId,userMaster)
43 import Gargantext.Database.Root (getRoot)
44 import Gargantext.Database.Types.Node (NodeType)
45 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
46 import Gargantext.Database.Utils (mkCmd, Cmd(..))
47 import Gargantext.Prelude
49 import Prelude (Enum, Bounded, minBound, maxBound)
50 import qualified Data.Set as DS
51 import qualified Database.PostgreSQL.Simple as DPS
54 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
55 , ngrams_terms :: terms
60 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
64 type NgramsRead = NgramsPoly (Column PGInt4)
68 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
69 (Column (Nullable PGText))
70 (Column (Nullable PGInt4))
73 type NgramsDb = NgramsPoly Int Text Int
75 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
76 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
78 ngramsTable :: Table NgramsWrite NgramsRead
79 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
80 , ngrams_terms = required "terms"
81 , ngrams_n = required "n"
85 queryNgramsTable :: Query NgramsRead
86 queryNgramsTable = queryTable ngramsTable
88 dbGetNgramsDb :: DPS.Connection -> IO [NgramsDb]
89 dbGetNgramsDb conn = runQuery conn queryNgramsTable
92 -- | Main Ngrams Types
94 -- Typed Ngrams localize the context of the ngrams
95 -- ngrams in source field of document has Sources Type
96 -- ngrams in authors field of document has Authors Type
97 -- ngrams in text (title or abstract) of documents has Terms Type
98 data NgramsType = Authors | Institutes | Sources | NgramsTerms
99 deriving (Eq, Show, Ord, Enum, Bounded)
101 ngramsTypeId :: NgramsType -> Int
102 ngramsTypeId Authors = 1
103 ngramsTypeId Institutes = 2
104 ngramsTypeId Sources = 3
105 ngramsTypeId NgramsTerms = 4
107 fromNgramsTypeId :: Int -> Maybe NgramsType
108 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
110 type NgramsTerms = Text
114 ------------------------------------------------------------------------
115 -- | TODO put it in Gargantext.Text.Ngrams
116 data Ngrams = Ngrams { _ngramsTerms :: Text
118 } deriving (Generic, Show, Eq, Ord)
121 instance DPS.ToRow Ngrams where
122 toRow (Ngrams t s) = [toField t, toField s]
124 text2ngrams :: Text -> Ngrams
125 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
127 -------------------------------------------------------------------------
128 -- | TODO put it in Gargantext.Text.Ngrams
129 -- Named entity are typed ngrams of Terms Ngrams
131 NgramsT { _ngramsType :: NgramsType
133 } deriving (Generic, Show, Eq, Ord)
136 -----------------------------------------------------------------------
140 , _ngramsId :: NgramsId
141 } deriving (Show, Generic, Eq, Ord)
143 makeLenses ''NgramsIndexed
144 ------------------------------------------------------------------------
149 } deriving (Show, Generic, Eq, Ord)
151 instance DPS.FromRow NgramIds where
152 fromRow = NgramIds <$> field <*> field
154 ----------------------
155 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
156 indexNgramsT m ngrId = indexNgramsTWith f ngrId
158 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
160 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
161 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
163 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
164 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
166 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
167 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
169 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
171 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
172 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
174 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
176 ----------------------
177 queryInsertNgrams :: DPS.Query
178 queryInsertNgrams = [sql|
179 WITH input_rows(terms,n) AS (?)
181 INSERT INTO ngrams (terms,n)
182 SELECT * FROM input_rows
183 ON CONFLICT (terms) DO NOTHING -- unique index created here
192 JOIN ngrams c USING (terms); -- columns of unique index
197 -- TODO: the way we are getting main Master Corpus and List ID is not clean
198 -- TODO: if ids are not present -> create
199 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
200 getNgramsTableDb :: DPS.Connection
201 -> NodeType -> NgramsType
202 -> NgramsTableParamUser
204 -> IO ([NgramsTableData], MapToParent, MapToChildren)
205 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) limit_ offset_ = do
208 maybeRoot <- head <$> getRoot userMaster c
209 let path = "Garg.Db.Ngrams.getTableNgrams: "
210 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
211 -- let errMess = panic "Error"
213 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId c masterRootId
215 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId c corpusMasterId
217 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
219 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
220 pure (ngramsTableData, mapToParent,mapToChildren)
223 data NgramsTableParam =
224 NgramsTableParam { _nt_listId :: Int
225 , _nt_corpusId :: Int
228 type NgramsTableParamUser = NgramsTableParam
229 type NgramsTableParamMaster = NgramsTableParam
231 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
233 , _ntd_listType :: Maybe ListType
234 , _ntd_weight :: Double
237 getNgramsTableData :: DPS.Connection
238 -> NodeType -> NgramsType
239 -> NgramsTableParamUser -> NgramsTableParamMaster
241 -> IO [NgramsTableData]
242 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
243 trace ("Ngrams table params" <> show params) <$>
244 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
245 DPS.query conn querySelectTableNgrams params
247 nodeTId = nodeTypeId nodeT
248 ngrmTId = ngramsTypeId ngrmT
249 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc) :.
254 querySelectTableNgrams :: DPS.Query
255 querySelectTableNgrams = [sql|
258 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
259 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
260 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
261 JOIN nodes n ON n.id = corp.node_id
263 WHERE list.node_id = ? -- User listId
264 AND n.parent_id = ? -- User CorpusId or AnnuaireId
265 AND n.typename = ? -- both type of childs (Documents or Contacts)
266 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
269 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
270 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
271 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
272 JOIN nodes n ON n.id = corp.node_id
273 JOIN nodes_nodes nn ON nn.node2_id = n.id
275 WHERE list.node_id = ? -- Master listId
276 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
277 AND n.typename = ? -- Master childs (Documents or Contacts)
278 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
279 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
282 SELECT COALESCE(tu.terms,tm.terms) AS terms
283 , COALESCE(tu.n,tm.n) AS n
284 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
285 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
286 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
287 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type
293 type ListIdUser = Int
294 type ListIdMaster = Int
296 type MapToChildren = Map Text (Set Text)
297 type MapToParent = Map Text Text
299 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
300 getNgramsGroup conn lu lm = do
301 groups <- getNgramsGroup' conn lu lm
302 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
303 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
304 pure (mapParent, mapChildren)
306 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
307 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
309 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
310 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
312 querySelectNgramsGroup :: DPS.Query
313 querySelectNgramsGroup = [sql|
315 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
316 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
317 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
319 nnn.node_id = ? -- User listId
322 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
323 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
324 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
326 nnn.node_id = ? -- Master listId
328 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
329 , COALESCE(gu.t2,gm.t2) AS ngram2_id
330 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1