2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
24 module Gargantext.Database.Schema.Ngrams where
27 import Control.Lens (makeLenses, view, _Just, traverse)
28 import Data.ByteString.Internal (ByteString)
29 import Data.Map (Map, fromList, lookup, fromListWith)
30 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
32 import Data.Text (Text, splitOn)
33 import Database.PostgreSQL.Simple as DPS (Connection)
34 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
35 import Database.PostgreSQL.Simple.SqlQQ (sql)
36 import Database.PostgreSQL.Simple.ToField (toField)
37 import Database.PostgreSQL.Simple.ToRow (toRow)
38 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
39 import Debug.Trace (trace)
40 import GHC.Generics (Generic)
41 import Gargantext.Core.Types (CorpusId)
42 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
43 import Gargantext.Database.Config (nodeTypeId,userMaster)
44 import Gargantext.Database.Root (getRoot)
45 import Gargantext.Database.Types.Node (NodeType)
46 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
47 import Gargantext.Database.Utils (mkCmd, Cmd(..))
48 import Gargantext.Prelude
50 import Prelude (Enum, Bounded, minBound, maxBound)
51 import qualified Data.Set as DS
52 import qualified Database.PostgreSQL.Simple as DPS
55 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
56 , ngrams_terms :: terms
61 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
65 type NgramsRead = NgramsPoly (Column PGInt4)
69 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
70 (Column (Nullable PGText))
71 (Column (Nullable PGInt4))
74 type NgramsDb = NgramsPoly Int Text Int
76 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
77 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
79 ngramsTable :: Table NgramsWrite NgramsRead
80 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
81 , ngrams_terms = required "terms"
82 , ngrams_n = required "n"
86 queryNgramsTable :: Query NgramsRead
87 queryNgramsTable = queryTable ngramsTable
89 dbGetNgramsDb :: DPS.Connection -> IO [NgramsDb]
90 dbGetNgramsDb conn = runQuery conn queryNgramsTable
93 -- | Main Ngrams Types
95 -- Typed Ngrams localize the context of the ngrams
96 -- ngrams in source field of document has Sources Type
97 -- ngrams in authors field of document has Authors Type
98 -- ngrams in text (title or abstract) of documents has Terms Type
99 data NgramsType = Authors | Institutes | Sources | NgramsTerms
100 deriving (Eq, Show, Ord, Enum, Bounded)
102 ngramsTypeId :: NgramsType -> Int
103 ngramsTypeId Authors = 1
104 ngramsTypeId Institutes = 2
105 ngramsTypeId Sources = 3
106 ngramsTypeId NgramsTerms = 4
108 fromNgramsTypeId :: Int -> Maybe NgramsType
109 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
111 type NgramsTerms = Text
115 ------------------------------------------------------------------------
116 -- | TODO put it in Gargantext.Text.Ngrams
117 data Ngrams = Ngrams { _ngramsTerms :: Text
119 } deriving (Generic, Show, Eq, Ord)
122 instance DPS.ToRow Ngrams where
123 toRow (Ngrams t s) = [toField t, toField s]
125 text2ngrams :: Text -> Ngrams
126 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
128 -------------------------------------------------------------------------
129 -- | TODO put it in Gargantext.Text.Ngrams
130 -- Named entity are typed ngrams of Terms Ngrams
132 NgramsT { _ngramsType :: NgramsType
134 } deriving (Generic, Show, Eq, Ord)
137 -----------------------------------------------------------------------
141 , _ngramsId :: NgramsId
142 } deriving (Show, Generic, Eq, Ord)
144 makeLenses ''NgramsIndexed
145 ------------------------------------------------------------------------
150 } deriving (Show, Generic, Eq, Ord)
152 instance DPS.FromRow NgramIds where
153 fromRow = NgramIds <$> field <*> field
155 ----------------------
156 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
157 indexNgramsT m ngrId = indexNgramsTWith f ngrId
159 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
161 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
162 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
164 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
165 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
167 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
168 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
170 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
172 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
173 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
175 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
177 ----------------------
178 queryInsertNgrams :: DPS.Query
179 queryInsertNgrams = [sql|
180 WITH input_rows(terms,n) AS (?)
182 INSERT INTO ngrams (terms,n)
183 SELECT * FROM input_rows
184 ON CONFLICT (terms) DO NOTHING -- unique index created here
193 JOIN ngrams c USING (terms); -- columns of unique index
198 -- TODO: the way we are getting main Master Corpus and List ID is not clean
199 -- TODO: if ids are not present -> create
200 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
201 getNgramsTableDb :: DPS.Connection
202 -> NodeType -> NgramsType
203 -> NgramsTableParamUser
204 -> IO ([NgramsTableData], MapToParent, MapToChildren)
205 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) = do
208 maybeRoot <- head <$> getRoot userMaster c
209 let path = "Garg.Db.Ngrams.getTableNgrams: "
210 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
211 -- let errMess = panic "Error"
213 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId c masterRootId
215 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId c corpusMasterId
217 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId)
219 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
220 pure (ngramsTableData, mapToParent,mapToChildren)
223 data NgramsTableParam =
224 NgramsTableParam { _nt_listId :: Int
225 , _nt_corpusId :: Int
228 type NgramsTableParamUser = NgramsTableParam
229 type NgramsTableParamMaster = NgramsTableParam
231 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
233 , _ntd_listType :: Maybe ListType
234 , _ntd_weight :: Double
237 getNgramsTableData :: DPS.Connection
238 -> NodeType -> NgramsType
239 -> NgramsTableParamUser -> NgramsTableParamMaster
240 -> IO [NgramsTableData]
241 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
242 trace ("Ngrams table params" <> show params) <$>
243 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
244 DPS.query conn querySelectTableNgrams params
246 nodeTId = nodeTypeId nodeT
247 ngrmTId = ngramsTypeId ngrmT
248 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc)
252 querySelectTableNgrams :: DPS.Query
253 querySelectTableNgrams = [sql|
256 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
257 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
258 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
259 JOIN nodes n ON n.id = corp.node_id
261 WHERE list.node_id = ? -- User listId
262 AND n.parent_id = ? -- User CorpusId or AnnuaireId
263 AND n.typename = ? -- both type of childs (Documents or Contacts)
264 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
267 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
268 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
269 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
270 JOIN nodes n ON n.id = corp.node_id
271 JOIN nodes_nodes nn ON nn.node2_id = n.id
273 WHERE list.node_id = ? -- Master listId
274 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
275 AND n.typename = ? -- Master childs (Documents or Contacts)
276 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
277 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
280 SELECT COALESCE(tu.terms,tm.terms) AS terms
281 , COALESCE(tu.n,tm.n) AS n
282 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
283 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
284 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
285 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
289 type ListIdUser = Int
290 type ListIdMaster = Int
292 type MapToChildren = Map Text (Set Text)
293 type MapToParent = Map Text Text
295 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
296 getNgramsGroup conn lu lm = do
297 groups <- getNgramsGroup' conn lu lm
298 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
299 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
300 pure (mapParent, mapChildren)
302 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
303 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
305 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
306 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
308 querySelectNgramsGroup :: DPS.Query
309 querySelectNgramsGroup = [sql|
311 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
312 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
313 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
315 nnn.node_id = ? -- User listId
318 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
319 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
320 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
322 nnn.node_id = ? -- Master listId
324 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
325 , COALESCE(gu.t2,gm.t2) AS ngram2_id
326 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1