2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Schema.Ngrams where
29 import Control.Lens (makeLenses, view)
30 import Control.Monad (mzero)
31 import Data.ByteString.Internal (ByteString)
32 import Data.Map (Map, fromList, lookup, fromListWith)
33 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
35 import Data.Text (Text, splitOn)
36 import Database.PostgreSQL.Simple ((:.)(..))
37 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
38 import Database.PostgreSQL.Simple.SqlQQ (sql)
39 import Database.PostgreSQL.Simple.ToField (toField, ToField)
40 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
41 import Database.PostgreSQL.Simple.ToRow (toRow)
42 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
43 --import Debug.Trace (trace)
44 import GHC.Generics (Generic)
45 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
46 import Gargantext.Database.Config (nodeTypeId,userMaster)
47 import Gargantext.Database.Root (getRoot)
48 import Gargantext.Database.Types.Node (NodeType)
49 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
50 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
51 import Gargantext.Prelude
52 import Opaleye hiding (FromField)
53 import Prelude (Enum, Bounded, minBound, maxBound)
54 import qualified Data.Set as DS
55 import qualified Database.PostgreSQL.Simple as PGS
58 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
59 , ngrams_terms :: terms
64 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
68 type NgramsRead = NgramsPoly (Column PGInt4)
72 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
73 (Column (Nullable PGText))
74 (Column (Nullable PGInt4))
77 type NgramsDb = NgramsPoly Int Text Int
79 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
80 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
82 ngramsTable :: Table NgramsWrite NgramsRead
83 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
84 , ngrams_terms = required "terms"
85 , ngrams_n = required "n"
89 queryNgramsTable :: Query NgramsRead
90 queryNgramsTable = queryTable ngramsTable
92 dbGetNgramsDb :: Cmd err [NgramsDb]
93 dbGetNgramsDb = runOpaQuery queryNgramsTable
96 -- | Main Ngrams Types
98 -- Typed Ngrams localize the context of the ngrams
99 -- ngrams in source field of document has Sources Type
100 -- ngrams in authors field of document has Authors Type
101 -- ngrams in text (title or abstract) of documents has Terms Type
102 data NgramsType = Authors | Institutes | Sources | NgramsTerms
103 deriving (Eq, Show, Ord, Enum, Bounded)
105 newtype NgramsTypeId = NgramsTypeId Int
106 deriving (Eq, Show, Ord, Num)
108 instance ToField NgramsTypeId where
109 toField (NgramsTypeId n) = toField n
111 instance FromField NgramsTypeId where
112 fromField field mdata = do
113 n <- fromField field mdata
114 if (n :: Int) > 0 then return $ NgramsTypeId n
117 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
118 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
121 ngramsTypeId :: NgramsType -> NgramsTypeId
122 ngramsTypeId Authors = 1
123 ngramsTypeId Institutes = 2
124 ngramsTypeId Sources = 3
125 ngramsTypeId NgramsTerms = 4
127 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
128 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
130 type NgramsTerms = Text
134 ------------------------------------------------------------------------
135 -- | TODO put it in Gargantext.Text.Ngrams
136 data Ngrams = Ngrams { _ngramsTerms :: Text
138 } deriving (Generic, Show, Eq, Ord)
141 instance PGS.ToRow Ngrams where
142 toRow (Ngrams t s) = [toField t, toField s]
144 text2ngrams :: Text -> Ngrams
145 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
147 -------------------------------------------------------------------------
148 -- | TODO put it in Gargantext.Text.Ngrams
149 -- Named entity are typed ngrams of Terms Ngrams
151 NgramsT { _ngramsType :: NgramsType
153 } deriving (Generic, Show, Eq, Ord)
156 -----------------------------------------------------------------------
160 , _ngramsId :: NgramsId
161 } deriving (Show, Generic, Eq, Ord)
163 makeLenses ''NgramsIndexed
164 ------------------------------------------------------------------------
169 } deriving (Show, Generic, Eq, Ord)
171 instance PGS.FromRow NgramIds where
172 fromRow = NgramIds <$> field <*> field
174 ----------------------
175 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
176 indexNgramsT m ngrId = indexNgramsTWith f ngrId
178 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
180 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
181 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
183 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
184 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
186 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
187 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
189 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
191 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
192 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
194 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
196 ----------------------
197 queryInsertNgrams :: PGS.Query
198 queryInsertNgrams = [sql|
199 WITH input_rows(terms,n) AS (?)
201 INSERT INTO ngrams (terms,n)
202 SELECT * FROM input_rows
203 ON CONFLICT (terms) DO NOTHING -- unique index created here
212 JOIN ngrams c USING (terms); -- columns of unique index
217 -- TODO: the way we are getting main Master Corpus and List ID is not clean
218 -- TODO: if ids are not present -> create
219 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
220 getNgramsTableDb :: NodeType -> NgramsType
221 -> NgramsTableParamUser
223 -> Cmd err ([NgramsTableData], MapToParent, MapToChildren)
224 getNgramsTableDb nt ngrt ntp@(NgramsTableParam listIdUser _) limit_ offset_ = do
227 maybeRoot <- head <$> getRoot userMaster
228 let path = "Garg.Db.Ngrams.getTableNgrams: "
229 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
230 -- let errMess = panic "Error"
232 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
234 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
236 ngramsTableData <- getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
238 (mapToParent,mapToChildren) <- getNgramsGroup listIdUser listMasterId
239 pure (ngramsTableData, mapToParent,mapToChildren)
242 data NgramsTableParam =
243 NgramsTableParam { _nt_listId :: NodeId
244 , _nt_corpusId :: NodeId
247 type NgramsTableParamUser = NgramsTableParam
248 type NgramsTableParamMaster = NgramsTableParam
250 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
252 , _ntd_listType :: Maybe ListType
253 , _ntd_weight :: Double
256 getNgramsTableData :: NodeType -> NgramsType
257 -> NgramsTableParamUser -> NgramsTableParamMaster
259 -> Cmd err [NgramsTableData]
260 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
261 -- trace ("Ngrams table params" <> show params) <$>
262 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
263 runPGSQuery querySelectTableNgrams params
265 nodeTId = nodeTypeId nodeT
266 ngrmTId = ngramsTypeId ngrmT
267 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc) :.
272 querySelectTableNgrams :: PGS.Query
273 querySelectTableNgrams = [sql|
276 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
277 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
278 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
279 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
280 JOIN nodes n ON n.id = corp.node_id
282 WHERE list.node_id = ? -- User listId
283 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
284 AND n.typename = ? -- both type of childs (Documents or Contacts)
285 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
288 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
289 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
290 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
291 JOIN nodes n ON n.id = corp.node_id
292 JOIN nodes_nodes nn ON nn.node2_id = n.id
294 WHERE list.node_id = ? -- Master listId
295 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
296 AND n.typename = ? -- Master childs (Documents or Contacts)
297 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
298 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
301 SELECT COALESCE(tu.terms,tm.terms) AS terms
302 , COALESCE(tu.n,tm.n) AS n
303 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
304 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
305 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
306 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
313 type ListIdUser = NodeId
314 type ListIdMaster = NodeId
316 type MapToChildren = Map Text (Set Text)
317 type MapToParent = Map Text Text
319 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
320 getNgramsGroup lu lm = do
321 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
322 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
323 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
324 pure (mapParent, mapChildren)
326 querySelectNgramsGroup :: PGS.Query
327 querySelectNgramsGroup = [sql|
329 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
330 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
331 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
333 nnn.node_id = ? -- User listId
336 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
337 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
338 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
340 nnn.node_id = ? -- Master listId
342 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
343 , COALESCE(gu.t2,gm.t2) AS ngram2_id
344 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1