2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Schema.Ngrams where
29 import Control.Lens (makeLenses, view)
30 import Control.Monad (mzero)
31 import Data.ByteString.Internal (ByteString)
32 import Data.Map (Map, fromList, lookup, fromListWith)
33 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
35 import Data.Text (Text, splitOn)
36 import Database.PostgreSQL.Simple ((:.)(..))
37 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
38 import Database.PostgreSQL.Simple.SqlQQ (sql)
39 import Database.PostgreSQL.Simple.ToField (toField, ToField)
40 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
41 import Database.PostgreSQL.Simple.ToRow (toRow)
42 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
43 --import Debug.Trace (trace)
44 import GHC.Generics (Generic)
45 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
46 import Gargantext.Database.Config (nodeTypeId,userMaster)
47 import Gargantext.Database.Root (getRoot)
48 import Gargantext.Database.Types.Node (NodeType)
49 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
50 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
51 import Gargantext.Prelude
52 import Opaleye hiding (FromField)
53 import Prelude (Enum, Bounded, minBound, maxBound)
54 import qualified Data.Set as DS
55 import qualified Database.PostgreSQL.Simple as PGS
58 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
59 , ngrams_terms :: terms
64 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
68 type NgramsRead = NgramsPoly (Column PGInt4)
72 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
73 (Column (Nullable PGText))
74 (Column (Nullable PGInt4))
77 type NgramsDb = NgramsPoly Int Text Int
79 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
80 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
82 ngramsTable :: Table NgramsWrite NgramsRead
83 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
84 , ngrams_terms = required "terms"
85 , ngrams_n = required "n"
89 queryNgramsTable :: Query NgramsRead
90 queryNgramsTable = queryTable ngramsTable
92 dbGetNgramsDb :: Cmd err [NgramsDb]
93 dbGetNgramsDb = runOpaQuery queryNgramsTable
96 -- | Main Ngrams Types
98 -- Typed Ngrams localize the context of the ngrams
99 -- ngrams in source field of document has Sources Type
100 -- ngrams in authors field of document has Authors Type
101 -- ngrams in text (title or abstract) of documents has Terms Type
102 data NgramsType = Authors | Institutes | Sources | NgramsTerms
103 deriving (Eq, Show, Ord, Enum, Bounded)
105 newtype NgramsTypeId = NgramsTypeId Int
106 deriving (Eq, Show, Ord, Num)
108 instance ToField NgramsTypeId where
109 toField (NgramsTypeId n) = toField n
111 instance FromField NgramsTypeId where
112 fromField fld mdata = do
113 n <- fromField fld mdata
114 if (n :: Int) > 0 then return $ NgramsTypeId n
117 pgNgramsType :: NgramsType -> Column PGInt4
118 pgNgramsType = pgNgramsTypeId . ngramsTypeId
120 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
121 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
123 ngramsTypeId :: NgramsType -> NgramsTypeId
124 ngramsTypeId Authors = 1
125 ngramsTypeId Institutes = 2
126 ngramsTypeId Sources = 3
127 ngramsTypeId NgramsTerms = 4
129 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
130 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
132 type NgramsTerms = Text
136 ------------------------------------------------------------------------
137 -- | TODO put it in Gargantext.Text.Ngrams
138 data Ngrams = Ngrams { _ngramsTerms :: Text
140 } deriving (Generic, Show, Eq, Ord)
143 instance PGS.ToRow Ngrams where
144 toRow (Ngrams t s) = [toField t, toField s]
146 text2ngrams :: Text -> Ngrams
147 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
149 -------------------------------------------------------------------------
150 -- | TODO put it in Gargantext.Text.Ngrams
151 -- Named entity are typed ngrams of Terms Ngrams
153 NgramsT { _ngramsType :: NgramsType
155 } deriving (Generic, Show, Eq, Ord)
158 -----------------------------------------------------------------------
162 , _ngramsId :: NgramsId
163 } deriving (Show, Generic, Eq, Ord)
165 makeLenses ''NgramsIndexed
166 ------------------------------------------------------------------------
171 } deriving (Show, Generic, Eq, Ord)
173 instance PGS.FromRow NgramIds where
174 fromRow = NgramIds <$> field <*> field
176 ----------------------
177 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
178 indexNgramsT m ngrId = indexNgramsTWith f ngrId
180 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
182 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
183 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
185 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
186 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
188 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
189 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
191 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
193 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
194 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
196 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
198 ----------------------
199 queryInsertNgrams :: PGS.Query
200 queryInsertNgrams = [sql|
201 WITH input_rows(terms,n) AS (?)
203 INSERT INTO ngrams (terms,n)
204 SELECT * FROM input_rows
205 ON CONFLICT (terms) DO NOTHING -- unique index created here
214 JOIN ngrams c USING (terms); -- columns of unique index
219 -- TODO: the way we are getting main Master Corpus and List ID is not clean
220 -- TODO: if ids are not present -> create
221 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
222 getNgramsTableDb :: NodeType -> NgramsType
223 -> NgramsTableParamUser
225 -> Cmd err ([NgramsTableData], MapToParent, MapToChildren)
226 getNgramsTableDb nt ngrt ntp@(NgramsTableParam listIdUser _) limit_ offset_ = do
229 maybeRoot <- head <$> getRoot userMaster
230 let path = "Garg.Db.Ngrams.getTableNgrams: "
231 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
232 -- let errMess = panic "Error"
234 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
236 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
238 ngramsTableData <- getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
240 (mapToParent,mapToChildren) <- getNgramsGroup listIdUser listMasterId
241 pure (ngramsTableData, mapToParent,mapToChildren)
244 data NgramsTableParam =
245 NgramsTableParam { _nt_listId :: NodeId
246 , _nt_corpusId :: NodeId
249 type NgramsTableParamUser = NgramsTableParam
250 type NgramsTableParamMaster = NgramsTableParam
252 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
254 , _ntd_listType :: Maybe ListType
255 , _ntd_weight :: Double
258 getNgramsTableData :: NodeType -> NgramsType
259 -> NgramsTableParamUser -> NgramsTableParamMaster
261 -> Cmd err [NgramsTableData]
262 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
263 -- trace ("Ngrams table params" <> show params) <$>
264 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
265 runPGSQuery querySelectTableNgrams params
267 nodeTId = nodeTypeId nodeT
268 ngrmTId = ngramsTypeId ngrmT
269 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc) :.
274 querySelectTableNgrams :: PGS.Query
275 querySelectTableNgrams = [sql|
278 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
279 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
280 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
281 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
282 JOIN nodes n ON n.id = corp.node_id
284 WHERE list.node_id = ? -- User listId
285 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
286 AND n.typename = ? -- both type of childs (Documents or Contacts)
287 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
290 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
291 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
292 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
293 JOIN nodes n ON n.id = corp.node_id
294 JOIN nodes_nodes nn ON nn.node2_id = n.id
296 WHERE list.node_id = ? -- Master listId
297 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
298 AND n.typename = ? -- Master childs (Documents or Contacts)
299 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
300 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
303 SELECT COALESCE(tu.terms,tm.terms) AS terms
304 , COALESCE(tu.n,tm.n) AS n
305 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
306 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
307 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
308 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
315 type ListIdUser = NodeId
316 type ListIdMaster = NodeId
318 type MapToChildren = Map Text (Set Text)
319 type MapToParent = Map Text Text
321 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
322 getNgramsGroup lu lm = do
323 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
324 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
325 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
326 pure (mapParent, mapChildren)
328 querySelectNgramsGroup :: PGS.Query
329 querySelectNgramsGroup = [sql|
331 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
332 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
333 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
335 nnn.node_id = ? -- User listId
338 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
339 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
340 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
342 nnn.node_id = ? -- Master listId
344 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
345 , COALESCE(gu.t2,gm.t2) AS ngram2_id
346 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1