2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Schema.Ngrams where
29 import Control.Lens (makeLenses, view, over)
30 import Control.Monad (mzero)
31 import Data.ByteString.Internal (ByteString)
32 import Data.Map (Map, fromList, lookup, fromListWith)
33 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
35 import Data.Text (Text, splitOn)
36 import Database.PostgreSQL.Simple ((:.)(..))
37 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
38 import Database.PostgreSQL.Simple.SqlQQ (sql)
39 import Database.PostgreSQL.Simple.ToField (toField, ToField)
40 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
41 import Database.PostgreSQL.Simple.ToRow (toRow)
42 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
43 --import Debug.Trace (trace)
44 import GHC.Generics (Generic)
45 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
46 import Gargantext.Database.Config (nodeTypeId,userMaster)
47 import Gargantext.Database.Root (getRoot)
48 import Gargantext.Database.Types.Node (NodeType)
49 import Gargantext.Database.Schema.Node (getListsWithParentId, getCorporaWithParentId)
50 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
51 import Gargantext.Prelude
52 import Opaleye hiding (FromField)
53 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
54 import qualified Data.Set as DS
55 import qualified Database.PostgreSQL.Simple as PGS
58 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
59 , ngrams_terms :: terms
64 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
68 type NgramsRead = NgramsPoly (Column PGInt4)
72 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
73 (Column (Nullable PGText))
74 (Column (Nullable PGInt4))
77 type NgramsDb = NgramsPoly Int Text Int
79 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
80 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
82 ngramsTable :: Table NgramsWrite NgramsRead
83 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
84 , ngrams_terms = required "terms"
85 , ngrams_n = required "n"
89 queryNgramsTable :: Query NgramsRead
90 queryNgramsTable = queryTable ngramsTable
92 dbGetNgramsDb :: Cmd err [NgramsDb]
93 dbGetNgramsDb = runOpaQuery queryNgramsTable
96 -- | Main Ngrams Types
98 -- Typed Ngrams localize the context of the ngrams
99 -- ngrams in source field of document has Sources Type
100 -- ngrams in authors field of document has Authors Type
101 -- ngrams in text (title or abstract) of documents has Terms Type
102 data NgramsType = Authors | Institutes | Sources | NgramsTerms
103 deriving (Eq, Show, Ord, Enum, Bounded)
105 newtype NgramsTypeId = NgramsTypeId Int
106 deriving (Eq, Show, Ord, Num)
108 instance ToField NgramsTypeId where
109 toField (NgramsTypeId n) = toField n
111 instance FromField NgramsTypeId where
112 fromField fld mdata = do
113 n <- fromField fld mdata
114 if (n :: Int) > 0 then return $ NgramsTypeId n
117 pgNgramsType :: NgramsType -> Column PGInt4
118 pgNgramsType = pgNgramsTypeId . ngramsTypeId
120 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
121 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
123 ngramsTypeId :: NgramsType -> NgramsTypeId
124 ngramsTypeId Authors = 1
125 ngramsTypeId Institutes = 2
126 ngramsTypeId Sources = 3
127 ngramsTypeId NgramsTerms = 4
129 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
130 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
132 type NgramsTerms = Text
136 ------------------------------------------------------------------------
137 -- | TODO put it in Gargantext.Text.Ngrams
138 data Ngrams = Ngrams { _ngramsTerms :: Text
140 } deriving (Generic, Show, Eq, Ord)
143 instance PGS.ToRow Ngrams where
144 toRow (Ngrams t s) = [toField t, toField s]
146 text2ngrams :: Text -> Ngrams
147 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
149 -------------------------------------------------------------------------
150 -- | TODO put it in Gargantext.Text.Ngrams
151 -- Named entity are typed ngrams of Terms Ngrams
153 NgramsT { _ngramsType :: NgramsType
155 } deriving (Generic, Show, Eq, Ord)
159 instance Functor NgramsT where
161 -----------------------------------------------------------------------
165 , _ngramsId :: NgramsId
166 } deriving (Show, Generic, Eq, Ord)
168 makeLenses ''NgramsIndexed
169 ------------------------------------------------------------------------
174 } deriving (Show, Generic, Eq, Ord)
176 instance PGS.FromRow NgramIds where
177 fromRow = NgramIds <$> field <*> field
179 ----------------------
180 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
181 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
183 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
184 indexNgramsT = fmap . indexNgramsWith . withMap
186 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
187 indexNgrams = indexNgramsWith . withMap
189 -- NP: not sure we need it anymore
190 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
191 indexNgramsTWith = fmap . indexNgramsWith
193 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
194 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
196 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
197 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
199 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
200 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
202 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
204 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
205 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
207 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
209 ----------------------
210 queryInsertNgrams :: PGS.Query
211 queryInsertNgrams = [sql|
212 WITH input_rows(terms,n) AS (?)
214 INSERT INTO ngrams (terms,n)
215 SELECT * FROM input_rows
216 ON CONFLICT (terms) DO NOTHING -- unique index created here
225 JOIN ngrams c USING (terms); -- columns of unique index
230 -- TODO: the way we are getting main Master Corpus and List ID is not clean
231 -- TODO: if ids are not present -> create
232 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
233 getNgramsTableDb :: NodeType -> NgramsType
234 -> NgramsTableParamUser
236 -> Cmd err ([NgramsTableData], MapToParent, MapToChildren)
237 getNgramsTableDb nt ngrt ntp@(NgramsTableParam listIdUser _) limit_ offset_ = do
240 maybeRoot <- head <$> getRoot userMaster
241 let path = "Garg.Db.Ngrams.getTableNgrams: "
242 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
243 -- let errMess = panic "Error"
245 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId masterRootId
247 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
249 ngramsTableData <- getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
251 (mapToParent,mapToChildren) <- getNgramsGroup listIdUser listMasterId
252 pure (ngramsTableData, mapToParent,mapToChildren)
255 data NgramsTableParam =
256 NgramsTableParam { _nt_listId :: NodeId
257 , _nt_corpusId :: NodeId
260 type NgramsTableParamUser = NgramsTableParam
261 type NgramsTableParamMaster = NgramsTableParam
263 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
265 , _ntd_listType :: Maybe ListType
266 , _ntd_weight :: Double
269 getNgramsTableData :: NodeType -> NgramsType
270 -> NgramsTableParamUser -> NgramsTableParamMaster
272 -> Cmd err [NgramsTableData]
273 getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
274 -- trace ("Ngrams table params" <> show params) <$>
275 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
276 runPGSQuery querySelectTableNgrams params
278 nodeTId = nodeTypeId nodeT
279 ngrmTId = ngramsTypeId ngrmT
280 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc) :.
285 querySelectTableNgrams :: PGS.Query
286 querySelectTableNgrams = [sql|
289 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
290 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
291 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
292 JOIN nodes_nodes nn ON nn.node2_id = corp.node_id
293 JOIN nodes n ON n.id = corp.node_id
295 WHERE list.node_id = ? -- User listId
296 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
297 AND n.typename = ? -- both type of childs (Documents or Contacts)
298 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
301 SELECT ngs.terms, ngs.n, list.list_type, corp.weight FROM ngrams ngs
302 JOIN nodes_ngrams list ON list.ngrams_id = ngs.id
303 JOIN nodes_ngrams corp ON corp.ngrams_id = ngs.id
304 JOIN nodes n ON n.id = corp.node_id
305 JOIN nodes_nodes nn ON nn.node2_id = n.id
307 WHERE list.node_id = ? -- Master listId
308 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
309 AND n.typename = ? -- Master childs (Documents or Contacts)
310 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
311 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
314 SELECT COALESCE(tu.terms,tm.terms) AS terms
315 , COALESCE(tu.n,tm.n) AS n
316 , COALESCE(tu.list_type,tm.list_type) AS ngrams_type
317 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
318 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
319 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.list_type,tm.list_type
326 type ListIdUser = NodeId
327 type ListIdMaster = NodeId
329 type MapToChildren = Map Text (Set Text)
330 type MapToParent = Map Text Text
332 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd err (MapToParent, MapToChildren)
333 getNgramsGroup lu lm = do
334 groups <- runPGSQuery querySelectNgramsGroup (lu,lm)
335 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
336 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
337 pure (mapParent, mapChildren)
339 querySelectNgramsGroup :: PGS.Query
340 querySelectNgramsGroup = [sql|
342 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
343 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
344 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
346 nnn.node_id = ? -- User listId
349 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
350 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
351 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
353 nnn.node_id = ? -- Master listId
355 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
356 , COALESCE(gu.t2,gm.t2) AS ngram2_id
357 FROM groupUser gu LEFT JOIN groupMaster gm ON gu.t1 = gm.t1