2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
24 module Gargantext.Database.Ngrams where
26 import Control.Lens (makeLenses, view)
27 import Data.ByteString.Internal (ByteString)
28 import Data.Map (Map, fromList, lookup, fromListWith)
30 import Data.Text (Text, splitOn)
31 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
32 import Database.PostgreSQL.Simple.SqlQQ (sql)
33 import Database.PostgreSQL.Simple.ToField (toField)
34 import Database.PostgreSQL.Simple.ToRow (toRow)
35 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
36 import Debug.Trace (trace)
37 import GHC.Generics (Generic)
38 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
39 import Gargantext.Database.Config (nodeTypeId,userMaster)
40 import Gargantext.Database.Node (mkCmd, Cmd(..),getListsWithParentId, getCorporaWithParentId)
41 import Gargantext.Database.Root (getRoot)
42 import Gargantext.Core.Types (CorpusId)
43 import Gargantext.Database.Types.Node (NodeType)
44 import Gargantext.Prelude
45 import Prelude (Enum, Bounded, minBound, maxBound)
46 import qualified Data.Set as DS
47 import qualified Database.PostgreSQL.Simple as DPS
49 --data NgramPoly id terms n = NgramDb { ngram_id :: id
50 -- , ngram_terms :: terms
54 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
58 --type NgramRead = NgramPoly (Column PGInt4)
62 ----type Ngram = NgramPoly Int Text Int
64 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
65 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
67 --ngramTable :: Table NgramWrite NgramRead
68 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
69 -- , ngram_terms = required "terms"
70 -- , ngram_n = required "n"
74 --queryNgramTable :: Query NgramRead
75 --queryNgramTable = queryTable ngramTable
77 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
78 --dbGetNgrams conn = runQuery conn queryNgramTable
80 -- | Main Ngrams Types
82 -- Typed Ngrams localize the context of the ngrams
83 -- ngrams in source field of document has Sources Type
84 -- ngrams in authors field of document has Authors Type
85 -- ngrams in text (title or abstract) of documents has Terms Type
86 data NgramsType = Authors | Institutes | Sources | NgramsTerms
87 deriving (Eq, Show, Ord, Enum, Bounded)
89 ngramsTypeId :: NgramsType -> Int
90 ngramsTypeId Authors = 1
91 ngramsTypeId Institutes = 2
92 ngramsTypeId Sources = 3
93 ngramsTypeId NgramsTerms = 4
95 fromNgramsTypeId :: Int -> Maybe NgramsType
96 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
98 type NgramsTerms = Text
102 ------------------------------------------------------------------------
103 -- | TODO put it in Gargantext.Text.Ngrams
104 data Ngrams = Ngrams { _ngramsTerms :: Text
106 } deriving (Generic, Show, Eq, Ord)
109 instance DPS.ToRow Ngrams where
110 toRow (Ngrams t s) = [toField t, toField s]
112 text2ngrams :: Text -> Ngrams
113 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
115 -------------------------------------------------------------------------
116 -- | TODO put it in Gargantext.Text.Ngrams
117 -- Named entity are typed ngrams of Terms Ngrams
119 NgramsT { _ngramsType :: NgramsType
121 } deriving (Generic, Show, Eq, Ord)
124 -----------------------------------------------------------------------
128 , _ngramsId :: NgramsId
129 } deriving (Show, Generic, Eq, Ord)
131 makeLenses ''NgramsIndexed
132 ------------------------------------------------------------------------
137 } deriving (Show, Generic, Eq, Ord)
139 instance DPS.FromRow NgramIds where
140 fromRow = NgramIds <$> field <*> field
142 ----------------------
143 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
144 indexNgramsT m ngrId = indexNgramsTWith f ngrId
146 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
148 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
149 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
151 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
152 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
154 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
155 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
157 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
159 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
160 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
162 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
164 ----------------------
165 queryInsertNgrams :: DPS.Query
166 queryInsertNgrams = [sql|
167 WITH input_rows(terms,n) AS (?)
169 INSERT INTO ngrams (terms,n)
170 SELECT * FROM input_rows
171 ON CONFLICT (terms) DO NOTHING -- unique index created here
180 JOIN ngrams c USING (terms); -- columns of unique index
183 defaultList :: DPS.Connection -> CorpusId -> IO ListId
184 defaultList c cId = view node_id <$> maybe (panic errMessage) identity
186 <$> getListsWithParentId c cId
188 errMessage = "Gargantext.API.Ngrams.defaultList: no list found"
191 -- TODO: the way we are getting main Master Corpus and List ID is not clean
192 -- TODO: if ids are not present -> create
193 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
194 getNgramsTableDb :: DPS.Connection
195 -> NodeType -> NgramsType
196 -> NgramsTableParamUser
197 -> IO ([NgramsTableData], MapToParent, MapToChildren)
198 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) = do
201 maybeRoot <- head <$> getRoot userMaster c
202 let path = "Garg.Db.Ngrams.getTableNgrams: "
203 let masterRootId = maybe (panic $ path <> "no userMaster Tree") (view node_id) maybeRoot
204 -- let errMess = panic "Error"
206 corpusMasterId <- maybe (panic "error master corpus") (view node_id) <$> head <$> getCorporaWithParentId c masterRootId
208 listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId c corpusMasterId
210 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId)
212 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
213 pure (ngramsTableData, mapToParent,mapToChildren)
216 data NgramsTableParam =
217 NgramsTableParam { _nt_listId :: Int
218 , _nt_corpusId :: Int
221 type NgramsTableParamUser = NgramsTableParam
222 type NgramsTableParamMaster = NgramsTableParam
224 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
226 , _ntd_listType :: Maybe ListType
227 , _ntd_weight :: Double
230 getNgramsTableData :: DPS.Connection
231 -> NodeType -> NgramsType
232 -> NgramsTableParamUser -> NgramsTableParamMaster
233 -> IO [NgramsTableData]
234 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
235 trace ("Ngrams table params" <> show params) <$>
236 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
237 DPS.query conn querySelectTableNgrams params
239 nodeTId = nodeTypeId nodeT
240 ngrmTId = ngramsTypeId ngrmT
241 params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc)
245 querySelectTableNgrams :: DPS.Query
246 querySelectTableNgrams = [sql|
249 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
250 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
251 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
252 JOIN nodes n ON n.id = corp.node_id
254 WHERE list.node_id = ? -- User listId
255 AND n.parent_id = ? -- User CorpusId or AnnuaireId
256 AND n.typename = ? -- both type of childs (Documents or Contacts)
257 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
260 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
261 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
262 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
263 JOIN nodes n ON n.id = corp.node_id
264 JOIN nodes_nodes nn ON nn.node2_id = n.id
266 WHERE list.node_id = ? -- Master listId
267 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
268 AND n.typename = ? -- Master childs (Documents or Contacts)
269 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
270 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
273 SELECT COALESCE(tu.terms,tm.terms) AS terms
274 , COALESCE(tu.n,tm.n) AS n
275 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
276 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
277 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
278 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
282 type ListIdUser = Int
283 type ListIdMaster = Int
285 type MapToChildren = Map Text (Set Text)
286 type MapToParent = Map Text Text
288 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
289 getNgramsGroup conn lu lm = do
290 groups <- getNgramsGroup' conn lu lm
291 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
292 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
293 pure (mapParent, mapChildren)
295 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
296 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
298 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
299 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
301 querySelectNgramsGroup :: DPS.Query
302 querySelectNgramsGroup = [sql|
304 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
305 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
306 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
308 nnn.node_id = ? -- User listId
311 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
312 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
313 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
315 nnn.node_id = ? -- Master listId
317 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
318 , COALESCE(gu.t2,gm.t2) AS ngram2_id
319 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1