2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
24 module Gargantext.Database.Ngrams where
26 import Prelude (Enum, Bounded, minBound, maxBound)
27 import Control.Lens (makeLenses, view)
28 import Data.ByteString.Internal (ByteString)
29 import Data.Map (Map, fromList, lookup, fromListWith)
31 import Data.Tuple.Extra (both)
32 import qualified Data.Set as DS
33 import Data.Text (Text, splitOn)
34 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
35 import Database.PostgreSQL.Simple.SqlQQ (sql)
36 import Database.PostgreSQL.Simple.ToField (toField)
37 import Database.PostgreSQL.Simple.ToRow (toRow)
38 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
39 import GHC.Generics (Generic)
40 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
41 import Gargantext.Database.Config (nodeTypeId,userMaster)
42 import Gargantext.Database.Types.Node (NodeType)
43 import Gargantext.Database.Node (mkCmd, Cmd(..),getRootUsername)
44 import Gargantext.Database.Tree (dbTree, toNodeTree)
45 import Gargantext.Core.Types.Main (NodeTree(..))
46 import Gargantext.Prelude
47 import qualified Database.PostgreSQL.Simple as DPS
49 --data NgramPoly id terms n = NgramDb { ngram_id :: id
50 -- , ngram_terms :: terms
54 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
58 --type NgramRead = NgramPoly (Column PGInt4)
62 ----type Ngram = NgramPoly Int Text Int
64 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
65 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
67 --ngramTable :: Table NgramWrite NgramRead
68 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
69 -- , ngram_terms = required "terms"
70 -- , ngram_n = required "n"
74 --queryNgramTable :: Query NgramRead
75 --queryNgramTable = queryTable ngramTable
77 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
78 --dbGetNgrams conn = runQuery conn queryNgramTable
80 -- | Main Ngrams Types
82 -- Typed Ngrams localize the context of the ngrams
83 -- ngrams in source field of document has Sources Type
84 -- ngrams in authors field of document has Authors Type
85 -- ngrams in text (title or abstract) of documents has Terms Type
86 data NgramsType = Authors | Institutes | Sources | NgramsTerms
87 deriving (Eq, Show, Ord, Enum, Bounded)
89 ngramsTypeId :: NgramsType -> Int
90 ngramsTypeId Authors = 1
91 ngramsTypeId Institutes = 2
92 ngramsTypeId Sources = 3
93 ngramsTypeId NgramsTerms = 4
95 fromNgramsTypeId :: Int -> Maybe NgramsType
96 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
98 type NgramsTerms = Text
102 ------------------------------------------------------------------------
103 -- | TODO put it in Gargantext.Text.Ngrams
104 data Ngrams = Ngrams { _ngramsTerms :: Text
106 } deriving (Generic, Show, Eq, Ord)
109 instance DPS.ToRow Ngrams where
110 toRow (Ngrams t s) = [toField t, toField s]
112 text2ngrams :: Text -> Ngrams
113 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
115 -------------------------------------------------------------------------
116 -- | TODO put it in Gargantext.Text.Ngrams
117 -- Named entity are typed ngrams of Terms Ngrams
119 NgramsT { _ngramsType :: NgramsType
121 } deriving (Generic, Show, Eq, Ord)
124 -----------------------------------------------------------------------
128 , _ngramsId :: NgramsId
129 } deriving (Show, Generic, Eq, Ord)
131 makeLenses ''NgramsIndexed
132 ------------------------------------------------------------------------
137 } deriving (Show, Generic, Eq, Ord)
139 instance DPS.FromRow NgramIds where
140 fromRow = NgramIds <$> field <*> field
142 ----------------------
143 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
144 indexNgramsT m ngrId = indexNgramsTWith f ngrId
146 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
148 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
149 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
151 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
152 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
154 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
155 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
157 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
159 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
160 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
162 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
164 ----------------------
165 queryInsertNgrams :: DPS.Query
166 queryInsertNgrams = [sql|
167 WITH input_rows(terms,n) AS (?)
169 INSERT INTO ngrams (terms,n)
170 SELECT * FROM input_rows
171 ON CONFLICT (terms) DO NOTHING -- unique index created here
180 JOIN ngrams c USING (terms); -- columns of unique index
186 -- TODO: the way we are getting main Master Corpus and List ID is not clean
187 -- TODO: if ids are not present -> create
188 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
189 getNgramsTableDb :: DPS.Connection
190 -> NodeType -> NgramsType
191 -> NgramsTableParamUser
192 -> IO ([NgramsTableData], MapToParent, MapToChildren)
193 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) = do
194 let lieu = "Garg.Db.Ngrams.getTableNgrams: "
195 maybeRoot <- head <$> getRootUsername userMaster c
196 let masterRootId = maybe (panic $ lieu <> "no userMaster Tree") (view node_id) maybeRoot
197 tree <- map toNodeTree <$> dbTree c masterRootId
198 let maybeCorpus = head $ filter (\n -> _nt_type n == NodeCorpus) tree
199 let maybeList = head $ filter (\n -> _nt_type n == NodeList) tree
200 let maybeIds = fmap (both _nt_id) $ (,) <$> maybeCorpus <*> maybeList
201 let (corpusMasterId, listMasterId) = maybe (panic $ lieu <> "no CorpusId or ListId") identity maybeIds
202 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId)
203 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
204 pure (ngramsTableData, mapToParent,mapToChildren)
207 data NgramsTableParam =
208 NgramsTableParam { _nt_listId :: Int
209 , _nt_corpusId :: Int
212 type NgramsTableParamUser = NgramsTableParam
213 type NgramsTableParamMaster = NgramsTableParam
215 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
217 , _ntd_listType :: Maybe ListType
218 , _ntd_weight :: Double
221 getNgramsTableData :: DPS.Connection
222 -> NodeType -> NgramsType
223 -> NgramsTableParamUser -> NgramsTableParamMaster
224 -> IO [NgramsTableData]
225 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
226 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w)
227 <$> DPS.query conn querySelectTableNgrams (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc)
229 nodeTId = nodeTypeId nodeT
230 ngrmTId = ngramsTypeId ngrmT
233 querySelectTableNgrams :: DPS.Query
234 querySelectTableNgrams = [sql|
237 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
238 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
239 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
240 JOIN nodes n ON n.id = corp.node_id
242 WHERE list.node_id = ? -- User listId
243 AND n.parent_id = ? -- User CorpusId or AnnuaireId
244 AND n.typename = ? -- both type of childs (Documents or Contacts)
245 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
248 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
249 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
250 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
251 JOIN nodes n ON n.id = corp.node_id
252 JOIN nodes_nodes nn ON nn.node2_id = n.id
254 WHERE list.node_id = ? -- Master listId
255 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
256 AND n.typename = ? -- Master childs (Documents or Contacts)
257 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
258 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
261 SELECT COALESCE(tu.terms,tm.terms) AS terms
262 , COALESCE(tu.n,tm.n) AS n
263 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
264 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
265 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
266 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
270 type ListIdUser = Int
271 type ListIdMaster = Int
273 type MapToChildren = Map Text (Set Text)
274 type MapToParent = Map Text Text
276 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
277 getNgramsGroup conn lu lm = do
278 groups <- getNgramsGroup' conn lu lm
279 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
280 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
281 pure (mapParent, mapChildren)
283 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
284 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
286 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
287 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
289 querySelectNgramsGroup :: DPS.Query
290 querySelectNgramsGroup = [sql|
292 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
293 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
294 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
296 nnn.node_id = ? -- User listId
299 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
300 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
301 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
303 nnn.node_id = ? -- Master listId
305 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
306 , COALESCE(gu.t2,gm.t2) AS ngram2_id
307 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1