2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
24 module Gargantext.Database.Ngrams where
27 import Prelude (Enum, Bounded, minBound, maxBound)
28 import Control.Lens (makeLenses)
29 import Data.ByteString.Internal (ByteString)
30 import Data.Map (Map, fromList, lookup, fromListWith)
32 import qualified Data.Set as DS
33 import Data.Text (Text, splitOn)
34 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
35 import Database.PostgreSQL.Simple.SqlQQ (sql)
36 import Database.PostgreSQL.Simple.ToField (toField)
37 import Database.PostgreSQL.Simple.ToRow (toRow)
38 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
39 import GHC.Generics (Generic)
40 import Gargantext.Core.Types (fromListTypeId, ListType)
41 import Gargantext.Database.Config (nodeTypeId)
42 import Gargantext.Database.Types.Node (NodeType)
43 import Gargantext.Database.Node (mkCmd, Cmd(..))
44 import Gargantext.Prelude
45 import qualified Database.PostgreSQL.Simple as DPS
48 --data NgramPoly id terms n = NgramDb { ngram_id :: id
49 -- , ngram_terms :: terms
53 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
57 --type NgramRead = NgramPoly (Column PGInt4)
61 ----type Ngram = NgramPoly Int Text Int
63 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
64 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
66 --ngramTable :: Table NgramWrite NgramRead
67 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
68 -- , ngram_terms = required "terms"
69 -- , ngram_n = required "n"
73 --queryNgramTable :: Query NgramRead
74 --queryNgramTable = queryTable ngramTable
76 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
77 --dbGetNgrams conn = runQuery conn queryNgramTable
79 -- | Main Ngrams Types
81 -- Typed Ngrams localize the context of the ngrams
82 -- ngrams in source field of document has Sources Type
83 -- ngrams in authors field of document has Authors Type
84 -- ngrams in text (title or abstract) of documents has Terms Type
85 data NgramsType = Authors | Institutes | Sources | Terms
86 deriving (Eq, Show, Ord, Enum, Bounded)
88 ngramsTypeId :: NgramsType -> Int
89 ngramsTypeId Authors = 1
90 ngramsTypeId Institutes = 2
91 ngramsTypeId Sources = 3
92 ngramsTypeId Terms = 4
94 fromNgramsTypeId :: Int -> Maybe NgramsType
95 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
97 type NgramsTerms = Text
101 ------------------------------------------------------------------------
102 -- | TODO put it in Gargantext.Text.Ngrams
103 data Ngrams = Ngrams { _ngramsTerms :: Text
105 } deriving (Generic, Show, Eq, Ord)
108 instance DPS.ToRow Ngrams where
109 toRow (Ngrams t s) = [toField t, toField s]
111 text2ngrams :: Text -> Ngrams
112 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
114 -------------------------------------------------------------------------
115 -- | TODO put it in Gargantext.Text.Ngrams
116 -- Named entity are typed ngrams of Terms Ngrams
118 NgramsT { _ngramsType :: NgramsType
120 } deriving (Generic, Show, Eq, Ord)
123 -----------------------------------------------------------------------
127 , _ngramsId :: NgramsId
128 } deriving (Show, Generic, Eq, Ord)
130 makeLenses ''NgramsIndexed
131 ------------------------------------------------------------------------
136 } deriving (Show, Generic, Eq, Ord)
138 instance DPS.FromRow NgramIds where
139 fromRow = NgramIds <$> field <*> field
141 ----------------------
142 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
143 indexNgramsT m ngrId = indexNgramsTWith f ngrId
145 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
147 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
148 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
150 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
151 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
153 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
154 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
156 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
158 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
159 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
161 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
163 ----------------------
164 queryInsertNgrams :: DPS.Query
165 queryInsertNgrams = [sql|
166 WITH input_rows(terms,n) AS (?)
168 INSERT INTO ngrams (terms,n)
169 SELECT * FROM input_rows
170 ON CONFLICT (terms) DO NOTHING -- unique index created here
179 JOIN ngrams c USING (terms); -- columns of unique index
186 data NgramsTableParam =
187 NgramsTableParam { _nt_listId :: Int
188 , _nt_corpusId :: Int
191 type NgramsTableParamUser = NgramsTableParam
192 type NgramsTableParamMaster = NgramsTableParam
194 data NgramsTableData = NgramsTableData { _ntd_terms :: Text
196 , _ntd_listType :: Maybe ListType
197 , _ntd_weight :: Double
200 getTableNgrams :: NodeType -> NgramsType -> NgramsTableParamUser -> NgramsTableParamMaster -> Cmd [NgramsTableData]
201 getTableNgrams nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
202 mkCmd $ \conn -> map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$> DPS.query conn querySelectTableNgrams (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId)
204 nodeTId = nodeTypeId nodeT
205 ngrmTId = ngramsTypeId ngrmT
209 querySelectTableNgrams :: DPS.Query
210 querySelectTableNgrams = [sql|
212 WITH tableUser AS (select ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
213 JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
214 JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
215 JOIN nodes n ON n.id = nn2.node_id
216 WHERE nn1.node_id = ? -- User listId
217 AND n.parent_id = ? -- User CorpusId or AnnuaireId
218 AND n.typename = ? -- both type of childs (Documents or Contacts)
219 AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
220 ), tableMaster AS (select ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
221 JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
222 JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
223 JOIN nodes n ON n.id = nn2.node_id
224 WHERE nn1.node_id = ? -- Master listId
225 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
226 AND n.typename = ? -- both type of childs (Documents or Contacts)
227 AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
230 SELECT COALESCE(tu.terms,tm.terms) AS terms
231 , COALESCE(tu.n,tm.n) AS n
232 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
233 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
234 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
235 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
239 type ListIdUser = Int
240 type ListIdMaster = Int
242 type MapChildren = Map Text (Set Text)
243 type MapParent = Map Text Text
245 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (Map Text (Set Text))
246 getNgramsGroup conn lu lm = fromListWith (<>)
247 <$> map (\(a,b) -> (a, DS.singleton b))
248 <$> getNgramsGroup' conn lu lm
251 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
252 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
254 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
255 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
257 querySelectNgramsGroup :: DPS.Query
258 querySelectNgramsGroup = [sql|
260 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
261 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
262 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
264 nnn.node_id = ? -- User listId
267 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
268 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
269 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
271 nnn.node_id = ? -- Master listId
273 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
274 , COALESCE(gu.t2,gm.t2) AS ngram2_id
275 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1