]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Ngrams.hs
[DB] SQL queries for ngrams table.
[gargantext.git] / src / Gargantext / Database / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23
24 module Gargantext.Database.Ngrams where
25
26 -- import Opaleye
27 import Control.Lens (makeLenses)
28 import Data.ByteString.Internal (ByteString)
29 import Data.Map (Map, fromList, lookup)
30 import Data.Text (Text, splitOn)
31 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
32 import Database.PostgreSQL.Simple.SqlQQ (sql)
33 import Database.PostgreSQL.Simple.ToField (toField)
34 import Database.PostgreSQL.Simple.ToRow (toRow)
35 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
36 import GHC.Generics (Generic)
37 import Gargantext.Database.Node (mkCmd, Cmd(..))
38 import Gargantext.Prelude
39 import qualified Database.PostgreSQL.Simple as DPS
40
41
42 --data NgramPoly id terms n = NgramDb { ngram_id :: id
43 -- , ngram_terms :: terms
44 -- , ngram_n :: n
45 -- } deriving (Show)
46 --
47 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
48 -- (Column PGText)
49 -- (Column PGInt4)
50 --
51 --type NgramRead = NgramPoly (Column PGInt4)
52 -- (Column PGText)
53 -- (Column PGInt4)
54 --
55 ----type Ngram = NgramPoly Int Text Int
56 --
57 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
58 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
59 --
60 --ngramTable :: Table NgramWrite NgramRead
61 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
62 -- , ngram_terms = required "terms"
63 -- , ngram_n = required "n"
64 -- }
65 -- )
66 --
67 --queryNgramTable :: Query NgramRead
68 --queryNgramTable = queryTable ngramTable
69 --
70 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
71 --dbGetNgrams conn = runQuery conn queryNgramTable
72
73 -- | Main Ngrams Types
74 -- | Typed Ngrams
75 -- Typed Ngrams localize the context of the ngrams
76 -- ngrams in source field of document has Sources Type
77 -- ngrams in authors field of document has Authors Type
78 -- ngrams in text (title or abstract) of documents has Terms Type
79 data NgramsType = Authors | Institutes | Sources | Terms
80 deriving (Eq, Show, Ord)
81
82 ngramsTypeId :: NgramsType -> Int
83 ngramsTypeId Authors = 1
84 ngramsTypeId Institutes = 2
85 ngramsTypeId Sources = 3
86 ngramsTypeId Terms = 4
87
88 type NgramsTerms = Text
89 type NgramsId = Int
90 type Size = Int
91
92 ------------------------------------------------------------------------
93 -- | TODO put it in Gargantext.Text.Ngrams
94 data Ngrams = Ngrams { _ngramsTerms :: Text
95 , _ngramsSize :: Int
96 } deriving (Generic, Show, Eq, Ord)
97
98 makeLenses ''Ngrams
99 instance DPS.ToRow Ngrams where
100 toRow (Ngrams t s) = [toField t, toField s]
101
102 text2ngrams :: Text -> Ngrams
103 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
104
105 -------------------------------------------------------------------------
106 -- | TODO put it in Gargantext.Text.Ngrams
107 -- Named entity are typed ngrams of Terms Ngrams
108 data NgramsT a =
109 NgramsT { _ngramsType :: NgramsType
110 , _ngramsT :: a
111 } deriving (Generic, Show, Eq, Ord)
112
113 makeLenses ''NgramsT
114 -----------------------------------------------------------------------
115 data NgramsIndexed =
116 NgramsIndexed
117 { _ngrams :: Ngrams
118 , _ngramsId :: NgramsId
119 } deriving (Show, Generic, Eq, Ord)
120
121 makeLenses ''NgramsIndexed
122 ------------------------------------------------------------------------
123 data NgramIds =
124 NgramIds
125 { ngramId :: Int
126 , ngramTerms :: Text
127 } deriving (Show, Generic, Eq, Ord)
128
129 instance DPS.FromRow NgramIds where
130 fromRow = NgramIds <$> field <*> field
131
132 ----------------------
133 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
134 indexNgramsT m ngrId = indexNgramsTWith f ngrId
135 where
136 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
137
138 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
139 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
140
141 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
142 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
143
144 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
145 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
146 where
147 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
148
149 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
150 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
151 where
152 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
153
154 ----------------------
155 queryInsertNgrams :: DPS.Query
156 queryInsertNgrams = [sql|
157 WITH input_rows(terms,n) AS (?)
158 , ins AS (
159 INSERT INTO ngrams (terms,n)
160 SELECT * FROM input_rows
161 ON CONFLICT (terms) DO NOTHING -- unique index created here
162 RETURNING id,terms
163 )
164
165 SELECT id, terms
166 FROM ins
167 UNION ALL
168 SELECT c.id, terms
169 FROM input_rows
170 JOIN ngrams c USING (terms); -- columns of unique index
171 |]
172
173
174
175
176 -- | Ngrams Table
177
178 data NgramsTableParam =
179 NgramsTableParam { _nt_listId :: Int
180 , _nt_corpusId :: Int
181 , _nt_typeNode :: Int
182 , _nt_typeNgrams :: Int
183 }
184
185 type NgramsTableParamUser = NgramsTableParam
186 type NgramsTableParamMaster = NgramsTableParam
187
188 data NgramsTableData = NgramsTableData { _ntd_terms :: Text
189 , _ntd_n :: Int
190 , _ntd_ngramsType :: Int
191 , _ntd_weight :: Double
192 } deriving (Show)
193
194 getTableNgrams :: NgramsTableParamUser -> NgramsTableParamMaster -> Cmd [(Text, Int, Int, Double)]
195 getTableNgrams (NgramsTableParam ul uc utn utg) (NgramsTableParam ml mc mtn mtg) =
196 mkCmd $ \conn -> DPS.query conn querySelectTableNgrams (ul,uc,utn,utg,ml,mc,mtn,mtg)
197
198
199 querySelectTableNgrams :: DPS.Query
200 querySelectTableNgrams = [sql|
201
202 WITH tableUser AS (select ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
203 JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
204 JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
205 JOIN nodes n ON n.id = nn2.node_id
206 WHERE nn1.node_id = ? -- User listId
207 AND n.parent_id = ? -- User CorpusId or AnnuaireId
208 AND n.typename = ? -- both type of childs (Documents or Contacts)
209 AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
210 ), tableMaster AS (select ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
211 JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
212 JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
213 JOIN nodes n ON n.id = nn2.node_id
214 WHERE nn1.node_id = ? -- Master listId
215 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
216 AND n.typename = ? -- both type of childs (Documents or Contacts)
217 AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
218 )
219
220 SELECT COALESCE(tu.terms,tm.terms) AS terms
221 , COALESCE(tu.n,tm.n) AS n
222 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
223 , COALESCE(tu.weight,tm.weight) AS weight
224 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms;
225
226 |]
227
228 type ListIdUser = Int
229 type ListIdMaster = Int
230
231
232 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
233 getNgramsGroup lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
234
235 querySelectNgramsGroup :: DPS.Query
236 querySelectNgramsGroup = [sql|
237 WITH groupUser AS (
238 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
239 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
240 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
241 WHERE
242 nnn.node_id = ? -- User listId
243 ),
244 groupMaster AS (
245 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
246 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
247 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
248 WHERE
249 nnn.node_id = ? -- Master listId
250 )
251 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
252 , COALESCE(gu.t2,gm.t2) AS ngram2_id
253 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1
254 |]