]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Ngrams.hs
[Merge]
[gargantext.git] / src / Gargantext / Database / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23
24 module Gargantext.Database.Ngrams where
25
26 -- import Opaleye
27 import Prelude (Enum, Bounded, minBound, maxBound)
28 import Control.Lens (makeLenses)
29 import Data.ByteString.Internal (ByteString)
30 import Data.Map (Map, fromList, lookup, fromListWith)
31 import Data.Set (Set)
32 import qualified Data.Set as DS
33 import Data.Text (Text, splitOn)
34 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
35 import Database.PostgreSQL.Simple.SqlQQ (sql)
36 import Database.PostgreSQL.Simple.ToField (toField)
37 import Database.PostgreSQL.Simple.ToRow (toRow)
38 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
39 import GHC.Generics (Generic)
40 import Gargantext.Core.Types (fromListTypeId, ListType)
41 import Gargantext.Database.Config (nodeTypeId)
42 import Gargantext.Database.Types.Node (NodeType)
43 import Gargantext.Database.Node (mkCmd, Cmd(..))
44 import Gargantext.Prelude
45 import qualified Database.PostgreSQL.Simple as DPS
46
47
48 --data NgramPoly id terms n = NgramDb { ngram_id :: id
49 -- , ngram_terms :: terms
50 -- , ngram_n :: n
51 -- } deriving (Show)
52 --
53 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
54 -- (Column PGText)
55 -- (Column PGInt4)
56 --
57 --type NgramRead = NgramPoly (Column PGInt4)
58 -- (Column PGText)
59 -- (Column PGInt4)
60 --
61 ----type Ngram = NgramPoly Int Text Int
62 --
63 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
64 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
65 --
66 --ngramTable :: Table NgramWrite NgramRead
67 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
68 -- , ngram_terms = required "terms"
69 -- , ngram_n = required "n"
70 -- }
71 -- )
72 --
73 --queryNgramTable :: Query NgramRead
74 --queryNgramTable = queryTable ngramTable
75 --
76 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
77 --dbGetNgrams conn = runQuery conn queryNgramTable
78
79 -- | Main Ngrams Types
80 -- | Typed Ngrams
81 -- Typed Ngrams localize the context of the ngrams
82 -- ngrams in source field of document has Sources Type
83 -- ngrams in authors field of document has Authors Type
84 -- ngrams in text (title or abstract) of documents has Terms Type
85 data NgramsType = Authors | Institutes | Sources | Terms
86 deriving (Eq, Show, Ord, Enum, Bounded)
87
88 ngramsTypeId :: NgramsType -> Int
89 ngramsTypeId Authors = 1
90 ngramsTypeId Institutes = 2
91 ngramsTypeId Sources = 3
92 ngramsTypeId Terms = 4
93
94 fromNgramsTypeId :: Int -> Maybe NgramsType
95 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
96
97 type NgramsTerms = Text
98 type NgramsId = Int
99 type Size = Int
100
101 ------------------------------------------------------------------------
102 -- | TODO put it in Gargantext.Text.Ngrams
103 data Ngrams = Ngrams { _ngramsTerms :: Text
104 , _ngramsSize :: Int
105 } deriving (Generic, Show, Eq, Ord)
106
107 makeLenses ''Ngrams
108 instance DPS.ToRow Ngrams where
109 toRow (Ngrams t s) = [toField t, toField s]
110
111 text2ngrams :: Text -> Ngrams
112 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
113
114 -------------------------------------------------------------------------
115 -- | TODO put it in Gargantext.Text.Ngrams
116 -- Named entity are typed ngrams of Terms Ngrams
117 data NgramsT a =
118 NgramsT { _ngramsType :: NgramsType
119 , _ngramsT :: a
120 } deriving (Generic, Show, Eq, Ord)
121
122 makeLenses ''NgramsT
123 -----------------------------------------------------------------------
124 data NgramsIndexed =
125 NgramsIndexed
126 { _ngrams :: Ngrams
127 , _ngramsId :: NgramsId
128 } deriving (Show, Generic, Eq, Ord)
129
130 makeLenses ''NgramsIndexed
131 ------------------------------------------------------------------------
132 data NgramIds =
133 NgramIds
134 { ngramId :: Int
135 , ngramTerms :: Text
136 } deriving (Show, Generic, Eq, Ord)
137
138 instance DPS.FromRow NgramIds where
139 fromRow = NgramIds <$> field <*> field
140
141 ----------------------
142 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
143 indexNgramsT m ngrId = indexNgramsTWith f ngrId
144 where
145 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
146
147 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
148 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
149
150 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
151 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
152
153 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
154 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
155 where
156 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
157
158 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
159 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
160 where
161 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
162
163 ----------------------
164 queryInsertNgrams :: DPS.Query
165 queryInsertNgrams = [sql|
166 WITH input_rows(terms,n) AS (?)
167 , ins AS (
168 INSERT INTO ngrams (terms,n)
169 SELECT * FROM input_rows
170 ON CONFLICT (terms) DO NOTHING -- unique index created here
171 RETURNING id,terms
172 )
173
174 SELECT id, terms
175 FROM ins
176 UNION ALL
177 SELECT c.id, terms
178 FROM input_rows
179 JOIN ngrams c USING (terms); -- columns of unique index
180 |]
181
182
183
184 -- | Ngrams Table
185
186 data NgramsTableParam =
187 NgramsTableParam { _nt_listId :: Int
188 , _nt_corpusId :: Int
189 }
190
191 type NgramsTableParamUser = NgramsTableParam
192 type NgramsTableParamMaster = NgramsTableParam
193
194 data NgramsTableData = NgramsTableData { _ntd_terms :: Text
195 , _ntd_n :: Int
196 , _ntd_listType :: Maybe ListType
197 , _ntd_weight :: Double
198 } deriving (Show)
199
200 getTableNgrams :: NodeType -> NgramsType -> NgramsTableParamUser -> NgramsTableParamMaster -> Cmd [NgramsTableData]
201 getTableNgrams nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
202 mkCmd $ \conn -> map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$> DPS.query conn querySelectTableNgrams (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId)
203 where
204 nodeTId = nodeTypeId nodeT
205 ngrmTId = ngramsTypeId ngrmT
206
207
208
209 querySelectTableNgrams :: DPS.Query
210 querySelectTableNgrams = [sql|
211
212 WITH tableUser AS (select ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
213 JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
214 JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
215 JOIN nodes n ON n.id = nn2.node_id
216 WHERE nn1.node_id = ? -- User listId
217 AND n.parent_id = ? -- User CorpusId or AnnuaireId
218 AND n.typename = ? -- both type of childs (Documents or Contacts)
219 AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
220 ), tableMaster AS (select ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
221 JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
222 JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
223 JOIN nodes n ON n.id = nn2.node_id
224 WHERE nn1.node_id = ? -- Master listId
225 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
226 AND n.typename = ? -- both type of childs (Documents or Contacts)
227 AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
228 )
229
230 SELECT COALESCE(tu.terms,tm.terms) AS terms
231 , COALESCE(tu.n,tm.n) AS n
232 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
233 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
234 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
235 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
236
237 |]
238
239 type ListIdUser = Int
240 type ListIdMaster = Int
241
242 type MapChildren = Map Text (Set Text)
243 type MapParent = Map Text Text
244
245 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (Map Text (Set Text))
246 getNgramsGroup conn lu lm = fromListWith (<>)
247 <$> map (\(a,b) -> (a, DS.singleton b))
248 <$> getNgramsGroup' conn lu lm
249
250
251 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
252 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
253
254 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
255 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
256
257 querySelectNgramsGroup :: DPS.Query
258 querySelectNgramsGroup = [sql|
259 WITH groupUser AS (
260 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
261 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
262 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
263 WHERE
264 nnn.node_id = ? -- User listId
265 ),
266 groupMaster AS (
267 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
268 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
269 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
270 WHERE
271 nnn.node_id = ? -- Master listId
272 )
273 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
274 , COALESCE(gu.t2,gm.t2) AS ngram2_id
275 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1
276 |]