]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Ngrams.hs
[Graph] missing files and default graph.
[gargantext.git] / src / Gargantext / Database / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23
24 module Gargantext.Database.Ngrams where
25
26 import Prelude (Enum, Bounded, minBound, maxBound)
27 import Control.Lens (makeLenses, view)
28 import Data.ByteString.Internal (ByteString)
29 import Data.Map (Map, fromList, lookup, fromListWith)
30 import Data.Set (Set)
31 import Data.Tuple.Extra (both)
32 import qualified Data.Set as DS
33 import Data.Text (Text, splitOn)
34 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
35 import Database.PostgreSQL.Simple.SqlQQ (sql)
36 import Database.PostgreSQL.Simple.ToField (toField)
37 import Database.PostgreSQL.Simple.ToRow (toRow)
38 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
39 import GHC.Generics (Generic)
40 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
41 import Gargantext.Database.Config (nodeTypeId,userMaster)
42 import Gargantext.Database.Types.Node (NodeType)
43 import Gargantext.Database.Node (mkCmd, Cmd(..))
44 import Gargantext.Database.Root (getRoot)
45 import Gargantext.Database.Tree (dbTree, toNodeTree)
46 import Gargantext.Core.Types.Main (NodeTree(..))
47 import Gargantext.Prelude
48 import qualified Database.PostgreSQL.Simple as DPS
49
50 --data NgramPoly id terms n = NgramDb { ngram_id :: id
51 -- , ngram_terms :: terms
52 -- , ngram_n :: n
53 -- } deriving (Show)
54 --
55 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
56 -- (Column PGText)
57 -- (Column PGInt4)
58 --
59 --type NgramRead = NgramPoly (Column PGInt4)
60 -- (Column PGText)
61 -- (Column PGInt4)
62 --
63 ----type Ngram = NgramPoly Int Text Int
64 --
65 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
66 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
67 --
68 --ngramTable :: Table NgramWrite NgramRead
69 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
70 -- , ngram_terms = required "terms"
71 -- , ngram_n = required "n"
72 -- }
73 -- )
74 --
75 --queryNgramTable :: Query NgramRead
76 --queryNgramTable = queryTable ngramTable
77 --
78 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
79 --dbGetNgrams conn = runQuery conn queryNgramTable
80
81 -- | Main Ngrams Types
82 -- | Typed Ngrams
83 -- Typed Ngrams localize the context of the ngrams
84 -- ngrams in source field of document has Sources Type
85 -- ngrams in authors field of document has Authors Type
86 -- ngrams in text (title or abstract) of documents has Terms Type
87 data NgramsType = Authors | Institutes | Sources | NgramsTerms
88 deriving (Eq, Show, Ord, Enum, Bounded)
89
90 ngramsTypeId :: NgramsType -> Int
91 ngramsTypeId Authors = 1
92 ngramsTypeId Institutes = 2
93 ngramsTypeId Sources = 3
94 ngramsTypeId NgramsTerms = 4
95
96 fromNgramsTypeId :: Int -> Maybe NgramsType
97 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
98
99 type NgramsTerms = Text
100 type NgramsId = Int
101 type Size = Int
102
103 ------------------------------------------------------------------------
104 -- | TODO put it in Gargantext.Text.Ngrams
105 data Ngrams = Ngrams { _ngramsTerms :: Text
106 , _ngramsSize :: Int
107 } deriving (Generic, Show, Eq, Ord)
108
109 makeLenses ''Ngrams
110 instance DPS.ToRow Ngrams where
111 toRow (Ngrams t s) = [toField t, toField s]
112
113 text2ngrams :: Text -> Ngrams
114 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
115
116 -------------------------------------------------------------------------
117 -- | TODO put it in Gargantext.Text.Ngrams
118 -- Named entity are typed ngrams of Terms Ngrams
119 data NgramsT a =
120 NgramsT { _ngramsType :: NgramsType
121 , _ngramsT :: a
122 } deriving (Generic, Show, Eq, Ord)
123
124 makeLenses ''NgramsT
125 -----------------------------------------------------------------------
126 data NgramsIndexed =
127 NgramsIndexed
128 { _ngrams :: Ngrams
129 , _ngramsId :: NgramsId
130 } deriving (Show, Generic, Eq, Ord)
131
132 makeLenses ''NgramsIndexed
133 ------------------------------------------------------------------------
134 data NgramIds =
135 NgramIds
136 { ngramId :: Int
137 , ngramTerms :: Text
138 } deriving (Show, Generic, Eq, Ord)
139
140 instance DPS.FromRow NgramIds where
141 fromRow = NgramIds <$> field <*> field
142
143 ----------------------
144 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
145 indexNgramsT m ngrId = indexNgramsTWith f ngrId
146 where
147 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
148
149 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
150 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
151
152 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
153 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
154
155 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
156 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
157 where
158 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
159
160 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
161 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
162 where
163 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
164
165 ----------------------
166 queryInsertNgrams :: DPS.Query
167 queryInsertNgrams = [sql|
168 WITH input_rows(terms,n) AS (?)
169 , ins AS (
170 INSERT INTO ngrams (terms,n)
171 SELECT * FROM input_rows
172 ON CONFLICT (terms) DO NOTHING -- unique index created here
173 RETURNING id,terms
174 )
175
176 SELECT id, terms
177 FROM ins
178 UNION ALL
179 SELECT c.id, terms
180 FROM input_rows
181 JOIN ngrams c USING (terms); -- columns of unique index
182 |]
183
184
185
186 -- | Ngrams Table
187 -- TODO: the way we are getting main Master Corpus and List ID is not clean
188 -- TODO: if ids are not present -> create
189 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
190 getNgramsTableDb :: DPS.Connection
191 -> NodeType -> NgramsType
192 -> NgramsTableParamUser
193 -> IO ([NgramsTableData], MapToParent, MapToChildren)
194 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) = do
195 let lieu = "Garg.Db.Ngrams.getTableNgrams: "
196 maybeRoot <- head <$> getRoot userMaster c
197 let masterRootId = maybe (panic $ lieu <> "no userMaster Tree") (view node_id) maybeRoot
198 tree <- map toNodeTree <$> dbTree c masterRootId
199 let maybeCorpus = head $ filter (\n -> _nt_type n == NodeCorpus) tree
200 let maybeList = head $ filter (\n -> _nt_type n == NodeList) tree
201 let maybeIds = fmap (both _nt_id) $ (,) <$> maybeCorpus <*> maybeList
202 let (corpusMasterId, listMasterId) = maybe (panic $ lieu <> "no CorpusId or ListId") identity maybeIds
203 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId)
204 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
205 pure (ngramsTableData, mapToParent,mapToChildren)
206
207
208 data NgramsTableParam =
209 NgramsTableParam { _nt_listId :: Int
210 , _nt_corpusId :: Int
211 }
212
213 type NgramsTableParamUser = NgramsTableParam
214 type NgramsTableParamMaster = NgramsTableParam
215
216 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
217 , _ntd_n :: Int
218 , _ntd_listType :: Maybe ListType
219 , _ntd_weight :: Double
220 } deriving (Show)
221
222 getNgramsTableData :: DPS.Connection
223 -> NodeType -> NgramsType
224 -> NgramsTableParamUser -> NgramsTableParamMaster
225 -> IO [NgramsTableData]
226 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
227 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w)
228 <$> DPS.query conn querySelectTableNgrams (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc)
229 where
230 nodeTId = nodeTypeId nodeT
231 ngrmTId = ngramsTypeId ngrmT
232
233
234 querySelectTableNgrams :: DPS.Query
235 querySelectTableNgrams = [sql|
236
237 WITH tableUser AS (
238 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
239 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
240 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
241 JOIN nodes n ON n.id = corp.node_id
242
243 WHERE list.node_id = ? -- User listId
244 AND n.parent_id = ? -- User CorpusId or AnnuaireId
245 AND n.typename = ? -- both type of childs (Documents or Contacts)
246 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
247 )
248 , tableMaster AS (
249 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
250 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
251 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
252 JOIN nodes n ON n.id = corp.node_id
253 JOIN nodes_nodes nn ON nn.node2_id = n.id
254
255 WHERE list.node_id = ? -- Master listId
256 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
257 AND n.typename = ? -- Master childs (Documents or Contacts)
258 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
259 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
260 )
261
262 SELECT COALESCE(tu.terms,tm.terms) AS terms
263 , COALESCE(tu.n,tm.n) AS n
264 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
265 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
266 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
267 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
268
269 |]
270
271 type ListIdUser = Int
272 type ListIdMaster = Int
273
274 type MapToChildren = Map Text (Set Text)
275 type MapToParent = Map Text Text
276
277 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
278 getNgramsGroup conn lu lm = do
279 groups <- getNgramsGroup' conn lu lm
280 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
281 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
282 pure (mapParent, mapChildren)
283
284 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
285 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
286
287 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
288 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
289
290 querySelectNgramsGroup :: DPS.Query
291 querySelectNgramsGroup = [sql|
292 WITH groupUser AS (
293 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
294 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
295 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
296 WHERE
297 nnn.node_id = ? -- User listId
298 ),
299 groupMaster AS (
300 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
301 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
302 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
303 WHERE
304 nnn.node_id = ? -- Master listId
305 )
306 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
307 , COALESCE(gu.t2,gm.t2) AS ngram2_id
308 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1
309 |]
310