]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Ngrams.hs
Merge branch 'annuaire'
[gargantext.git] / src / Gargantext / Database / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23
24 module Gargantext.Database.Ngrams where
25
26 import Prelude (Enum, Bounded, minBound, maxBound)
27 import Control.Lens (makeLenses, view)
28 import Data.ByteString.Internal (ByteString)
29 import Data.Map (Map, fromList, lookup, fromListWith)
30 import Data.Set (Set)
31 import Data.Tuple.Extra (both)
32 import qualified Data.Set as DS
33 import Data.Text (Text, splitOn)
34 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
35 import Database.PostgreSQL.Simple.SqlQQ (sql)
36 import Database.PostgreSQL.Simple.ToField (toField)
37 import Database.PostgreSQL.Simple.ToRow (toRow)
38 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
39 import GHC.Generics (Generic)
40 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
41 import Gargantext.Database.Config (nodeTypeId,userMaster)
42 import Gargantext.Database.Types.Node (NodeType)
43 import Gargantext.Database.Node (mkCmd, Cmd(..),getRootUsername)
44 import Gargantext.Database.Tree (dbTree, toNodeTree)
45 import Gargantext.Core.Types.Main (NodeTree(..))
46 import Gargantext.Prelude
47 import qualified Database.PostgreSQL.Simple as DPS
48
49 --data NgramPoly id terms n = NgramDb { ngram_id :: id
50 -- , ngram_terms :: terms
51 -- , ngram_n :: n
52 -- } deriving (Show)
53 --
54 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
55 -- (Column PGText)
56 -- (Column PGInt4)
57 --
58 --type NgramRead = NgramPoly (Column PGInt4)
59 -- (Column PGText)
60 -- (Column PGInt4)
61 --
62 ----type Ngram = NgramPoly Int Text Int
63 --
64 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
65 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
66 --
67 --ngramTable :: Table NgramWrite NgramRead
68 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
69 -- , ngram_terms = required "terms"
70 -- , ngram_n = required "n"
71 -- }
72 -- )
73 --
74 --queryNgramTable :: Query NgramRead
75 --queryNgramTable = queryTable ngramTable
76 --
77 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
78 --dbGetNgrams conn = runQuery conn queryNgramTable
79
80 -- | Main Ngrams Types
81 -- | Typed Ngrams
82 -- Typed Ngrams localize the context of the ngrams
83 -- ngrams in source field of document has Sources Type
84 -- ngrams in authors field of document has Authors Type
85 -- ngrams in text (title or abstract) of documents has Terms Type
86 data NgramsType = Authors | Institutes | Sources | NgramsTerms
87 deriving (Eq, Show, Ord, Enum, Bounded)
88
89 ngramsTypeId :: NgramsType -> Int
90 ngramsTypeId Authors = 1
91 ngramsTypeId Institutes = 2
92 ngramsTypeId Sources = 3
93 ngramsTypeId NgramsTerms = 4
94
95 fromNgramsTypeId :: Int -> Maybe NgramsType
96 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
97
98 type NgramsTerms = Text
99 type NgramsId = Int
100 type Size = Int
101
102 ------------------------------------------------------------------------
103 -- | TODO put it in Gargantext.Text.Ngrams
104 data Ngrams = Ngrams { _ngramsTerms :: Text
105 , _ngramsSize :: Int
106 } deriving (Generic, Show, Eq, Ord)
107
108 makeLenses ''Ngrams
109 instance DPS.ToRow Ngrams where
110 toRow (Ngrams t s) = [toField t, toField s]
111
112 text2ngrams :: Text -> Ngrams
113 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
114
115 -------------------------------------------------------------------------
116 -- | TODO put it in Gargantext.Text.Ngrams
117 -- Named entity are typed ngrams of Terms Ngrams
118 data NgramsT a =
119 NgramsT { _ngramsType :: NgramsType
120 , _ngramsT :: a
121 } deriving (Generic, Show, Eq, Ord)
122
123 makeLenses ''NgramsT
124 -----------------------------------------------------------------------
125 data NgramsIndexed =
126 NgramsIndexed
127 { _ngrams :: Ngrams
128 , _ngramsId :: NgramsId
129 } deriving (Show, Generic, Eq, Ord)
130
131 makeLenses ''NgramsIndexed
132 ------------------------------------------------------------------------
133 data NgramIds =
134 NgramIds
135 { ngramId :: Int
136 , ngramTerms :: Text
137 } deriving (Show, Generic, Eq, Ord)
138
139 instance DPS.FromRow NgramIds where
140 fromRow = NgramIds <$> field <*> field
141
142 ----------------------
143 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
144 indexNgramsT m ngrId = indexNgramsTWith f ngrId
145 where
146 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
147
148 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
149 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
150
151 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
152 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
153
154 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
155 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
156 where
157 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
158
159 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
160 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
161 where
162 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
163
164 ----------------------
165 queryInsertNgrams :: DPS.Query
166 queryInsertNgrams = [sql|
167 WITH input_rows(terms,n) AS (?)
168 , ins AS (
169 INSERT INTO ngrams (terms,n)
170 SELECT * FROM input_rows
171 ON CONFLICT (terms) DO NOTHING -- unique index created here
172 RETURNING id,terms
173 )
174
175 SELECT id, terms
176 FROM ins
177 UNION ALL
178 SELECT c.id, terms
179 FROM input_rows
180 JOIN ngrams c USING (terms); -- columns of unique index
181 |]
182
183
184
185 -- | Ngrams Table
186 -- TODO: the way we are getting main Master Corpus and List ID is not clean
187 -- TODO: if ids are not present -> create
188 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
189 getNgramsTableDb :: DPS.Connection
190 -> NodeType -> NgramsType
191 -> NgramsTableParamUser
192 -> IO ([NgramsTableData], MapToParent, MapToChildren)
193 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) = do
194 let lieu = "Garg.Db.Ngrams.getTableNgrams: "
195 maybeRoot <- head <$> getRootUsername userMaster c
196 let masterRootId = maybe (panic $ lieu <> "no userMaster Tree") (view node_id) maybeRoot
197 tree <- map toNodeTree <$> dbTree c masterRootId
198 let maybeCorpus = head $ filter (\n -> _nt_type n == NodeCorpus) tree
199 let maybeList = head $ filter (\n -> _nt_type n == NodeList) tree
200 let maybeIds = fmap (both _nt_id) $ (,) <$> maybeCorpus <*> maybeList
201 let (corpusMasterId, listMasterId) = maybe (panic $ lieu <> "no CorpusId or ListId") identity maybeIds
202 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId)
203 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
204 pure (ngramsTableData, mapToParent,mapToChildren)
205
206
207 data NgramsTableParam =
208 NgramsTableParam { _nt_listId :: Int
209 , _nt_corpusId :: Int
210 }
211
212 type NgramsTableParamUser = NgramsTableParam
213 type NgramsTableParamMaster = NgramsTableParam
214
215 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
216 , _ntd_n :: Int
217 , _ntd_listType :: Maybe ListType
218 , _ntd_weight :: Double
219 } deriving (Show)
220
221 getNgramsTableData :: DPS.Connection
222 -> NodeType -> NgramsType
223 -> NgramsTableParamUser -> NgramsTableParamMaster
224 -> IO [NgramsTableData]
225 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
226 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w)
227 <$> DPS.query conn querySelectTableNgrams (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc)
228 where
229 nodeTId = nodeTypeId nodeT
230 ngrmTId = ngramsTypeId ngrmT
231
232
233 querySelectTableNgrams :: DPS.Query
234 querySelectTableNgrams = [sql|
235
236 WITH tableUser AS (
237 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
238 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
239 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
240 JOIN nodes n ON n.id = corp.node_id
241
242 WHERE list.node_id = ? -- User listId
243 AND n.parent_id = ? -- User CorpusId or AnnuaireId
244 AND n.typename = ? -- both type of childs (Documents or Contacts)
245 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
246 )
247 , tableMaster AS (
248 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
249 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
250 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
251 JOIN nodes n ON n.id = corp.node_id
252 JOIN nodes_nodes nn ON nn.node2_id = n.id
253
254 WHERE list.node_id = ? -- Master listId
255 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
256 AND n.typename = ? -- Master childs (Documents or Contacts)
257 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
258 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
259 )
260
261 SELECT COALESCE(tu.terms,tm.terms) AS terms
262 , COALESCE(tu.n,tm.n) AS n
263 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
264 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
265 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
266 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
267
268 |]
269
270 type ListIdUser = Int
271 type ListIdMaster = Int
272
273 type MapToChildren = Map Text (Set Text)
274 type MapToParent = Map Text Text
275
276 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
277 getNgramsGroup conn lu lm = do
278 groups <- getNgramsGroup' conn lu lm
279 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
280 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
281 pure (mapParent, mapChildren)
282
283 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
284 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
285
286 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
287 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
288
289 querySelectNgramsGroup :: DPS.Query
290 querySelectNgramsGroup = [sql|
291 WITH groupUser AS (
292 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
293 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
294 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
295 WHERE
296 nnn.node_id = ? -- User listId
297 ),
298 groupMaster AS (
299 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
300 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
301 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
302 WHERE
303 nnn.node_id = ? -- Master listId
304 )
305 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
306 , COALESCE(gu.t2,gm.t2) AS ngram2_id
307 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1
308 |]
309