]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Ngrams.hs
[ANNUAIRE] Contact type and mkAnnuaire function.
[gargantext.git] / src / Gargantext / Database / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23
24 module Gargantext.Database.Ngrams where
25
26 -- import Opaleye
27 import Prelude (Enum, Bounded, minBound, maxBound)
28 import Control.Lens (makeLenses, view)
29 import Data.ByteString.Internal (ByteString)
30 import Data.Map (Map, fromList, lookup, fromListWith)
31 import Data.Set (Set)
32 import Data.Tuple.Extra (both)
33 import qualified Data.Set as DS
34 import Data.Text (Text, splitOn)
35 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
36 import Database.PostgreSQL.Simple.SqlQQ (sql)
37 import Database.PostgreSQL.Simple.ToField (toField)
38 import Database.PostgreSQL.Simple.ToRow (toRow)
39 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
40 import GHC.Generics (Generic)
41 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
42 import Gargantext.Database.Config (nodeTypeId,userMaster)
43 import Gargantext.Database.Types.Node (NodeType)
44 import Gargantext.Database.Node (mkCmd, Cmd(..),getRootUsername)
45 import Gargantext.Database.Tree (dbTree, toNodeTree)
46 import Gargantext.Core.Types.Main (NodeTree(..))
47 import Gargantext.Prelude
48 import qualified Database.PostgreSQL.Simple as DPS
49
50 --data NgramPoly id terms n = NgramDb { ngram_id :: id
51 -- , ngram_terms :: terms
52 -- , ngram_n :: n
53 -- } deriving (Show)
54 --
55 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
56 -- (Column PGText)
57 -- (Column PGInt4)
58 --
59 --type NgramRead = NgramPoly (Column PGInt4)
60 -- (Column PGText)
61 -- (Column PGInt4)
62 --
63 ----type Ngram = NgramPoly Int Text Int
64 --
65 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
66 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
67 --
68 --ngramTable :: Table NgramWrite NgramRead
69 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
70 -- , ngram_terms = required "terms"
71 -- , ngram_n = required "n"
72 -- }
73 -- )
74 --
75 --queryNgramTable :: Query NgramRead
76 --queryNgramTable = queryTable ngramTable
77 --
78 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
79 --dbGetNgrams conn = runQuery conn queryNgramTable
80
81 -- | Main Ngrams Types
82 -- | Typed Ngrams
83 -- Typed Ngrams localize the context of the ngrams
84 -- ngrams in source field of document has Sources Type
85 -- ngrams in authors field of document has Authors Type
86 -- ngrams in text (title or abstract) of documents has Terms Type
87 data NgramsType = Authors | Institutes | Sources | NgramsTerms
88 deriving (Eq, Show, Ord, Enum, Bounded)
89
90 ngramsTypeId :: NgramsType -> Int
91 ngramsTypeId Authors = 1
92 ngramsTypeId Institutes = 2
93 ngramsTypeId Sources = 3
94 ngramsTypeId NgramsTerms = 4
95
96 fromNgramsTypeId :: Int -> Maybe NgramsType
97 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
98
99 type NgramsTerms = Text
100 type NgramsId = Int
101 type Size = Int
102
103 ------------------------------------------------------------------------
104 -- | TODO put it in Gargantext.Text.Ngrams
105 data Ngrams = Ngrams { _ngramsTerms :: Text
106 , _ngramsSize :: Int
107 } deriving (Generic, Show, Eq, Ord)
108
109 makeLenses ''Ngrams
110 instance DPS.ToRow Ngrams where
111 toRow (Ngrams t s) = [toField t, toField s]
112
113 text2ngrams :: Text -> Ngrams
114 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
115
116 -------------------------------------------------------------------------
117 -- | TODO put it in Gargantext.Text.Ngrams
118 -- Named entity are typed ngrams of Terms Ngrams
119 data NgramsT a =
120 NgramsT { _ngramsType :: NgramsType
121 , _ngramsT :: a
122 } deriving (Generic, Show, Eq, Ord)
123
124 makeLenses ''NgramsT
125 -----------------------------------------------------------------------
126 data NgramsIndexed =
127 NgramsIndexed
128 { _ngrams :: Ngrams
129 , _ngramsId :: NgramsId
130 } deriving (Show, Generic, Eq, Ord)
131
132 makeLenses ''NgramsIndexed
133 ------------------------------------------------------------------------
134 data NgramIds =
135 NgramIds
136 { ngramId :: Int
137 , ngramTerms :: Text
138 } deriving (Show, Generic, Eq, Ord)
139
140 instance DPS.FromRow NgramIds where
141 fromRow = NgramIds <$> field <*> field
142
143 ----------------------
144 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
145 indexNgramsT m ngrId = indexNgramsTWith f ngrId
146 where
147 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
148
149 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
150 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
151
152 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
153 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
154
155 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
156 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
157 where
158 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
159
160 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
161 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
162 where
163 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
164
165 ----------------------
166 queryInsertNgrams :: DPS.Query
167 queryInsertNgrams = [sql|
168 WITH input_rows(terms,n) AS (?)
169 , ins AS (
170 INSERT INTO ngrams (terms,n)
171 SELECT * FROM input_rows
172 ON CONFLICT (terms) DO NOTHING -- unique index created here
173 RETURNING id,terms
174 )
175
176 SELECT id, terms
177 FROM ins
178 UNION ALL
179 SELECT c.id, terms
180 FROM input_rows
181 JOIN ngrams c USING (terms); -- columns of unique index
182 |]
183
184
185
186 -- | Ngrams Table
187 -- TODO: the way we are getting main Master Corpus and List ID is not clean
188 -- TODO: if ids are not present -> create
189 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
190 getNgramsTableDb :: DPS.Connection
191 -> NodeType -> NgramsType
192 -> NgramsTableParamUser
193 -> IO ([NgramsTableData], MapToParent, MapToChildren)
194 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) = do
195 let lieu = "Garg.Db.Ngrams.getTableNgrams: "
196 maybeRoot <- head <$> getRootUsername userMaster c
197 let masterRootId = maybe (panic $ lieu <> "no userMaster Tree") (view node_id) maybeRoot
198 tree <- map toNodeTree <$> dbTree c masterRootId
199 let maybeCorpus = head $ filter (\n -> _nt_type n == NodeCorpus) tree
200 let maybeList = head $ filter (\n -> _nt_type n == NodeList) tree
201 let maybeIds = fmap (both _nt_id) $ (,) <$> maybeCorpus <*> maybeList
202 let (corpusMasterId, listMasterId) = maybe (panic $ lieu <> "no CorpusId or ListId") identity maybeIds
203 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId)
204 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
205 pure (ngramsTableData, mapToParent,mapToChildren)
206
207
208 data NgramsTableParam =
209 NgramsTableParam { _nt_listId :: Int
210 , _nt_corpusId :: Int
211 }
212
213 type NgramsTableParamUser = NgramsTableParam
214 type NgramsTableParamMaster = NgramsTableParam
215
216 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
217 , _ntd_n :: Int
218 , _ntd_listType :: Maybe ListType
219 , _ntd_weight :: Double
220 } deriving (Show)
221
222 getNgramsTableData :: DPS.Connection
223 -> NodeType -> NgramsType
224 -> NgramsTableParamUser -> NgramsTableParamMaster
225 -> IO [NgramsTableData]
226 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
227 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w)
228 <$> DPS.query conn querySelectTableNgrams (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId)
229 where
230 nodeTId = nodeTypeId nodeT
231 ngrmTId = ngramsTypeId ngrmT
232
233
234 querySelectTableNgrams :: DPS.Query
235 querySelectTableNgrams = [sql|
236
237 WITH tableUser AS (
238 SELECT ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
239 JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
240 JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
241 JOIN nodes n ON n.id = nn2.node_id
242 WHERE nn1.node_id = ? -- User listId
243 AND n.parent_id = ? -- User CorpusId or AnnuaireId
244 AND n.typename = ? -- both type of childs (Documents or Contacts)
245 AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
246 ), tableMaster AS (
247 SELECT ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
248 JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
249 JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
250 JOIN nodes n ON n.id = nn2.node_id
251 WHERE nn1.node_id = ? -- Master listId
252 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
253 AND n.typename = ? -- both type of childs (Documents or Contacts)
254 AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
255 )
256
257 SELECT COALESCE(tu.terms,tm.terms) AS terms
258 , COALESCE(tu.n,tm.n) AS n
259 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
260 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
261 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
262 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
263
264 |]
265
266 type ListIdUser = Int
267 type ListIdMaster = Int
268
269 type MapToChildren = Map Text (Set Text)
270 type MapToParent = Map Text Text
271
272 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
273 getNgramsGroup conn lu lm = do
274 groups <- getNgramsGroup' conn lu lm
275 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
276 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
277 pure (mapParent, mapChildren)
278
279 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
280 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
281
282 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
283 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
284
285 querySelectNgramsGroup :: DPS.Query
286 querySelectNgramsGroup = [sql|
287 WITH groupUser AS (
288 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
289 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
290 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
291 WHERE
292 nnn.node_id = ? -- User listId
293 ),
294 groupMaster AS (
295 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
296 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
297 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
298 WHERE
299 nnn.node_id = ? -- Master listId
300 )
301 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
302 , COALESCE(gu.t2,gm.t2) AS ngram2_id
303 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1
304 |]
305