2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
24 module Gargantext.Database.Ngrams where
27 import Prelude (Enum, Bounded, minBound, maxBound)
28 import Control.Lens (makeLenses, view)
29 import Data.ByteString.Internal (ByteString)
30 import Data.Map (Map, fromList, lookup, fromListWith)
32 import Data.Tuple.Extra (both)
33 import qualified Data.Set as DS
34 import Data.Text (Text, splitOn)
35 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
36 import Database.PostgreSQL.Simple.SqlQQ (sql)
37 import Database.PostgreSQL.Simple.ToField (toField)
38 import Database.PostgreSQL.Simple.ToRow (toRow)
39 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
40 import GHC.Generics (Generic)
41 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
42 import Gargantext.Database.Config (nodeTypeId,userMaster)
43 import Gargantext.Database.Types.Node (NodeType)
44 import Gargantext.Database.Node (mkCmd, Cmd(..),getRootUsername)
45 import Gargantext.Database.Tree (dbTree, toNodeTree)
46 import Gargantext.Core.Types.Main (NodeTree(..))
47 import Gargantext.Prelude
48 import qualified Database.PostgreSQL.Simple as DPS
50 --data NgramPoly id terms n = NgramDb { ngram_id :: id
51 -- , ngram_terms :: terms
55 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
59 --type NgramRead = NgramPoly (Column PGInt4)
63 ----type Ngram = NgramPoly Int Text Int
65 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
66 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
68 --ngramTable :: Table NgramWrite NgramRead
69 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
70 -- , ngram_terms = required "terms"
71 -- , ngram_n = required "n"
75 --queryNgramTable :: Query NgramRead
76 --queryNgramTable = queryTable ngramTable
78 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
79 --dbGetNgrams conn = runQuery conn queryNgramTable
81 -- | Main Ngrams Types
83 -- Typed Ngrams localize the context of the ngrams
84 -- ngrams in source field of document has Sources Type
85 -- ngrams in authors field of document has Authors Type
86 -- ngrams in text (title or abstract) of documents has Terms Type
87 data NgramsType = Authors | Institutes | Sources | NgramsTerms
88 deriving (Eq, Show, Ord, Enum, Bounded)
90 ngramsTypeId :: NgramsType -> Int
91 ngramsTypeId Authors = 1
92 ngramsTypeId Institutes = 2
93 ngramsTypeId Sources = 3
94 ngramsTypeId NgramsTerms = 4
96 fromNgramsTypeId :: Int -> Maybe NgramsType
97 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
99 type NgramsTerms = Text
103 ------------------------------------------------------------------------
104 -- | TODO put it in Gargantext.Text.Ngrams
105 data Ngrams = Ngrams { _ngramsTerms :: Text
107 } deriving (Generic, Show, Eq, Ord)
110 instance DPS.ToRow Ngrams where
111 toRow (Ngrams t s) = [toField t, toField s]
113 text2ngrams :: Text -> Ngrams
114 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
116 -------------------------------------------------------------------------
117 -- | TODO put it in Gargantext.Text.Ngrams
118 -- Named entity are typed ngrams of Terms Ngrams
120 NgramsT { _ngramsType :: NgramsType
122 } deriving (Generic, Show, Eq, Ord)
125 -----------------------------------------------------------------------
129 , _ngramsId :: NgramsId
130 } deriving (Show, Generic, Eq, Ord)
132 makeLenses ''NgramsIndexed
133 ------------------------------------------------------------------------
138 } deriving (Show, Generic, Eq, Ord)
140 instance DPS.FromRow NgramIds where
141 fromRow = NgramIds <$> field <*> field
143 ----------------------
144 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
145 indexNgramsT m ngrId = indexNgramsTWith f ngrId
147 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
149 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
150 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
152 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
153 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
155 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
156 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
158 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
160 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
161 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
163 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
165 ----------------------
166 queryInsertNgrams :: DPS.Query
167 queryInsertNgrams = [sql|
168 WITH input_rows(terms,n) AS (?)
170 INSERT INTO ngrams (terms,n)
171 SELECT * FROM input_rows
172 ON CONFLICT (terms) DO NOTHING -- unique index created here
181 JOIN ngrams c USING (terms); -- columns of unique index
187 -- TODO: the way we are getting main Master Corpus and List ID is not clean
188 -- TODO: if ids are not present -> create
189 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
190 getNgramsTableDb :: DPS.Connection
191 -> NodeType -> NgramsType
192 -> NgramsTableParamUser
193 -> IO ([NgramsTableData], MapToParent, MapToChildren)
194 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) = do
195 let lieu = "Garg.Db.Ngrams.getTableNgrams: "
196 maybeRoot <- head <$> getRootUsername userMaster c
197 let masterRootId = maybe (panic $ lieu <> "no userMaster Tree") (view node_id) maybeRoot
198 tree <- map toNodeTree <$> dbTree c masterRootId
199 let maybeCorpus = head $ filter (\n -> _nt_type n == NodeCorpus) tree
200 let maybeList = head $ filter (\n -> _nt_type n == NodeList) tree
201 let maybeIds = fmap (both _nt_id) $ (,) <$> maybeCorpus <*> maybeList
202 let (corpusMasterId, listMasterId) = maybe (panic $ lieu <> "no CorpusId or ListId") identity maybeIds
203 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId)
204 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
205 pure (ngramsTableData, mapToParent,mapToChildren)
208 data NgramsTableParam =
209 NgramsTableParam { _nt_listId :: Int
210 , _nt_corpusId :: Int
213 type NgramsTableParamUser = NgramsTableParam
214 type NgramsTableParamMaster = NgramsTableParam
216 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
218 , _ntd_listType :: Maybe ListType
219 , _ntd_weight :: Double
222 getNgramsTableData :: DPS.Connection
223 -> NodeType -> NgramsType
224 -> NgramsTableParamUser -> NgramsTableParamMaster
225 -> IO [NgramsTableData]
226 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
227 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w)
228 <$> DPS.query conn querySelectTableNgrams (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId)
230 nodeTId = nodeTypeId nodeT
231 ngrmTId = ngramsTypeId ngrmT
234 querySelectTableNgrams :: DPS.Query
235 querySelectTableNgrams = [sql|
237 WITH tableUser AS (select ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
238 JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
239 JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
240 JOIN nodes n ON n.id = nn2.node_id
241 WHERE nn1.node_id = ? -- User listId
242 AND n.parent_id = ? -- User CorpusId or AnnuaireId
243 AND n.typename = ? -- both type of childs (Documents or Contacts)
244 AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
245 ), tableMaster AS (select ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
246 JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
247 JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
248 JOIN nodes n ON n.id = nn2.node_id
249 WHERE nn1.node_id = ? -- Master listId
250 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
251 AND n.typename = ? -- both type of childs (Documents or Contacts)
252 AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
255 SELECT COALESCE(tu.terms,tm.terms) AS terms
256 , COALESCE(tu.n,tm.n) AS n
257 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
258 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
259 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
260 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
264 type ListIdUser = Int
265 type ListIdMaster = Int
267 type MapToChildren = Map Text (Set Text)
268 type MapToParent = Map Text Text
270 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
271 getNgramsGroup conn lu lm = do
272 groups <- getNgramsGroup' conn lu lm
273 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
274 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
275 pure (mapParent, mapChildren)
277 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
278 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
280 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
281 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
283 querySelectNgramsGroup :: DPS.Query
284 querySelectNgramsGroup = [sql|
286 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
287 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
288 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
290 nnn.node_id = ? -- User listId
293 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
294 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
295 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
297 nnn.node_id = ? -- Master listId
299 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
300 , COALESCE(gu.t2,gm.t2) AS ngram2_id
301 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1