2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
24 module Gargantext.Database.Ngrams where
26 import Prelude (Enum, Bounded, minBound, maxBound)
27 import Control.Lens (makeLenses, view)
28 import Data.ByteString.Internal (ByteString)
29 import Data.Map (Map, fromList, lookup, fromListWith)
31 import Data.Tuple.Extra (both)
32 import qualified Data.Set as DS
33 import Data.Text (Text, splitOn)
34 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
35 import Database.PostgreSQL.Simple.SqlQQ (sql)
36 import Database.PostgreSQL.Simple.ToField (toField)
37 import Database.PostgreSQL.Simple.ToRow (toRow)
38 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
39 import GHC.Generics (Generic)
40 import Gargantext.Core.Types -- (fromListTypeId, ListType, NodePoly(Node))
41 import Gargantext.Database.Config (nodeTypeId,userMaster)
42 import Gargantext.Database.Types.Node (NodeType)
43 import Gargantext.Database.Node (mkCmd, Cmd(..))
44 import Gargantext.Database.Root (getRoot)
45 import Gargantext.Database.Tree (dbTree, toNodeTree)
46 import Gargantext.Core.Types.Main (NodeTree(..))
47 import Gargantext.Prelude
48 import qualified Database.PostgreSQL.Simple as DPS
50 --data NgramPoly id terms n = NgramDb { ngram_id :: id
51 -- , ngram_terms :: terms
55 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
59 --type NgramRead = NgramPoly (Column PGInt4)
63 ----type Ngram = NgramPoly Int Text Int
65 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
66 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
68 --ngramTable :: Table NgramWrite NgramRead
69 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
70 -- , ngram_terms = required "terms"
71 -- , ngram_n = required "n"
75 --queryNgramTable :: Query NgramRead
76 --queryNgramTable = queryTable ngramTable
78 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
79 --dbGetNgrams conn = runQuery conn queryNgramTable
81 -- | Main Ngrams Types
83 -- Typed Ngrams localize the context of the ngrams
84 -- ngrams in source field of document has Sources Type
85 -- ngrams in authors field of document has Authors Type
86 -- ngrams in text (title or abstract) of documents has Terms Type
87 data NgramsType = Authors | Institutes | Sources | NgramsTerms
88 deriving (Eq, Show, Ord, Enum, Bounded)
90 ngramsTypeId :: NgramsType -> Int
91 ngramsTypeId Authors = 1
92 ngramsTypeId Institutes = 2
93 ngramsTypeId Sources = 3
94 ngramsTypeId NgramsTerms = 4
96 fromNgramsTypeId :: Int -> Maybe NgramsType
97 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
99 type NgramsTerms = Text
103 ------------------------------------------------------------------------
104 -- | TODO put it in Gargantext.Text.Ngrams
105 data Ngrams = Ngrams { _ngramsTerms :: Text
107 } deriving (Generic, Show, Eq, Ord)
110 instance DPS.ToRow Ngrams where
111 toRow (Ngrams t s) = [toField t, toField s]
113 text2ngrams :: Text -> Ngrams
114 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
116 -------------------------------------------------------------------------
117 -- | TODO put it in Gargantext.Text.Ngrams
118 -- Named entity are typed ngrams of Terms Ngrams
120 NgramsT { _ngramsType :: NgramsType
122 } deriving (Generic, Show, Eq, Ord)
125 -----------------------------------------------------------------------
129 , _ngramsId :: NgramsId
130 } deriving (Show, Generic, Eq, Ord)
132 makeLenses ''NgramsIndexed
133 ------------------------------------------------------------------------
138 } deriving (Show, Generic, Eq, Ord)
140 instance DPS.FromRow NgramIds where
141 fromRow = NgramIds <$> field <*> field
143 ----------------------
144 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
145 indexNgramsT m ngrId = indexNgramsTWith f ngrId
147 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
149 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
150 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
152 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
153 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
155 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
156 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
158 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
160 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
161 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
163 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
165 ----------------------
166 queryInsertNgrams :: DPS.Query
167 queryInsertNgrams = [sql|
168 WITH input_rows(terms,n) AS (?)
170 INSERT INTO ngrams (terms,n)
171 SELECT * FROM input_rows
172 ON CONFLICT (terms) DO NOTHING -- unique index created here
181 JOIN ngrams c USING (terms); -- columns of unique index
187 -- TODO: the way we are getting main Master Corpus and List ID is not clean
188 -- TODO: if ids are not present -> create
189 -- TODO: Global Env State Monad to keep in memory the ids without retrieving it each time
190 getNgramsTableDb :: DPS.Connection
191 -> NodeType -> NgramsType
192 -> NgramsTableParamUser
193 -> IO ([NgramsTableData], MapToParent, MapToChildren)
194 getNgramsTableDb c nt ngrt ntp@(NgramsTableParam listIdUser _) = do
195 let lieu = "Garg.Db.Ngrams.getTableNgrams: "
196 maybeRoot <- head <$> getRoot userMaster c
197 let masterRootId = maybe (panic $ lieu <> "no userMaster Tree") (view node_id) maybeRoot
198 tree <- map toNodeTree <$> dbTree c masterRootId
199 let maybeCorpus = head $ filter (\n -> _nt_type n == NodeCorpus) tree
200 let maybeList = head $ filter (\n -> _nt_type n == NodeList) tree
201 let maybeIds = fmap (both _nt_id) $ (,) <$> maybeCorpus <*> maybeList
202 let (corpusMasterId, listMasterId) = maybe (panic $ lieu <> "no CorpusId or ListId") identity maybeIds
203 ngramsTableData <- getNgramsTableData c nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId)
204 (mapToParent,mapToChildren) <- getNgramsGroup c listIdUser listMasterId
205 pure (ngramsTableData, mapToParent,mapToChildren)
208 data NgramsTableParam =
209 NgramsTableParam { _nt_listId :: Int
210 , _nt_corpusId :: Int
213 type NgramsTableParamUser = NgramsTableParam
214 type NgramsTableParamMaster = NgramsTableParam
216 data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
218 , _ntd_listType :: Maybe ListType
219 , _ntd_weight :: Double
222 getNgramsTableData :: DPS.Connection
223 -> NodeType -> NgramsType
224 -> NgramsTableParamUser -> NgramsTableParamMaster
225 -> IO [NgramsTableData]
226 getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
227 map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w)
228 <$> DPS.query conn querySelectTableNgrams (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc)
230 nodeTId = nodeTypeId nodeT
231 ngrmTId = ngramsTypeId ngrmT
234 querySelectTableNgrams :: DPS.Query
235 querySelectTableNgrams = [sql|
238 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
239 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
240 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
241 JOIN nodes n ON n.id = corp.node_id
243 WHERE list.node_id = ? -- User listId
244 AND n.parent_id = ? -- User CorpusId or AnnuaireId
245 AND n.typename = ? -- both type of childs (Documents or Contacts)
246 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
249 SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
250 JOIN nodes_ngrams list ON list.ngram_id = ngs.id
251 JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
252 JOIN nodes n ON n.id = corp.node_id
253 JOIN nodes_nodes nn ON nn.node2_id = n.id
255 WHERE list.node_id = ? -- Master listId
256 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
257 AND n.typename = ? -- Master childs (Documents or Contacts)
258 AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
259 AND nn.node1_id = ? -- User CorpusId or AnnuaireId
262 SELECT COALESCE(tu.terms,tm.terms) AS terms
263 , COALESCE(tu.n,tm.n) AS n
264 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
265 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
266 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
267 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
271 type ListIdUser = Int
272 type ListIdMaster = Int
274 type MapToChildren = Map Text (Set Text)
275 type MapToParent = Map Text Text
277 getNgramsGroup :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO (MapToParent, MapToChildren)
278 getNgramsGroup conn lu lm = do
279 groups <- getNgramsGroup' conn lu lm
280 let mapChildren = fromListWith (<>) $ map (\(a,b) -> (a, DS.singleton b)) groups
281 let mapParent = fromListWith (<>) $ map (\(a,b) -> (b, a)) groups
282 pure (mapParent, mapChildren)
284 getNgramsGroup' :: DPS.Connection -> ListIdUser -> ListIdMaster -> IO [(Text,Text)]
285 getNgramsGroup' conn lu lm = DPS.query conn querySelectNgramsGroup (lu,lm)
287 getNgramsGroup'' :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
288 getNgramsGroup'' lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
290 querySelectNgramsGroup :: DPS.Query
291 querySelectNgramsGroup = [sql|
293 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
294 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
295 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
297 nnn.node_id = ? -- User listId
300 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
301 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
302 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
304 nnn.node_id = ? -- Master listId
306 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
307 , COALESCE(gu.t2,gm.t2) AS ngram2_id
308 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1