]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Ngrams.hs
[TYPE] HyperdataDocument : adding institutes as new field.
[gargantext.git] / src / Gargantext / Database / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23
24 module Gargantext.Database.Ngrams where
25
26 -- import Opaleye
27 import Control.Lens (makeLenses)
28 import Data.ByteString.Internal (ByteString)
29 import Data.Map (Map, fromList, lookup)
30 import Data.Text (Text, splitOn)
31 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
32 import Database.PostgreSQL.Simple.SqlQQ (sql)
33 import Database.PostgreSQL.Simple.ToField (toField)
34 import Database.PostgreSQL.Simple.ToRow (toRow)
35 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
36 import GHC.Generics (Generic)
37 import Gargantext.Database.Node (mkCmd, Cmd(..))
38 import Gargantext.Prelude
39 import qualified Database.PostgreSQL.Simple as DPS
40
41
42 --data NgramPoly id terms n = NgramDb { ngram_id :: id
43 -- , ngram_terms :: terms
44 -- , ngram_n :: n
45 -- } deriving (Show)
46 --
47 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
48 -- (Column PGText)
49 -- (Column PGInt4)
50 --
51 --type NgramRead = NgramPoly (Column PGInt4)
52 -- (Column PGText)
53 -- (Column PGInt4)
54 --
55 ----type Ngram = NgramPoly Int Text Int
56 --
57 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
58 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
59 --
60 --ngramTable :: Table NgramWrite NgramRead
61 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
62 -- , ngram_terms = required "terms"
63 -- , ngram_n = required "n"
64 -- }
65 -- )
66 --
67 --queryNgramTable :: Query NgramRead
68 --queryNgramTable = queryTable ngramTable
69 --
70 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
71 --dbGetNgrams conn = runQuery conn queryNgramTable
72
73 -- | Main Ngrams Types
74 -- | Typed Ngrams
75 -- Typed Ngrams localize the context of the ngrams
76 -- ngrams in source field of document has Sources Type
77 -- ngrams in authors field of document has Authors Type
78 -- ngrams in text (title or abstract) of documents has Terms Type
79 data NgramsType = Sources | Authors | Terms
80 deriving (Eq)
81
82 ngramsTypeId :: NgramsType -> Int
83 ngramsTypeId Terms = 1
84 ngramsTypeId Authors = 2
85 ngramsTypeId Sources = 3
86
87 type NgramsTerms = Text
88 type NgramsId = Int
89 type Size = Int
90
91 ------------------------------------------------------------------------
92 -- | TODO put it in Gargantext.Text.Ngrams
93 data Ngrams = Ngrams { _ngramsTerms :: Text
94 , _ngramsSize :: Int
95 } deriving (Generic)
96 instance Eq Ngrams where
97 (==) = (==)
98 instance Ord Ngrams where
99 compare = compare
100 makeLenses ''Ngrams
101 instance DPS.ToRow Ngrams where
102 toRow (Ngrams t s) = [toField t, toField s]
103
104 text2ngrams :: Text -> Ngrams
105 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
106
107 -------------------------------------------------------------------------
108 -- | TODO put it in Gargantext.Text.Ngrams
109 -- Named entity are typed ngrams of Terms Ngrams
110 data NgramsT a =
111 NgramsT { _ngramsType :: NgramsType
112 , _ngramsT :: a
113 } deriving (Generic)
114
115 instance Eq (NgramsT a)
116 where (==) = (==)
117 -- where NgramsT
118 -- t1 == t2
119 -- n1 == n2
120
121 instance Ord (NgramsT a) where compare = compare
122 makeLenses ''NgramsT
123
124
125 -----------------------------------------------------------------------
126 data NgramsIndexed =
127 NgramsIndexed
128 { _ngrams :: Ngrams
129 , _ngramsId :: NgramsId
130 } deriving (Generic)
131
132 instance Eq NgramsIndexed where
133 (==) = (==)
134 instance Ord NgramsIndexed where
135 compare = compare
136 makeLenses ''NgramsIndexed
137
138 ------------------------------------------------------------------------
139 data NgramIds =
140 NgramIds
141 { ngramId :: Int
142 , ngramTerms :: Text
143 } deriving (Show, Generic)
144
145 instance DPS.FromRow NgramIds where
146 fromRow = NgramIds <$> field <*> field
147
148 ----------------------
149 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
150 indexNgramsT m ngrId = indexNgramsTWith f ngrId
151 where
152 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
153
154 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
155 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
156
157 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
158 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
159
160 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
161 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
162 where
163 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
164
165
166 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
167 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
168 where
169 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
170
171 ----------------------
172 queryInsertNgrams :: DPS.Query
173 queryInsertNgrams = [sql|
174 WITH input_rows(terms,n) AS (?)
175 , ins AS (
176 INSERT INTO ngrams (terms,n)
177 SELECT * FROM input_rows
178 ON CONFLICT (terms) DO NOTHING -- unique index created here
179 RETURNING id,terms
180 )
181
182 SELECT id, terms
183 FROM ins
184 UNION ALL
185 SELECT c.id, terms
186 FROM input_rows
187 JOIN ngrams c USING (terms); -- columns of unique index
188 |]