]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Ngrams.hs
[DB][Flow] question about the map of ngramsT.
[gargantext.git] / src / Gargantext / Database / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23
24 module Gargantext.Database.Ngrams where
25
26 -- import Opaleye
27 import Control.Lens (makeLenses)
28 import Data.ByteString.Internal (ByteString)
29 import Data.Map (Map, fromList, lookup)
30 import Data.Text (Text, splitOn)
31 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
32 import Database.PostgreSQL.Simple.SqlQQ (sql)
33 import Database.PostgreSQL.Simple.ToField (toField)
34 import Database.PostgreSQL.Simple.ToRow (toRow)
35 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
36 import GHC.Generics (Generic)
37 import Gargantext.Database.Node (mkCmd, Cmd(..))
38 import Gargantext.Prelude
39 import qualified Database.PostgreSQL.Simple as DPS
40
41
42 --data NgramPoly id terms n = NgramDb { ngram_id :: id
43 -- , ngram_terms :: terms
44 -- , ngram_n :: n
45 -- } deriving (Show)
46 --
47 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
48 -- (Column PGText)
49 -- (Column PGInt4)
50 --
51 --type NgramRead = NgramPoly (Column PGInt4)
52 -- (Column PGText)
53 -- (Column PGInt4)
54 --
55 ----type Ngram = NgramPoly Int Text Int
56 --
57 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
58 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
59 --
60 --ngramTable :: Table NgramWrite NgramRead
61 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
62 -- , ngram_terms = required "terms"
63 -- , ngram_n = required "n"
64 -- }
65 -- )
66 --
67 --queryNgramTable :: Query NgramRead
68 --queryNgramTable = queryTable ngramTable
69 --
70 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
71 --dbGetNgrams conn = runQuery conn queryNgramTable
72
73 -- | Main Ngrams Types
74 -- | Typed Ngrams
75 -- Typed Ngrams localize the context of the ngrams
76 -- ngrams in source field of document has Sources Type
77 -- ngrams in authors field of document has Authors Type
78 -- ngrams in text (title or abstract) of documents has Terms Type
79 data NgramsType = Authors | Institutes | Sources | Terms
80 deriving (Eq, Show)
81
82 ngramsTypeId :: NgramsType -> Int
83 ngramsTypeId Authors = 1
84 ngramsTypeId Institutes = 2
85 ngramsTypeId Sources = 3
86 ngramsTypeId Terms = 4
87
88 type NgramsTerms = Text
89 type NgramsId = Int
90 type Size = Int
91
92 ------------------------------------------------------------------------
93 -- | TODO put it in Gargantext.Text.Ngrams
94 data Ngrams = Ngrams { _ngramsTerms :: Text
95 , _ngramsSize :: Int
96 } deriving (Generic, Show)
97 instance Eq Ngrams where
98 (==) = (==)
99 instance Ord Ngrams where
100 compare = compare
101 makeLenses ''Ngrams
102 instance DPS.ToRow Ngrams where
103 toRow (Ngrams t s) = [toField t, toField s]
104
105 text2ngrams :: Text -> Ngrams
106 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
107
108 -------------------------------------------------------------------------
109 -- | TODO put it in Gargantext.Text.Ngrams
110 -- Named entity are typed ngrams of Terms Ngrams
111 data NgramsT a =
112 NgramsT { _ngramsType :: NgramsType
113 , _ngramsT :: a
114 } deriving (Generic, Show)
115
116 instance Eq (NgramsT a)
117 where (==) = (==)
118 -- where NgramsT
119 -- t1 == t2
120 -- n1 == n2
121
122 instance Ord (NgramsT a) where compare = compare
123 makeLenses ''NgramsT
124
125
126 -----------------------------------------------------------------------
127 data NgramsIndexed =
128 NgramsIndexed
129 { _ngrams :: Ngrams
130 , _ngramsId :: NgramsId
131 } deriving (Show, Generic)
132
133 instance Eq NgramsIndexed where
134 (==) = (==)
135 instance Ord NgramsIndexed where
136 compare = compare
137 makeLenses ''NgramsIndexed
138
139 ------------------------------------------------------------------------
140 data NgramIds =
141 NgramIds
142 { ngramId :: Int
143 , ngramTerms :: Text
144 } deriving (Show, Generic)
145
146 instance DPS.FromRow NgramIds where
147 fromRow = NgramIds <$> field <*> field
148
149 ----------------------
150 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
151 indexNgramsT m ngrId = indexNgramsTWith f ngrId
152 where
153 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
154
155 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
156 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
157
158 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
159 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
160
161 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
162 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
163 where
164 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
165
166
167 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
168 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
169 where
170 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
171
172 ----------------------
173 queryInsertNgrams :: DPS.Query
174 queryInsertNgrams = [sql|
175 WITH input_rows(terms,n) AS (?)
176 , ins AS (
177 INSERT INTO ngrams (terms,n)
178 SELECT * FROM input_rows
179 ON CONFLICT (terms) DO NOTHING -- unique index created here
180 RETURNING id,terms
181 )
182
183 SELECT id, terms
184 FROM ins
185 UNION ALL
186 SELECT c.id, terms
187 FROM input_rows
188 JOIN ngrams c USING (terms); -- columns of unique index
189 |]