]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Ngram.hs
Merge branch 'master' into dbflow
[gargantext.git] / src / Gargantext / Database / Ngram.hs
1 {-|
2 Module : Gargantext.Database.Ngram
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
23
24 module Gargantext.Database.Ngram where
25
26 -- import Opaleye
27 import Control.Lens (makeLenses)
28 import Control.Lens.TH (makeLensesWith, abbreviatedFields)
29 import Data.ByteString.Internal (ByteString)
30 import Data.List (find)
31 import Data.Map (Map, fromList, lookup)
32 import Data.Maybe (Maybe)
33 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
34 import Data.Text (Text)
35 import Database.PostgreSQL.Simple.FromField ( FromField, fromField)
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField)
39 import Database.PostgreSQL.Simple.ToRow (toRow)
40 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
41 import GHC.Generics (Generic)
42 import Gargantext.Database.Node (runCmd, mkCmd, Cmd(..))
43 import Gargantext.Prelude
44 import qualified Database.PostgreSQL.Simple as DPS
45
46
47 --data NgramPoly id terms n = NgramDb { ngram_id :: id
48 -- , ngram_terms :: terms
49 -- , ngram_n :: n
50 -- } deriving (Show)
51 --
52 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
53 -- (Column PGText)
54 -- (Column PGInt4)
55 --
56 --type NgramRead = NgramPoly (Column PGInt4)
57 -- (Column PGText)
58 -- (Column PGInt4)
59 --
60 ----type Ngram = NgramPoly Int Text Int
61 --
62 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
63 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
64 --
65 --ngramTable :: Table NgramWrite NgramRead
66 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
67 -- , ngram_terms = required "terms"
68 -- , ngram_n = required "n"
69 -- }
70 -- )
71 --
72 --queryNgramTable :: Query NgramRead
73 --queryNgramTable = queryTable ngramTable
74 --
75 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
76 --dbGetNgrams conn = runQuery conn queryNgramTable
77
78 -- | Main Ngrams Types
79 -- | Typed Ngrams
80 -- Typed Ngrams localize the context of the ngrams
81 -- ngrams in source field of document has Sources Type
82 -- ngrams in authors field of document has Authors Type
83 -- ngrams in text (title or abstract) of documents has Terms Type
84 data NgramsType = Sources | Authors | Terms
85
86 ngramsTypeId :: NgramsType -> Int
87 ngramsTypeId Terms = 1
88 ngramsTypeId Authors = 2
89 ngramsTypeId Sources = 3
90
91 type NgramsTerms = Text
92 type NgramsId = Int
93 type Size = Int
94
95 ------------------------------------------------------------------------
96 -- | TODO put it in Gargantext.Text.Ngrams
97 data Ngrams = Ngrams { _ngramsTerms :: Text
98 , _ngramsSize :: Int
99 } deriving (Generic)
100 instance Eq Ngrams where
101 (==) = (==)
102 instance Ord Ngrams where
103 compare = compare
104 makeLenses ''Ngrams
105 instance DPS.ToRow Ngrams where
106 toRow (Ngrams t s) = [toField t, toField s]
107
108 -------------------------------------------------------------------------
109 -- | TODO put it in Gargantext.Text.Ngrams
110 -- Named entity are typed ngrams of Terms Ngrams
111 data NgramsT a = NgramsT { _ngramsType :: NgramsType
112 , _ngramsT :: a
113 } deriving (Generic)
114 instance Eq (NgramsT a) where (==) = (==)
115 instance Ord (NgramsT a) where compare = compare
116 makeLenses ''NgramsT
117 -----------------------------------------------------------------------
118 data NgramsIndexed = NgramsIndexed { _ngrams :: Ngrams
119 , _ngramsId :: NgramsId
120 } deriving (Generic)
121 instance Eq NgramsIndexed where
122 (==) = (==)
123 instance Ord NgramsIndexed where
124 compare = compare
125 makeLenses ''NgramsIndexed
126 ------------------------------------------------------------------------
127 data NgramIds = NgramIds { ngramId :: Int
128 , ngramTerms :: Text
129 } deriving (Show, Generic)
130
131 instance DPS.FromRow NgramIds where
132 fromRow = NgramIds <$> field <*> field
133
134 ----------------------
135 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
136 indexNgramsT m n = indexNgramsTWith f n
137 where
138 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
139
140 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
141 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
142 ----------------------
143 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
144 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
145
146 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
147 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
148 where
149 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
150
151
152 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
153 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
154 where
155 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
156
157 ----------------------
158 queryInsertNgrams :: DPS.Query
159 queryInsertNgrams = [sql|
160 WITH input_rows(terms,n) AS (?)
161 , ins AS (
162 INSERT INTO ngrams (terms,n)
163 SELECT * FROM input_rows
164 ON CONFLICT (terms) DO NOTHING -- unique index created here
165 RETURNING id,terms
166 )
167
168 SELECT id, terms
169 FROM ins
170 UNION ALL
171 SELECT c.id, terms
172 FROM input_rows
173 JOIN ngrams c USING (terms); -- columns of unique index
174 |]