]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/TextSearch.hs
Merge branch 'dev-optim-ngrams-table' into dev-merge
[gargantext.git] / src / Gargantext / Database / TextSearch.hs
1 {-|
2 Module : Gargantext.Database.TextSearch
3 Description : Postgres text search experimentation
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE Arrows #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15
16 module Gargantext.Database.TextSearch where
17
18 import Data.Aeson
19 import Data.Map.Strict hiding (map, drop, take)
20 import Data.Maybe
21 import Control.Lens ((^.))
22 import Data.List (intersperse, take, drop)
23 import Data.String (IsString(..))
24 import Data.Text (Text, words, unpack, intercalate)
25 import Data.Time (UTCTime)
26 import Database.PostgreSQL.Simple (Query)
27 import Database.PostgreSQL.Simple.ToField
28 import Gargantext.Database.Config (nodeTypeId)
29 import Gargantext.Database.Types.Node (NodeType(..))
30 import Gargantext.Prelude
31 --import Gargantext.Database.Node.Contact
32 import Gargantext.Database.Facet
33 import Gargantext.Database.Schema.Node
34 import Gargantext.Database.Schema.Ngrams
35 import Gargantext.Database.Schema.NodeNode hiding (joinInCorpus)
36 import Gargantext.Database.Schema.NodeNodeNgrams
37 import Gargantext.Database.Queries.Join (leftJoin6)
38 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery)
39 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
40 import Gargantext.Core.Types
41 import Control.Arrow (returnA)
42 import qualified Opaleye as O hiding (Order)
43 import Opaleye hiding (Query, Order)
44
45
46 ------------------------------------------------------------------------
47 searchInDatabase :: ParentId
48 -> Text
49 -> Cmd err [(NodeId, HyperdataDocument)]
50 searchInDatabase p t = runOpaQuery (queryInDatabase p t)
51 where
52 -- | Global search query where ParentId is Master Node Corpus Id
53 queryInDatabase :: ParentId -> Text -> O.Query (Column PGInt4, Column PGJsonb)
54 queryInDatabase _ q = proc () -> do
55 row <- queryNodeSearchTable -< ()
56 restrict -< (_ns_search row) @@ (pgTSQuery (unpack q))
57 restrict -< (_ns_typename row) .== (pgInt4 $ nodeTypeId NodeDocument)
58 returnA -< (_ns_id row, _ns_hyperdata row)
59
60 ------------------------------------------------------------------------
61 -- | todo add limit and offset and order
62 searchInCorpus :: CorpusId
63 -> IsTrash
64 -> [Text]
65 -> Maybe Offset
66 -> Maybe Limit
67 -> Maybe OrderBy
68 -> Cmd err [FacetDoc]
69 searchInCorpus cId t q o l order = runOpaQuery
70 $ filterWith o l order
71 $ queryInCorpus cId t
72 $ intercalate " | "
73 $ map stemIt q
74
75 queryInCorpus :: CorpusId
76 -> IsTrash
77 -> Text
78 -> O.Query FacetDocRead
79 queryInCorpus cId t q = proc () -> do
80 (n, nn) <- joinInCorpus -< ()
81 restrict -< (nn^.nn_node1_id) .== (toNullable $ pgNodeId cId)
82 restrict -< if t
83 then (nn^.nn_category) .== (toNullable $ pgInt4 0)
84 else (nn^.nn_category) .>= (toNullable $ pgInt4 1)
85 restrict -< (n ^. ns_search) @@ (pgTSQuery (unpack q))
86 restrict -< (n ^. ns_typename ) .== (pgInt4 $ nodeTypeId NodeDocument)
87 returnA -< FacetDoc (n^.ns_id )
88 (n^.ns_date )
89 (n^.ns_name )
90 (n^.ns_hyperdata)
91 (nn^.nn_category)
92 (nn^.nn_score )
93
94 joinInCorpus :: O.Query (NodeSearchRead, NodeNodeReadNull)
95 joinInCorpus = leftJoin queryNodeSearchTable queryNodeNodeTable cond
96 where
97 cond :: (NodeSearchRead, NodeNodeRead) -> Column PGBool
98 cond (n, nn) = nn^.nn_node2_id .== _ns_id n
99
100 ------------------------------------------------------------------------
101 type AuthorName = Text
102
103 -- | TODO Optim: Offset and Limit in the Query
104 -- TODO-SECURITY check
105 searchInCorpusWithContacts
106 :: CorpusId
107 -> ListId
108 -> [Text]
109 -> Maybe Offset
110 -> Maybe Limit
111 -> Maybe OrderBy
112 -> Cmd err [FacetPaired Int UTCTime HyperdataDocument Int [Pair Int Text]]
113 searchInCorpusWithContacts cId lId q o l order =
114 take (maybe 10 identity l)
115 <$> drop (maybe 0 identity o)
116 <$> map (\((i,u,h,s), ps) -> FacetPaired i u h s ps)
117 <$> toList <$> fromListWith (<>)
118 <$> map (\(FacetPaired i u h s (p1,p2)) -> ( (i,u,h,s)
119 , catMaybes [Pair <$> p1 <*> p2]
120 )
121 )
122 <$> searchInCorpusWithContacts' cId lId q o l order
123
124 -- TODO-SECURITY check
125 searchInCorpusWithContacts'
126 :: CorpusId
127 -> ListId
128 -> [Text]
129 -> Maybe Offset
130 -> Maybe Limit
131 -> Maybe OrderBy
132 -> Cmd err [(FacetPaired Int UTCTime HyperdataDocument Int (Maybe Int, Maybe Text))]
133 searchInCorpusWithContacts' cId lId q o l order =
134 runOpaQuery $ queryInCorpusWithContacts cId lId o l order
135 $ intercalate " | "
136 $ map stemIt q
137
138
139 queryInCorpusWithContacts
140 :: CorpusId
141 -> ListId
142 -> Maybe Offset
143 -> Maybe Limit
144 -> Maybe OrderBy
145 -> Text
146 -> O.Query FacetPairedRead
147 queryInCorpusWithContacts cId _lId _ _ _ q = proc () -> do
148 (n, (nn, (_nng, (ngrams', (_, contacts))))) <- joinInCorpusWithContacts -< ()
149 restrict -< (n^.ns_search) @@ (pgTSQuery $ unpack q )
150 restrict -< (n^.ns_typename) .== (pgInt4 $ nodeTypeId NodeDocument)
151 -- restrict -< (nng^.nnng_node1_id) .== (toNullable $ pgNodeId lId)
152 restrict -< (nn^.nn_node1_id) .== (toNullable $ pgNodeId cId)
153 -- -- restrict -< (nng_listType nng) .== (toNullable $ pgNgramsType Authors)
154 -- restrict -< (contacts^.node_typename) .== (toNullable $ pgInt4 $ nodeTypeId NodeContact)
155 -- -- let contact_id = ifThenElse (isNull $ _node_id contacts) (toNullable $ pgInt4 0) (_node_id contacts)
156 returnA -< FacetPaired (n^.ns_id)
157 (n^.ns_date)
158 (n^.ns_hyperdata)
159 (pgInt4 0)
160 (contacts^.node_id, ngrams'^.ngrams_terms)
161
162 joinInCorpusWithContacts :: O.Query ( NodeSearchRead
163 , ( NodeNodeReadNull
164 , ( NodeNodeNgramsReadNull
165 , ( NgramsReadNull
166 , ( NodeNodeNgramsReadNull
167 , NodeReadNull
168 )
169 )
170 )
171 )
172 )
173 joinInCorpusWithContacts =
174 leftJoin6
175 queryNodeTable
176 queryNodeNodeNgramsTable
177 queryNgramsTable
178 queryNodeNodeNgramsTable
179 queryNodeNodeTable
180 queryNodeSearchTable
181 cond12
182 cond23
183 cond34
184 cond45
185 cond56
186 where
187 cond12 :: (NodeNodeNgramsRead, NodeRead) -> Column PGBool
188 cond12 (nnng, n2) = n2^.node_id .== nnng^.nnng_node1_id
189
190 cond23 :: (NgramsRead, (NodeNodeNgramsRead, NodeReadNull)) -> Column PGBool
191 cond23 (ng2, (nnng2, _)) = nnng2^.nnng_ngrams_id .== ng2^.ngrams_id
192
193 cond34 :: ( NodeNodeNgramsRead
194 , ( NgramsRead
195 , ( NodeNodeNgramsReadNull
196 , NodeReadNull
197 )
198 )
199 ) -> Column PGBool
200 cond34 (nng, (ng, (_,_))) = ng^.ngrams_id .== nng^.nnng_ngrams_id
201
202 cond45 :: ( NodeNodeRead
203 , ( NodeNodeNgramsRead
204 , ( NgramsReadNull
205 , ( NodeNodeNgramsReadNull
206 , NodeReadNull
207 )
208 )
209 )
210 ) -> Column PGBool
211 cond45 (nn, (nng, (_,(_,_)))) = nng^.nnng_node1_id .== nn^.nn_node2_id
212
213 cond56 :: ( NodeSearchRead
214 , ( NodeNodeRead
215 , ( NodeNodeNgramsReadNull
216 , ( NgramsReadNull
217 , ( NodeNodeNgramsReadNull
218 , NodeReadNull
219 )
220 )
221 )
222 )
223 ) -> Column PGBool
224 cond56 (n, (nn, (_,(_,(_,_))))) = _ns_id n .== nn^.nn_node2_id
225
226
227 newtype TSQuery = UnsafeTSQuery [Text]
228
229 -- | TODO [""] -> panic "error"
230 toTSQuery :: [Text] -> TSQuery
231 toTSQuery txt = UnsafeTSQuery $ map stemIt txt
232
233
234 instance IsString TSQuery
235 where
236 fromString = UnsafeTSQuery . words . cs
237
238
239 instance ToField TSQuery
240 where
241 toField (UnsafeTSQuery xs)
242 = Many $ intersperse (Plain " && ")
243 $ map (\q -> Many [ Plain "plainto_tsquery("
244 , Escape (cs q)
245 , Plain ")"
246 ]
247 ) xs
248
249 data Order = Asc | Desc
250
251 instance ToField Order
252 where
253 toField Asc = Plain "ASC"
254 toField Desc = Plain "DESC"
255
256 -- TODO
257 -- FIX fav
258 -- ADD ngrams count
259 -- TESTS
260 textSearchQuery :: Query
261 textSearchQuery = "SELECT n.id, n.hyperdata->'publication_year' \
262 \ , n.hyperdata->'title' \
263 \ , n.hyperdata->'source' \
264 \ , n.hyperdata->'authors' \
265 \ , COALESCE(nn.score,null) \
266 \ FROM nodes n \
267 \ LEFT JOIN nodes_nodes nn ON nn.node2_id = n.id \
268 \ WHERE \
269 \ n.search @@ (?::tsquery) \
270 \ AND (n.parent_id = ? OR nn.node1_id = ?) \
271 \ AND n.typename = ? \
272 \ ORDER BY n.hyperdata -> 'publication_date' ? \
273 \ offset ? limit ?;"
274
275 -- | Text Search Function for Master Corpus
276 -- TODO : text search for user corpus
277 -- Example:
278 -- textSearchTest :: ParentId -> TSQuery -> Cmd err [(Int, Value, Value, Value, Value, Maybe Int)]
279 -- textSearchTest pId q = textSearch q pId 5 0 Asc
280 textSearch :: TSQuery -> ParentId
281 -> Limit -> Offset -> Order
282 -> Cmd err [(Int,Value,Value,Value, Value, Maybe Int)]
283 textSearch q p l o ord = runPGSQuery textSearchQuery (q,p,p,typeId,ord,o,l)
284 where
285 typeId = nodeTypeId NodeDocument
286
287