]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Search.hs
Merge remote-tracking branch 'origin/dev-177-DoorWelcome' into dev
[gargantext.git] / src / Gargantext / Database / Action / Search.hs
1 {-|
2 Module : Gargantext.Database.TextSearch
3 Description : Postgres text search experimentation
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE Arrows #-}
12 {-# LANGUAGE LambdaCase #-}
13
14 module Gargantext.Database.Action.Search where
15
16 import Control.Arrow (returnA)
17 import Control.Lens ((^.), view)
18 import qualified Data.List as List
19 import qualified Data.Map.Strict as Map
20 import Data.Maybe
21 import qualified Data.Set as Set
22 import Data.Text (Text, unpack, intercalate)
23 import Data.Time (UTCTime)
24 import Gargantext.Core
25 import Gargantext.Core.Types
26 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..))
27 import Gargantext.Database.Prelude (Cmd, runOpaQuery, runCountOpaQuery)
28 import Gargantext.Database.Query.Facet
29 import Gargantext.Database.Query.Filter
30 import Gargantext.Database.Query.Join (leftJoin5)
31 import Gargantext.Database.Query.Table.Node
32 import Gargantext.Database.Query.Table.Context
33 import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
34 import Gargantext.Database.Query.Table.NodeContext
35 import Gargantext.Database.Query.Table.NodeContext_NodeContext
36 import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
37 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
38 import Gargantext.Database.Schema.Node
39 import Gargantext.Database.Schema.Context
40 import Gargantext.Prelude
41 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
42 import Opaleye hiding (Order)
43 import Data.Profunctor.Product (p4)
44 import qualified Opaleye as O hiding (Order)
45
46 ------------------------------------------------------------------------
47 searchDocInDatabase :: HasDBid NodeType
48 => ParentId
49 -> Text
50 -> Cmd err [(NodeId, HyperdataDocument)]
51 searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
52 where
53 -- | Global search query where ParentId is Master Node Corpus Id
54 queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
55 queryDocInDatabase _p q = proc () -> do
56 row <- queryNodeSearchTable -< ()
57 restrict -< (_ns_search row) @@ (sqlTSQuery (unpack q))
58 restrict -< (_ns_typename row) .== (sqlInt4 $ toDBid NodeDocument)
59 returnA -< (_ns_id row, _ns_hyperdata row)
60
61 ------------------------------------------------------------------------
62 -- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
63 -- search only to map/candidate terms.
64 searchInCorpusWithNgrams :: HasDBid NodeType
65 => CorpusId
66 -> ListId
67 -> IsTrash
68 -> NgramsType
69 -> [[Text]]
70 -> Maybe Offset
71 -> Maybe Limit
72 -> Maybe OrderBy
73 -> Cmd err [FacetDoc]
74 searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
75
76 -- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
77 -- case only the "TF" part makes sense and so we only compute the
78 -- ratio of "number of times our terms appear in given document" and
79 -- "number of all terms in document" and return a sorted list of
80 -- document ids
81 tfidfAll :: CorpusId -> [Int] -> Cmd err [Int]
82 tfidfAll cId ngramIds = do
83 let ngramIdsSet = Set.fromList ngramIds
84 docsWithNgrams <- runOpaQuery (queryCorpusWithNgrams cId ngramIds) :: Cmd err [(Int, Int, Int)]
85 -- NOTE The query returned docs with ANY ngramIds. We need to further
86 -- restrict to ALL ngramIds.
87 let docsNgramsM =
88 Map.fromListWith (Set.union)
89 [ (ctxId, Set.singleton ngrams_id)
90 | (ctxId, ngrams_id, _) <- docsWithNgrams]
91 let docsWithAllNgramsS = Set.fromList $ List.map fst $
92 List.filter (\(_, docNgrams) ->
93 ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
94 let docsWithAllNgrams =
95 List.filter (\(ctxId, _, _) ->
96 Set.member ctxId docsWithAllNgramsS) docsWithNgrams
97 printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
98 let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
99 | (ctxId, _, doc_count) <- docsWithAllNgrams]
100 printDebug "[tfidfAll] docsWithCounts" docsWithCounts
101 let totals = [ ( ctxId
102 , ngrams_id
103 , fromIntegral doc_count :: Double
104 , fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
105 | (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
106 let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
107 | (ctxId, _, doc_count, s) <- totals]
108 pure $ List.map fst $ List.reverse tfidf_sorted
109
110 -- | Query for searching the 'context_node_ngrams' table so that we
111 -- find docs with ANY given 'ngramIds'.
112 queryCorpusWithNgrams :: CorpusId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
113 queryCorpusWithNgrams cId ngramIds = proc () -> do
114 row <- queryContextNodeNgramsTable -< ()
115 restrict -< (_cnng_node_id row) .== (pgNodeId cId)
116 restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
117 returnA -< ( _cnng_context_id row
118 , _cnng_ngrams_id row
119 , _cnng_doc_count row)
120 --returnA -< row
121 -- returnA -< ( _cnng_context_id row
122 -- , _cnng_node_id row
123 -- , _cnng_ngrams_id row
124 -- , _cnng_ngramsType row
125 -- , _cnng_weight row
126 -- , _cnng_doc_count row)
127
128
129 ------------------------------------------------------------------------
130 -- | todo add limit and offset and order
131 searchInCorpus :: HasDBid NodeType
132 => CorpusId
133 -> IsTrash
134 -> [Text]
135 -> Maybe Offset
136 -> Maybe Limit
137 -> Maybe OrderBy
138 -> Cmd err [FacetDoc]
139 searchInCorpus cId t q o l order = runOpaQuery
140 $ filterWith o l order
141 $ queryInCorpus cId t
142 $ intercalate " | "
143 $ map stemIt q
144
145 searchCountInCorpus :: HasDBid NodeType
146 => CorpusId
147 -> IsTrash
148 -> [Text]
149 -> Cmd err Int
150 searchCountInCorpus cId t q = runCountOpaQuery
151 $ queryInCorpus cId t
152 $ intercalate " | "
153 $ map stemIt q
154
155 queryInCorpus :: HasDBid NodeType
156 => CorpusId
157 -> IsTrash
158 -> Text
159 -> O.Select FacetDocRead
160 queryInCorpus cId t q = proc () -> do
161 c <- queryContextSearchTable -< ()
162 nc <- optionalRestrict queryNodeContextTable -<
163 \nc' -> (nc' ^. nc_context_id) .== _cs_id c
164 restrict -< (view nc_node_id <$> nc) .=== justFields (pgNodeId cId)
165 restrict -< if t
166 then (view nc_category <$> nc) .=== justFields (sqlInt4 0)
167 else matchMaybe (view nc_category <$> nc) $ \case
168 Nothing -> toFields False
169 Just c' -> c' .>= sqlInt4 1
170 restrict -< (c ^. cs_search) @@ sqlTSQuery (unpack q)
171 restrict -< (c ^. cs_typename ) .== sqlInt4 (toDBid NodeDocument)
172 returnA -< FacetDoc { facetDoc_id = c^.cs_id
173 , facetDoc_created = c^.cs_date
174 , facetDoc_title = c^.cs_name
175 , facetDoc_hyperdata = c^.cs_hyperdata
176 , facetDoc_category = maybeFieldsToNullable (view nc_category <$> nc)
177 , facetDoc_ngramCount = maybeFieldsToNullable (view nc_score <$> nc)
178 , facetDoc_score = maybeFieldsToNullable (view nc_score <$> nc)
179 }
180
181 ------------------------------------------------------------------------
182 searchInCorpusWithContacts
183 :: HasDBid NodeType
184 => CorpusId
185 -> AnnuaireId
186 -> [Text]
187 -> Maybe Offset
188 -> Maybe Limit
189 -> Maybe OrderBy
190 -> Cmd err [FacetPaired Int UTCTime HyperdataContact Int]
191 searchInCorpusWithContacts cId aId q o l _order =
192 runOpaQuery $ limit' l
193 $ offset' o
194 $ orderBy (desc _fp_score)
195 $ selectGroup cId aId
196 $ intercalate " | "
197 $ map stemIt q
198
199 selectGroup :: HasDBid NodeType
200 => CorpusId
201 -> AnnuaireId
202 -> Text
203 -> Select FacetPairedRead
204 selectGroup cId aId q = proc () -> do
205 (a, b, c, d) <- aggregate (p4 (groupBy, groupBy, groupBy, O.sum))
206 (selectContactViaDoc cId aId q) -< ()
207 returnA -< FacetPaired a b c d
208
209
210 selectContactViaDoc
211 :: HasDBid NodeType
212 => CorpusId
213 -> AnnuaireId
214 -> Text
215 -> SelectArr ()
216 ( Field SqlInt4
217 , Field SqlTimestamptz
218 , Field SqlJsonb
219 , Field SqlInt4
220 )
221 selectContactViaDoc cId aId query = proc () -> do
222 --(doc, (corpus, (_nodeContext_nodeContext, (annuaire, contact)))) <- queryContactViaDoc -< ()
223 (contact, annuaire, _, corpus, doc) <- queryContactViaDoc -< ()
224 restrict -< matchMaybe (view cs_search <$> doc) $ \case
225 Nothing -> toFields False
226 Just s -> s @@ sqlTSQuery (unpack query)
227 restrict -< (view cs_typename <$> doc) .=== justFields (sqlInt4 (toDBid NodeDocument))
228 restrict -< (view nc_node_id <$> corpus) .=== justFields (pgNodeId cId)
229 restrict -< (view nc_node_id <$> annuaire) .=== justFields (pgNodeId aId)
230 restrict -< (contact ^. context_typename) .== sqlInt4 (toDBid NodeContact)
231 returnA -< ( contact ^. context_id
232 , contact ^. context_date
233 , contact ^. context_hyperdata
234 , sqlInt4 1
235 )
236
237 queryContactViaDoc :: O.Select ( ContextRead
238 , MaybeFields NodeContextRead
239 , MaybeFields NodeContext_NodeContextRead
240 , MaybeFields NodeContextRead
241 , MaybeFields ContextSearchRead )
242 queryContactViaDoc = proc () -> do
243 contact <- queryContextTable -< ()
244 annuaire <- optionalRestrict queryNodeContextTable -<
245 \annuaire' -> (annuaire' ^. nc_context_id) .== (contact ^. context_id)
246 nodeContext_nodeContext <- optionalRestrict queryNodeContext_NodeContextTable -<
247 \ncnc' -> justFields (ncnc' ^. ncnc_nodecontext2) .=== (view nc_id <$> annuaire)
248 corpus <- optionalRestrict queryNodeContextTable -<
249 \corpus' -> justFields (corpus' ^. nc_id) .=== (view ncnc_nodecontext1 <$> nodeContext_nodeContext)
250 doc <- optionalRestrict queryContextSearchTable -<
251 \doc' -> justFields (doc' ^. cs_id) .=== (view nc_context_id <$> corpus)
252
253 returnA -< (contact, annuaire, nodeContext_nodeContext, corpus, doc)
254
255 queryContactViaDoc' :: O.Select ( ContextSearchRead
256 , ( NodeContextReadNull
257 , ( NodeContext_NodeContextReadNull
258 , ( NodeContextReadNull
259 , ContextReadNull
260 )
261 )
262 )
263 )
264 queryContactViaDoc' =
265 leftJoin5
266 queryContextTable
267 queryNodeContextTable
268 queryNodeContext_NodeContextTable
269 queryNodeContextTable
270 queryContextSearchTable
271 cond12
272 cond23
273 cond34
274 cond45
275 where
276 cond12 :: (NodeContextRead, ContextRead) -> Column SqlBool
277 cond12 (annuaire, contact) = contact^.context_id .== annuaire^.nc_context_id
278
279 cond23 :: ( NodeContext_NodeContextRead
280 , ( NodeContextRead
281 , ContextReadNull
282 )
283 ) -> Column SqlBool
284 cond23 (nodeContext_nodeContext, (annuaire, _)) = nodeContext_nodeContext^.ncnc_nodecontext2 .== annuaire^.nc_id
285
286 cond34 :: ( NodeContextRead
287 , ( NodeContext_NodeContextRead
288 , ( NodeContextReadNull
289 , ContextReadNull
290 )
291 )
292 ) -> Column SqlBool
293 cond34 (corpus, (nodeContext_nodeContext, (_,_))) = nodeContext_nodeContext^.ncnc_nodecontext1 .== corpus^.nc_id
294
295
296 cond45 :: ( ContextSearchRead
297 , ( NodeContextRead
298 , ( NodeContext_NodeContextReadNull
299 , ( NodeContextReadNull
300 , ContextReadNull
301 )
302 )
303 )
304 ) -> Column SqlBool
305 cond45 (doc, (corpus, (_,(_,_)))) = doc^.cs_id .== corpus^.nc_context_id