2 Module : Gargantext.Database.TextSearch
3 Description : Postgres text search experimentation
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE Arrows #-}
12 {-# LANGUAGE LambdaCase #-}
14 module Gargantext.Database.Action.Search where
16 import Control.Arrow (returnA)
17 import Control.Lens ((^.), view)
18 import qualified Data.List as List
19 import qualified Data.Map.Strict as Map
21 import qualified Data.Set as Set
22 import Data.Text (Text, unpack, intercalate)
23 import Data.Time (UTCTime)
24 import Gargantext.Core
25 import Gargantext.Core.Types
26 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..))
27 import Gargantext.Database.Prelude (Cmd, runOpaQuery, runCountOpaQuery)
28 import Gargantext.Database.Query.Facet
29 import Gargantext.Database.Query.Filter
30 import Gargantext.Database.Query.Join (leftJoin5)
31 import Gargantext.Database.Query.Table.Node
32 import Gargantext.Database.Query.Table.Context
33 import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
34 import Gargantext.Database.Query.Table.NodeContext
35 import Gargantext.Database.Query.Table.NodeContext_NodeContext
36 import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
37 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
38 import Gargantext.Database.Schema.Node
39 import Gargantext.Database.Schema.Context
40 import Gargantext.Prelude
41 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
42 import Opaleye hiding (Order)
43 import Data.Profunctor.Product (p4)
44 import qualified Opaleye as O hiding (Order)
46 ------------------------------------------------------------------------
47 searchDocInDatabase :: HasDBid NodeType
50 -> Cmd err [(NodeId, HyperdataDocument)]
51 searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
53 -- | Global search query where ParentId is Master Node Corpus Id
54 queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
55 queryDocInDatabase _p q = proc () -> do
56 row <- queryNodeSearchTable -< ()
57 restrict -< (_ns_search row) @@ (sqlTSQuery (unpack q))
58 restrict -< (_ns_typename row) .== (sqlInt4 $ toDBid NodeDocument)
59 returnA -< (_ns_id row, _ns_hyperdata row)
61 ------------------------------------------------------------------------
62 -- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
63 -- search only to map/candidate terms.
64 searchInCorpusWithNgrams :: HasDBid NodeType
74 searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
76 -- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
77 -- case only the "TF" part makes sense and so we only compute the
78 -- ratio of "number of times our terms appear in given document" and
79 -- "number of all terms in document" and return a sorted list of
81 tfidfAll :: CorpusId -> [Int] -> Cmd err [Int]
82 tfidfAll cId ngramIds = do
83 let ngramIdsSet = Set.fromList ngramIds
84 docsWithNgrams <- runOpaQuery (queryCorpusWithNgrams cId ngramIds) :: Cmd err [(Int, Int, Int)]
85 -- NOTE The query returned docs with ANY ngramIds. We need to further
86 -- restrict to ALL ngramIds.
88 Map.fromListWith (Set.union)
89 [ (ctxId, Set.singleton ngrams_id)
90 | (ctxId, ngrams_id, _) <- docsWithNgrams]
91 let docsWithAllNgramsS = Set.fromList $ List.map fst $
92 List.filter (\(_, docNgrams) ->
93 ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
94 let docsWithAllNgrams =
95 List.filter (\(ctxId, _, _) ->
96 Set.member ctxId docsWithAllNgramsS) docsWithNgrams
97 printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
98 let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
99 | (ctxId, _, doc_count) <- docsWithAllNgrams]
100 printDebug "[tfidfAll] docsWithCounts" docsWithCounts
101 let totals = [ ( ctxId
103 , fromIntegral doc_count :: Double
104 , fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
105 | (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
106 let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
107 | (ctxId, _, doc_count, s) <- totals]
108 pure $ List.map fst $ List.reverse tfidf_sorted
110 -- | Query for searching the 'context_node_ngrams' table so that we
111 -- find docs with ANY given 'ngramIds'.
112 queryCorpusWithNgrams :: CorpusId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
113 queryCorpusWithNgrams cId ngramIds = proc () -> do
114 row <- queryContextNodeNgramsTable -< ()
115 restrict -< (_cnng_node_id row) .== (pgNodeId cId)
116 restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
117 returnA -< ( _cnng_context_id row
118 , _cnng_ngrams_id row
119 , _cnng_doc_count row)
121 -- returnA -< ( _cnng_context_id row
122 -- , _cnng_node_id row
123 -- , _cnng_ngrams_id row
124 -- , _cnng_ngramsType row
125 -- , _cnng_weight row
126 -- , _cnng_doc_count row)
129 ------------------------------------------------------------------------
130 -- | todo add limit and offset and order
131 searchInCorpus :: HasDBid NodeType
138 -> Cmd err [FacetDoc]
139 searchInCorpus cId t q o l order = runOpaQuery
140 $ filterWith o l order
141 $ queryInCorpus cId t
145 searchCountInCorpus :: HasDBid NodeType
150 searchCountInCorpus cId t q = runCountOpaQuery
151 $ queryInCorpus cId t
155 queryInCorpus :: HasDBid NodeType
159 -> O.Select FacetDocRead
160 queryInCorpus cId t q = proc () -> do
161 c <- queryContextSearchTable -< ()
162 nc <- optionalRestrict queryNodeContextTable -<
163 \nc' -> (nc' ^. nc_context_id) .== _cs_id c
164 restrict -< (view nc_node_id <$> nc) .=== justFields (pgNodeId cId)
166 then (view nc_category <$> nc) .=== justFields (sqlInt4 0)
167 else matchMaybe (view nc_category <$> nc) $ \case
168 Nothing -> toFields False
169 Just c' -> c' .>= sqlInt4 1
170 restrict -< (c ^. cs_search) @@ sqlTSQuery (unpack q)
171 restrict -< (c ^. cs_typename ) .== sqlInt4 (toDBid NodeDocument)
172 returnA -< FacetDoc { facetDoc_id = c^.cs_id
173 , facetDoc_created = c^.cs_date
174 , facetDoc_title = c^.cs_name
175 , facetDoc_hyperdata = c^.cs_hyperdata
176 , facetDoc_category = maybeFieldsToNullable (view nc_category <$> nc)
177 , facetDoc_ngramCount = maybeFieldsToNullable (view nc_score <$> nc)
178 , facetDoc_score = maybeFieldsToNullable (view nc_score <$> nc)
181 ------------------------------------------------------------------------
182 searchInCorpusWithContacts
190 -> Cmd err [FacetPaired Int UTCTime HyperdataContact Int]
191 searchInCorpusWithContacts cId aId q o l _order =
192 runOpaQuery $ limit' l
194 $ orderBy (desc _fp_score)
195 $ selectGroup cId aId
199 selectGroup :: HasDBid NodeType
203 -> Select FacetPairedRead
204 selectGroup cId aId q = proc () -> do
205 (a, b, c, d) <- aggregate (p4 (groupBy, groupBy, groupBy, O.sum))
206 (selectContactViaDoc cId aId q) -< ()
207 returnA -< FacetPaired a b c d
217 , Field SqlTimestamptz
221 selectContactViaDoc cId aId query = proc () -> do
222 --(doc, (corpus, (_nodeContext_nodeContext, (annuaire, contact)))) <- queryContactViaDoc -< ()
223 (contact, annuaire, _, corpus, doc) <- queryContactViaDoc -< ()
224 restrict -< matchMaybe (view cs_search <$> doc) $ \case
225 Nothing -> toFields False
226 Just s -> s @@ sqlTSQuery (unpack query)
227 restrict -< (view cs_typename <$> doc) .=== justFields (sqlInt4 (toDBid NodeDocument))
228 restrict -< (view nc_node_id <$> corpus) .=== justFields (pgNodeId cId)
229 restrict -< (view nc_node_id <$> annuaire) .=== justFields (pgNodeId aId)
230 restrict -< (contact ^. context_typename) .== sqlInt4 (toDBid NodeContact)
231 returnA -< ( contact ^. context_id
232 , contact ^. context_date
233 , contact ^. context_hyperdata
237 queryContactViaDoc :: O.Select ( ContextRead
238 , MaybeFields NodeContextRead
239 , MaybeFields NodeContext_NodeContextRead
240 , MaybeFields NodeContextRead
241 , MaybeFields ContextSearchRead )
242 queryContactViaDoc = proc () -> do
243 contact <- queryContextTable -< ()
244 annuaire <- optionalRestrict queryNodeContextTable -<
245 \annuaire' -> (annuaire' ^. nc_context_id) .== (contact ^. context_id)
246 nodeContext_nodeContext <- optionalRestrict queryNodeContext_NodeContextTable -<
247 \ncnc' -> justFields (ncnc' ^. ncnc_nodecontext2) .=== (view nc_id <$> annuaire)
248 corpus <- optionalRestrict queryNodeContextTable -<
249 \corpus' -> justFields (corpus' ^. nc_id) .=== (view ncnc_nodecontext1 <$> nodeContext_nodeContext)
250 doc <- optionalRestrict queryContextSearchTable -<
251 \doc' -> justFields (doc' ^. cs_id) .=== (view nc_context_id <$> corpus)
253 returnA -< (contact, annuaire, nodeContext_nodeContext, corpus, doc)
255 queryContactViaDoc' :: O.Select ( ContextSearchRead
256 , ( NodeContextReadNull
257 , ( NodeContext_NodeContextReadNull
258 , ( NodeContextReadNull
264 queryContactViaDoc' =
267 queryNodeContextTable
268 queryNodeContext_NodeContextTable
269 queryNodeContextTable
270 queryContextSearchTable
276 cond12 :: (NodeContextRead, ContextRead) -> Column SqlBool
277 cond12 (annuaire, contact) = contact^.context_id .== annuaire^.nc_context_id
279 cond23 :: ( NodeContext_NodeContextRead
284 cond23 (nodeContext_nodeContext, (annuaire, _)) = nodeContext_nodeContext^.ncnc_nodecontext2 .== annuaire^.nc_id
286 cond34 :: ( NodeContextRead
287 , ( NodeContext_NodeContextRead
288 , ( NodeContextReadNull
293 cond34 (corpus, (nodeContext_nodeContext, (_,_))) = nodeContext_nodeContext^.ncnc_nodecontext1 .== corpus^.nc_id
296 cond45 :: ( ContextSearchRead
298 , ( NodeContext_NodeContextReadNull
299 , ( NodeContextReadNull
305 cond45 (doc, (corpus, (_,(_,_)))) = doc^.cs_id .== corpus^.nc_context_id