2 Module : Gargantext.Database.TextSearch
3 Description : Postgres text search experimentation
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE Arrows #-}
12 {-# LANGUAGE LambdaCase #-}
14 module Gargantext.Database.Action.Search where
16 import Control.Arrow (returnA)
17 import Control.Lens ((^.), view)
18 import qualified Data.List as List
19 import qualified Data.Map.Strict as Map
21 import qualified Data.Set as Set
22 import Data.Text (Text, unpack, intercalate)
23 import Data.Time (UTCTime)
24 import Gargantext.Core
25 import Gargantext.Core.Types
26 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..))
27 import Gargantext.Database.Prelude (Cmd, runOpaQuery, runCountOpaQuery)
28 import Gargantext.Database.Query.Facet
29 import Gargantext.Database.Query.Filter
30 import Gargantext.Database.Query.Table.Node
31 import Gargantext.Database.Query.Table.Context
32 import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
33 import Gargantext.Database.Query.Table.NodeContext
34 import Gargantext.Database.Query.Table.NodeContext_NodeContext
35 import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
36 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
37 import Gargantext.Database.Schema.Node
38 import Gargantext.Database.Schema.Context
39 import Gargantext.Prelude
40 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
41 import Opaleye hiding (Order)
42 import Data.Profunctor.Product (p4)
43 import qualified Opaleye as O hiding (Order)
45 ------------------------------------------------------------------------
46 searchDocInDatabase :: HasDBid NodeType
49 -> Cmd err [(NodeId, HyperdataDocument)]
50 searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
52 -- | Global search query where ParentId is Master Node Corpus Id
53 queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
54 queryDocInDatabase _p q = proc () -> do
55 row <- queryNodeSearchTable -< ()
56 restrict -< (_ns_search row) @@ (sqlTSQuery (unpack q))
57 restrict -< (_ns_typename row) .== (sqlInt4 $ toDBid NodeDocument)
58 returnA -< (_ns_id row, _ns_hyperdata row)
60 ------------------------------------------------------------------------
61 -- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
62 -- search only to map/candidate terms.
63 searchInCorpusWithNgrams :: HasDBid NodeType
73 searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
75 -- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
76 -- case only the "TF" part makes sense and so we only compute the
77 -- ratio of "number of times our terms appear in given document" and
78 -- "number of all terms in document" and return a sorted list of
80 tfidfAll :: CorpusId -> [Int] -> Cmd err [Int]
81 tfidfAll cId ngramIds = do
82 let ngramIdsSet = Set.fromList ngramIds
83 docsWithNgrams <- runOpaQuery (queryCorpusWithNgrams cId ngramIds) :: Cmd err [(Int, Int, Int)]
84 -- NOTE The query returned docs with ANY ngramIds. We need to further
85 -- restrict to ALL ngramIds.
87 Map.fromListWith (Set.union)
88 [ (ctxId, Set.singleton ngrams_id)
89 | (ctxId, ngrams_id, _) <- docsWithNgrams]
90 let docsWithAllNgramsS = Set.fromList $ List.map fst $
91 List.filter (\(_, docNgrams) ->
92 ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
93 let docsWithAllNgrams =
94 List.filter (\(ctxId, _, _) ->
95 Set.member ctxId docsWithAllNgramsS) docsWithNgrams
96 printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
97 let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
98 | (ctxId, _, doc_count) <- docsWithAllNgrams]
99 printDebug "[tfidfAll] docsWithCounts" docsWithCounts
100 let totals = [ ( ctxId
102 , fromIntegral doc_count :: Double
103 , fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
104 | (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
105 let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
106 | (ctxId, _, doc_count, s) <- totals]
107 pure $ List.map fst $ List.reverse tfidf_sorted
109 -- | Query for searching the 'context_node_ngrams' table so that we
110 -- find docs with ANY given 'ngramIds'.
111 queryCorpusWithNgrams :: CorpusId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
112 queryCorpusWithNgrams cId ngramIds = proc () -> do
113 row <- queryContextNodeNgramsTable -< ()
114 restrict -< (_cnng_node_id row) .== (pgNodeId cId)
115 restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
116 returnA -< ( _cnng_context_id row
117 , _cnng_ngrams_id row
118 , _cnng_doc_count row)
120 -- returnA -< ( _cnng_context_id row
121 -- , _cnng_node_id row
122 -- , _cnng_ngrams_id row
123 -- , _cnng_ngramsType row
124 -- , _cnng_weight row
125 -- , _cnng_doc_count row)
128 ------------------------------------------------------------------------
129 -- | todo add limit and offset and order
130 searchInCorpus :: HasDBid NodeType
137 -> Cmd err [FacetDoc]
138 searchInCorpus cId t q o l order = runOpaQuery
139 $ filterWith o l order
140 $ queryInCorpus cId t
144 searchCountInCorpus :: HasDBid NodeType
149 searchCountInCorpus cId t q = runCountOpaQuery
150 $ queryInCorpus cId t
154 queryInCorpus :: HasDBid NodeType
158 -> O.Select FacetDocRead
159 queryInCorpus cId t q = proc () -> do
160 c <- queryContextSearchTable -< ()
161 nc <- optionalRestrict queryNodeContextTable -<
162 \nc' -> (nc' ^. nc_context_id) .== _cs_id c
163 restrict -< (view nc_node_id <$> nc) .=== justFields (pgNodeId cId)
165 then (view nc_category <$> nc) .=== justFields (sqlInt4 0)
166 else matchMaybe (view nc_category <$> nc) $ \case
167 Nothing -> toFields False
168 Just c' -> c' .>= sqlInt4 1
169 restrict -< (c ^. cs_search) @@ sqlTSQuery (unpack q)
170 restrict -< (c ^. cs_typename ) .== sqlInt4 (toDBid NodeDocument)
171 returnA -< FacetDoc { facetDoc_id = c^.cs_id
172 , facetDoc_created = c^.cs_date
173 , facetDoc_title = c^.cs_name
174 , facetDoc_hyperdata = c^.cs_hyperdata
175 , facetDoc_category = maybeFieldsToNullable (view nc_category <$> nc)
176 , facetDoc_ngramCount = maybeFieldsToNullable (view nc_score <$> nc)
177 , facetDoc_score = maybeFieldsToNullable (view nc_score <$> nc)
180 ------------------------------------------------------------------------
181 searchInCorpusWithContacts
189 -> Cmd err [FacetPaired Int UTCTime HyperdataContact Int]
190 searchInCorpusWithContacts cId aId q o l _order =
191 runOpaQuery $ limit' l
193 $ orderBy (desc _fp_score)
194 $ selectGroup cId aId
198 selectGroup :: HasDBid NodeType
202 -> Select FacetPairedRead
203 selectGroup cId aId q = proc () -> do
204 (a, b, c, d) <- aggregate (p4 (groupBy, groupBy, groupBy, O.sum))
205 (selectContactViaDoc cId aId q) -< ()
206 returnA -< FacetPaired a b c d
216 , Field SqlTimestamptz
220 selectContactViaDoc cId aId query = proc () -> do
221 --(doc, (corpus, (_nodeContext_nodeContext, (annuaire, contact)))) <- queryContactViaDoc -< ()
222 (contact, annuaire, _, corpus, doc) <- queryContactViaDoc -< ()
223 restrict -< matchMaybe (view cs_search <$> doc) $ \case
224 Nothing -> toFields False
225 Just s -> s @@ sqlTSQuery (unpack query)
226 restrict -< (view cs_typename <$> doc) .=== justFields (sqlInt4 (toDBid NodeDocument))
227 restrict -< (view nc_node_id <$> corpus) .=== justFields (pgNodeId cId)
228 restrict -< (view nc_node_id <$> annuaire) .=== justFields (pgNodeId aId)
229 restrict -< (contact ^. context_typename) .== sqlInt4 (toDBid NodeContact)
230 returnA -< ( contact ^. context_id
231 , contact ^. context_date
232 , contact ^. context_hyperdata
236 queryContactViaDoc :: O.Select ( ContextRead
237 , MaybeFields NodeContextRead
238 , MaybeFields NodeContext_NodeContextRead
239 , MaybeFields NodeContextRead
240 , MaybeFields ContextSearchRead )
241 queryContactViaDoc = proc () -> do
242 contact <- queryContextTable -< ()
243 annuaire <- optionalRestrict queryNodeContextTable -<
244 \annuaire' -> (annuaire' ^. nc_context_id) .== (contact ^. context_id)
245 nodeContext_nodeContext <- optionalRestrict queryNodeContext_NodeContextTable -<
246 \ncnc' -> justFields (ncnc' ^. ncnc_nodecontext2) .=== (view nc_id <$> annuaire)
247 corpus <- optionalRestrict queryNodeContextTable -<
248 \corpus' -> justFields (corpus' ^. nc_id) .=== (view ncnc_nodecontext1 <$> nodeContext_nodeContext)
249 doc <- optionalRestrict queryContextSearchTable -<
250 \doc' -> justFields (doc' ^. cs_id) .=== (view nc_context_id <$> corpus)
252 returnA -< (contact, annuaire, nodeContext_nodeContext, corpus, doc)