2 Module : Gargantext.Database.TextSearch
3 Description : Postgres text search experimentation
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE Arrows #-}
12 {-# LANGUAGE LambdaCase #-}
14 module Gargantext.Database.Action.Search where
16 import Control.Arrow (returnA)
17 import Control.Lens ((^.), view)
18 import qualified Data.List as List
19 import qualified Data.Map.Strict as Map
21 import qualified Data.Set as Set
22 import Data.Text (Text, unpack, intercalate)
23 import Data.Time (UTCTime)
24 import Gargantext.Core
25 import Gargantext.Core.Types
26 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..))
27 import Gargantext.Database.Prelude (Cmd, runOpaQuery, runCountOpaQuery)
28 import Gargantext.Database.Query.Facet
29 import Gargantext.Database.Query.Filter
30 import Gargantext.Database.Query.Table.Node
31 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
32 import Gargantext.Database.Query.Table.Context
33 import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
34 import Gargantext.Database.Query.Table.NodeContext
35 import Gargantext.Database.Query.Table.NodeContext_NodeContext
36 import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
37 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
38 import Gargantext.Database.Schema.Node
39 import Gargantext.Database.Schema.Context
40 import Gargantext.Prelude
41 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
42 import Opaleye hiding (Order)
43 import Data.Profunctor.Product (p4)
44 import qualified Opaleye as O hiding (Order)
46 ------------------------------------------------------------------------
47 searchDocInDatabase :: HasDBid NodeType
50 -> Cmd err [(NodeId, HyperdataDocument)]
51 searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
53 -- | Global search query where ParentId is Master Node Corpus Id
54 queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
55 queryDocInDatabase _p q = proc () -> do
56 row <- queryNodeSearchTable -< ()
57 restrict -< (_ns_search row) @@ (sqlTSQuery (unpack q))
58 restrict -< (_ns_typename row) .== (sqlInt4 $ toDBid NodeDocument)
59 returnA -< (_ns_id row, _ns_hyperdata row)
61 ------------------------------------------------------------------------
62 -- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
63 -- search only to map/candidate terms.
64 searchInCorpusWithNgrams :: HasDBid NodeType
74 searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
76 -- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
77 -- case only the "TF" part makes sense and so we only compute the
78 -- ratio of "number of times our terms appear in given document" and
79 -- "number of all terms in document" and return a sorted list of
81 tfidfAll :: (HasDBid NodeType, HasNodeError err) => CorpusId -> [Int] -> Cmd err [Int]
82 tfidfAll cId ngramIds = do
83 let ngramIdsSet = Set.fromList ngramIds
84 lId <- defaultList cId
85 docsWithNgrams <- runOpaQuery (queryListWithNgrams lId ngramIds) :: Cmd err [(Int, Int, Int)]
86 -- NOTE The query returned docs with ANY ngramIds. We need to further
87 -- restrict to ALL ngramIds.
89 Map.fromListWith (Set.union)
90 [ (ctxId, Set.singleton ngrams_id)
91 | (ctxId, ngrams_id, _) <- docsWithNgrams]
92 let docsWithAllNgramsS = Set.fromList $ List.map fst $
93 List.filter (\(_, docNgrams) ->
94 ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
95 let docsWithAllNgrams =
96 List.filter (\(ctxId, _, _) ->
97 Set.member ctxId docsWithAllNgramsS) docsWithNgrams
98 -- printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
99 let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
100 | (ctxId, _, doc_count) <- docsWithAllNgrams]
101 -- printDebug "[tfidfAll] docsWithCounts" docsWithCounts
102 let totals = [ ( ctxId
104 , fromIntegral doc_count :: Double
105 , fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
106 | (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
107 let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
108 | (ctxId, _, doc_count, s) <- totals]
109 pure $ List.map fst $ List.reverse tfidf_sorted
111 -- | Query for searching the 'context_node_ngrams' table so that we
112 -- find docs with ANY given 'ngramIds'.
113 queryListWithNgrams :: ListId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
114 queryListWithNgrams lId ngramIds = proc () -> do
115 row <- queryContextNodeNgramsTable -< ()
116 restrict -< (_cnng_node_id row) .== (pgNodeId lId)
117 restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
118 returnA -< ( _cnng_context_id row
119 , _cnng_ngrams_id row
120 , _cnng_doc_count row )
122 -- returnA -< ( _cnng_context_id row
123 -- , _cnng_node_id row
124 -- , _cnng_ngrams_id row
125 -- , _cnng_ngramsType row
126 -- , _cnng_weight row
127 -- , _cnng_doc_count row)
130 ------------------------------------------------------------------------
131 -- | todo add limit and offset and order
132 searchInCorpus :: HasDBid NodeType
139 -> Cmd err [FacetDoc]
140 searchInCorpus cId t q o l order = runOpaQuery
141 $ filterWith o l order
142 $ queryInCorpus cId t
146 searchCountInCorpus :: HasDBid NodeType
151 searchCountInCorpus cId t q = runCountOpaQuery
152 $ queryInCorpus cId t
156 queryInCorpus :: HasDBid NodeType
160 -> O.Select FacetDocRead
161 queryInCorpus cId t q = proc () -> do
162 c <- queryContextSearchTable -< ()
163 nc <- optionalRestrict queryNodeContextTable -<
164 \nc' -> (nc' ^. nc_context_id) .== _cs_id c
165 restrict -< (view nc_node_id <$> nc) .=== justFields (pgNodeId cId)
167 then (view nc_category <$> nc) .=== justFields (sqlInt4 0)
168 else matchMaybe (view nc_category <$> nc) $ \case
169 Nothing -> toFields False
170 Just c' -> c' .>= sqlInt4 1
171 restrict -< (c ^. cs_search) @@ sqlTSQuery (unpack q)
172 restrict -< (c ^. cs_typename ) .== sqlInt4 (toDBid NodeDocument)
173 returnA -< FacetDoc { facetDoc_id = c^.cs_id
174 , facetDoc_created = c^.cs_date
175 , facetDoc_title = c^.cs_name
176 , facetDoc_hyperdata = c^.cs_hyperdata
177 , facetDoc_category = maybeFieldsToNullable (view nc_category <$> nc)
178 , facetDoc_ngramCount = maybeFieldsToNullable (view nc_score <$> nc)
179 , facetDoc_score = maybeFieldsToNullable (view nc_score <$> nc)
182 ------------------------------------------------------------------------
183 searchInCorpusWithContacts
191 -> Cmd err [FacetPaired Int UTCTime HyperdataContact Int]
192 searchInCorpusWithContacts cId aId q o l _order =
193 runOpaQuery $ limit' l
195 $ orderBy (desc _fp_score)
196 $ selectGroup cId aId
200 selectGroup :: HasDBid NodeType
204 -> Select FacetPairedRead
205 selectGroup cId aId q = proc () -> do
206 (a, b, c, d) <- aggregate (p4 (groupBy, groupBy, groupBy, O.sum))
207 (selectContactViaDoc cId aId q) -< ()
208 returnA -< FacetPaired a b c d
218 , Field SqlTimestamptz
222 selectContactViaDoc cId aId query = proc () -> do
223 --(doc, (corpus, (_nodeContext_nodeContext, (annuaire, contact)))) <- queryContactViaDoc -< ()
224 (contact, annuaire, _, corpus, doc) <- queryContactViaDoc -< ()
225 restrict -< matchMaybe (view cs_search <$> doc) $ \case
226 Nothing -> toFields False
227 Just s -> s @@ sqlTSQuery (unpack query)
228 restrict -< (view cs_typename <$> doc) .=== justFields (sqlInt4 (toDBid NodeDocument))
229 restrict -< (view nc_node_id <$> corpus) .=== justFields (pgNodeId cId)
230 restrict -< (view nc_node_id <$> annuaire) .=== justFields (pgNodeId aId)
231 restrict -< (contact ^. context_typename) .== sqlInt4 (toDBid NodeContact)
232 returnA -< ( contact ^. context_id
233 , contact ^. context_date
234 , contact ^. context_hyperdata
238 queryContactViaDoc :: O.Select ( ContextRead
239 , MaybeFields NodeContextRead
240 , MaybeFields NodeContext_NodeContextRead
241 , MaybeFields NodeContextRead
242 , MaybeFields ContextSearchRead )
243 queryContactViaDoc = proc () -> do
244 contact <- queryContextTable -< ()
245 annuaire <- optionalRestrict queryNodeContextTable -<
246 \annuaire' -> (annuaire' ^. nc_context_id) .== (contact ^. context_id)
247 nodeContext_nodeContext <- optionalRestrict queryNodeContext_NodeContextTable -<
248 \ncnc' -> justFields (ncnc' ^. ncnc_nodecontext2) .=== (view nc_id <$> annuaire)
249 corpus <- optionalRestrict queryNodeContextTable -<
250 \corpus' -> justFields (corpus' ^. nc_id) .=== (view ncnc_nodecontext1 <$> nodeContext_nodeContext)
251 doc <- optionalRestrict queryContextSearchTable -<
252 \doc' -> justFields (doc' ^. cs_id) .=== (view nc_context_id <$> corpus)
254 returnA -< (contact, annuaire, nodeContext_nodeContext, corpus, doc)