]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Search.hs
Update README.md
[gargantext.git] / src / Gargantext / Database / Action / Search.hs
1 {-|
2 Module : Gargantext.Database.TextSearch
3 Description : Postgres text search experimentation
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE Arrows #-}
12
13 module Gargantext.Database.Action.Search where
14
15 import Control.Arrow (returnA)
16 import Control.Lens ((^.))
17 import qualified Data.List as List
18 import qualified Data.Map.Strict as Map
19 import Data.Maybe
20 import qualified Data.Set as Set
21 import Data.Text (Text, unpack, intercalate)
22 import Data.Time (UTCTime)
23 import Gargantext.Core
24 import Gargantext.Core.Types
25 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..))
26 import Gargantext.Database.Prelude (Cmd, runOpaQuery, runCountOpaQuery)
27 import Gargantext.Database.Query.Facet
28 import Gargantext.Database.Query.Filter
29 import Gargantext.Database.Query.Join (leftJoin5)
30 import Gargantext.Database.Query.Table.Node
31 import Gargantext.Database.Query.Table.Context
32 import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
33 import Gargantext.Database.Query.Table.NodeContext
34 import Gargantext.Database.Query.Table.NodeContext_NodeContext
35 import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
36 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
37 import Gargantext.Database.Schema.Node
38 import Gargantext.Database.Schema.Context
39 import Gargantext.Prelude
40 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
41 import Opaleye hiding (Order)
42 import Data.Profunctor.Product (p4)
43 import qualified Opaleye as O hiding (Order)
44
45 ------------------------------------------------------------------------
46 searchDocInDatabase :: HasDBid NodeType
47 => ParentId
48 -> Text
49 -> Cmd err [(NodeId, HyperdataDocument)]
50 searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
51 where
52 -- | Global search query where ParentId is Master Node Corpus Id
53 queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
54 queryDocInDatabase _p q = proc () -> do
55 row <- queryNodeSearchTable -< ()
56 restrict -< (_ns_search row) @@ (sqlTSQuery (unpack q))
57 restrict -< (_ns_typename row) .== (sqlInt4 $ toDBid NodeDocument)
58 returnA -< (_ns_id row, _ns_hyperdata row)
59
60 ------------------------------------------------------------------------
61 -- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
62 -- search only to map/candidate terms.
63 searchInCorpusWithNgrams :: HasDBid NodeType
64 => CorpusId
65 -> ListId
66 -> IsTrash
67 -> NgramsType
68 -> [[Text]]
69 -> Maybe Offset
70 -> Maybe Limit
71 -> Maybe OrderBy
72 -> Cmd err [FacetDoc]
73 searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
74
75 -- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
76 -- case only the "TF" part makes sense and so we only compute the
77 -- ratio of "number of times our terms appear in given document" and
78 -- "number of all terms in document" and return a sorted list of
79 -- document ids
80 tfidfAll :: CorpusId -> [Int] -> Cmd err [Int]
81 tfidfAll cId ngramIds = do
82 let ngramIdsSet = Set.fromList ngramIds
83 docsWithNgrams <- runOpaQuery (queryCorpusWithNgrams cId ngramIds) :: Cmd err [(Int, Int, Int)]
84 -- NOTE The query returned docs with ANY ngramIds. We need to further
85 -- restrict to ALL ngramIds.
86 let docsNgramsM =
87 Map.fromListWith (Set.union)
88 [ (ctxId, Set.singleton ngrams_id)
89 | (ctxId, ngrams_id, _) <- docsWithNgrams]
90 let docsWithAllNgramsS = Set.fromList $ List.map fst $
91 List.filter (\(_, docNgrams) ->
92 ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
93 let docsWithAllNgrams =
94 List.filter (\(ctxId, _, _) ->
95 Set.member ctxId docsWithAllNgramsS) docsWithNgrams
96 printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
97 let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
98 | (ctxId, _, doc_count) <- docsWithAllNgrams]
99 printDebug "[tfidfAll] docsWithCounts" docsWithCounts
100 let totals = [ ( ctxId
101 , ngrams_id
102 , fromIntegral doc_count :: Double
103 , fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
104 | (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
105 let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
106 | (ctxId, _, doc_count, s) <- totals]
107 pure $ List.map fst $ List.reverse tfidf_sorted
108
109 -- | Query for searching the 'context_node_ngrams' table so that we
110 -- find docs with ANY given 'ngramIds'.
111 queryCorpusWithNgrams :: CorpusId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
112 queryCorpusWithNgrams cId ngramIds = proc () -> do
113 row <- queryContextNodeNgramsTable -< ()
114 restrict -< (_cnng_node_id row) .== (pgNodeId cId)
115 restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
116 returnA -< ( _cnng_context_id row
117 , _cnng_ngrams_id row
118 , _cnng_doc_count row)
119 --returnA -< row
120 -- returnA -< ( _cnng_context_id row
121 -- , _cnng_node_id row
122 -- , _cnng_ngrams_id row
123 -- , _cnng_ngramsType row
124 -- , _cnng_weight row
125 -- , _cnng_doc_count row)
126
127
128 ------------------------------------------------------------------------
129 -- | todo add limit and offset and order
130 searchInCorpus :: HasDBid NodeType
131 => CorpusId
132 -> IsTrash
133 -> [Text]
134 -> Maybe Offset
135 -> Maybe Limit
136 -> Maybe OrderBy
137 -> Cmd err [FacetDoc]
138 searchInCorpus cId t q o l order = runOpaQuery
139 $ filterWith o l order
140 $ queryInCorpus cId t
141 $ intercalate " | "
142 $ map stemIt q
143
144 searchCountInCorpus :: HasDBid NodeType
145 => CorpusId
146 -> IsTrash
147 -> [Text]
148 -> Cmd err Int
149 searchCountInCorpus cId t q = runCountOpaQuery
150 $ queryInCorpus cId t
151 $ intercalate " | "
152 $ map stemIt q
153
154 queryInCorpus :: HasDBid NodeType
155 => CorpusId
156 -> IsTrash
157 -> Text
158 -> O.Select FacetDocRead
159 queryInCorpus cId t q = proc () -> do
160 (c, nc) <- joinInCorpus -< ()
161 restrict -< (nc^.nc_node_id) .== (toNullable $ pgNodeId cId)
162 restrict -< if t
163 then (nc^.nc_category) .== (toNullable $ sqlInt4 0)
164 else (nc^.nc_category) .>= (toNullable $ sqlInt4 1)
165 restrict -< (c ^. cs_search) @@ (sqlTSQuery (unpack q))
166 restrict -< (c ^. cs_typename ) .== (sqlInt4 $ toDBid NodeDocument)
167 returnA -< FacetDoc { facetDoc_id = c^.cs_id
168 , facetDoc_created = c^.cs_date
169 , facetDoc_title = c^.cs_name
170 , facetDoc_hyperdata = c^.cs_hyperdata
171 , facetDoc_category = nc^.nc_category
172 , facetDoc_ngramCount = nc^.nc_score
173 , facetDoc_score = nc^.nc_score
174 }
175
176 joinInCorpus :: O.Select (ContextSearchRead, NodeContextReadNull)
177 joinInCorpus = leftJoin queryContextSearchTable queryNodeContextTable cond
178 where
179 cond :: (ContextSearchRead, NodeContextRead) -> Column SqlBool
180 cond (c, nc) = nc^.nc_context_id .== _cs_id c
181
182 ------------------------------------------------------------------------
183 searchInCorpusWithContacts
184 :: HasDBid NodeType
185 => CorpusId
186 -> AnnuaireId
187 -> [Text]
188 -> Maybe Offset
189 -> Maybe Limit
190 -> Maybe OrderBy
191 -> Cmd err [FacetPaired Int UTCTime HyperdataContact Int]
192 searchInCorpusWithContacts cId aId q o l _order =
193 runOpaQuery $ limit' l
194 $ offset' o
195 $ orderBy (desc _fp_score)
196 $ selectGroup cId aId
197 $ intercalate " | "
198 $ map stemIt q
199
200 selectGroup :: HasDBid NodeType
201 => CorpusId
202 -> AnnuaireId
203 -> Text
204 -> Select FacetPairedReadNull
205 selectGroup cId aId q = proc () -> do
206 (a, b, c, d) <- aggregate (p4 (groupBy, groupBy, groupBy, O.sum))
207 (selectContactViaDoc cId aId q) -< ()
208 returnA -< FacetPaired a b c d
209
210
211 selectContactViaDoc
212 :: HasDBid NodeType
213 => CorpusId
214 -> AnnuaireId
215 -> Text
216 -> SelectArr ()
217 ( Column (Nullable SqlInt4)
218 , Column (Nullable SqlTimestamptz)
219 , Column (Nullable SqlJsonb)
220 , Column (Nullable SqlInt4)
221 )
222 selectContactViaDoc cId aId query = proc () -> do
223 (doc, (corpus, (_nodeContext_nodeContext, (annuaire, contact)))) <- queryContactViaDoc -< ()
224 restrict -< (doc^.cs_search) @@ (sqlTSQuery $ unpack query )
225 restrict -< (doc^.cs_typename) .== (sqlInt4 $ toDBid NodeDocument )
226 restrict -< (corpus^.nc_node_id) .== (toNullable $ pgNodeId cId )
227 restrict -< (annuaire^.nc_node_id) .== (toNullable $ pgNodeId aId )
228 restrict -< (contact^.context_typename) .== (toNullable $ sqlInt4 $ toDBid NodeContact)
229 returnA -< ( contact^.context_id
230 , contact^.context_date
231 , contact^.context_hyperdata
232 , toNullable $ sqlInt4 1
233 )
234
235 queryContactViaDoc :: O.Select ( ContextSearchRead
236 , ( NodeContextReadNull
237 , ( NodeContext_NodeContextReadNull
238 , ( NodeContextReadNull
239 , ContextReadNull
240 )
241 )
242 )
243 )
244 queryContactViaDoc =
245 leftJoin5
246 queryContextTable
247 queryNodeContextTable
248 queryNodeContext_NodeContextTable
249 queryNodeContextTable
250 queryContextSearchTable
251 cond12
252 cond23
253 cond34
254 cond45
255 where
256 cond12 :: (NodeContextRead, ContextRead) -> Column SqlBool
257 cond12 (annuaire, contact) = contact^.context_id .== annuaire^.nc_context_id
258
259 cond23 :: ( NodeContext_NodeContextRead
260 , ( NodeContextRead
261 , ContextReadNull
262 )
263 ) -> Column SqlBool
264 cond23 (nodeContext_nodeContext, (annuaire, _)) = nodeContext_nodeContext^.ncnc_nodecontext2 .== annuaire^.nc_id
265
266 cond34 :: ( NodeContextRead
267 , ( NodeContext_NodeContextRead
268 , ( NodeContextReadNull
269 , ContextReadNull
270 )
271 )
272 ) -> Column SqlBool
273 cond34 (corpus, (nodeContext_nodeContext, (_,_))) = nodeContext_nodeContext^.ncnc_nodecontext1 .== corpus^.nc_id
274
275
276 cond45 :: ( ContextSearchRead
277 , ( NodeContextRead
278 , ( NodeContext_NodeContextReadNull
279 , ( NodeContextReadNull
280 , ContextReadNull
281 )
282 )
283 )
284 ) -> Column SqlBool
285 cond45 (doc, (corpus, (_,(_,_)))) = doc^.cs_id .== corpus^.nc_context_id