]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Search.hs
[FIX] Clean Text before sending it to NLP micro services + tests + clean code for...
[gargantext.git] / src / Gargantext / Database / Action / Search.hs
1 {-|
2 Module : Gargantext.Database.TextSearch
3 Description : Postgres text search experimentation
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE Arrows #-}
12 {-# LANGUAGE LambdaCase #-}
13
14 module Gargantext.Database.Action.Search where
15
16 import Control.Arrow (returnA)
17 import Control.Lens ((^.), view)
18 import qualified Data.List as List
19 import qualified Data.Map.Strict as Map
20 import Data.Maybe
21 import qualified Data.Set as Set
22 import Data.Text (Text, unpack, intercalate)
23 import Data.Time (UTCTime)
24 import Gargantext.Core
25 import Gargantext.Core.Types
26 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..))
27 import Gargantext.Database.Prelude (Cmd, runOpaQuery, runCountOpaQuery)
28 import Gargantext.Database.Query.Facet
29 import Gargantext.Database.Query.Filter
30 import Gargantext.Database.Query.Table.Node
31 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
32 import Gargantext.Database.Query.Table.Context
33 import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
34 import Gargantext.Database.Query.Table.NodeContext
35 import Gargantext.Database.Query.Table.NodeContext_NodeContext
36 import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
37 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
38 import Gargantext.Database.Schema.Node
39 import Gargantext.Database.Schema.Context
40 import Gargantext.Prelude
41 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
42 import Opaleye hiding (Order)
43 import Data.Profunctor.Product (p4)
44 import qualified Opaleye as O hiding (Order)
45
46 ------------------------------------------------------------------------
47 searchDocInDatabase :: HasDBid NodeType
48 => ParentId
49 -> Text
50 -> Cmd err [(NodeId, HyperdataDocument)]
51 searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
52 where
53 -- | Global search query where ParentId is Master Node Corpus Id
54 queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
55 queryDocInDatabase _p q = proc () -> do
56 row <- queryNodeSearchTable -< ()
57 restrict -< (_ns_search row) @@ (sqlTSQuery (unpack q))
58 restrict -< (_ns_typename row) .== (sqlInt4 $ toDBid NodeDocument)
59 returnA -< (_ns_id row, _ns_hyperdata row)
60
61 ------------------------------------------------------------------------
62 -- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
63 -- search only to map/candidate terms.
64 searchInCorpusWithNgrams :: HasDBid NodeType
65 => CorpusId
66 -> ListId
67 -> IsTrash
68 -> NgramsType
69 -> [[Text]]
70 -> Maybe Offset
71 -> Maybe Limit
72 -> Maybe OrderBy
73 -> Cmd err [FacetDoc]
74 searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
75
76 -- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
77 -- case only the "TF" part makes sense and so we only compute the
78 -- ratio of "number of times our terms appear in given document" and
79 -- "number of all terms in document" and return a sorted list of
80 -- document ids
81 tfidfAll :: (HasDBid NodeType, HasNodeError err) => CorpusId -> [Int] -> Cmd err [Int]
82 tfidfAll cId ngramIds = do
83 let ngramIdsSet = Set.fromList ngramIds
84 lId <- defaultList cId
85 docsWithNgrams <- runOpaQuery (queryListWithNgrams lId ngramIds) :: Cmd err [(Int, Int, Int)]
86 -- NOTE The query returned docs with ANY ngramIds. We need to further
87 -- restrict to ALL ngramIds.
88 let docsNgramsM =
89 Map.fromListWith (Set.union)
90 [ (ctxId, Set.singleton ngrams_id)
91 | (ctxId, ngrams_id, _) <- docsWithNgrams]
92 let docsWithAllNgramsS = Set.fromList $ List.map fst $
93 List.filter (\(_, docNgrams) ->
94 ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
95 let docsWithAllNgrams =
96 List.filter (\(ctxId, _, _) ->
97 Set.member ctxId docsWithAllNgramsS) docsWithNgrams
98 -- printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
99 let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
100 | (ctxId, _, doc_count) <- docsWithAllNgrams]
101 -- printDebug "[tfidfAll] docsWithCounts" docsWithCounts
102 let totals = [ ( ctxId
103 , ngrams_id
104 , fromIntegral doc_count :: Double
105 , fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
106 | (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
107 let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
108 | (ctxId, _, doc_count, s) <- totals]
109 pure $ List.map fst $ List.reverse tfidf_sorted
110
111 -- | Query for searching the 'context_node_ngrams' table so that we
112 -- find docs with ANY given 'ngramIds'.
113 queryListWithNgrams :: ListId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
114 queryListWithNgrams lId ngramIds = proc () -> do
115 row <- queryContextNodeNgramsTable -< ()
116 restrict -< (_cnng_node_id row) .== (pgNodeId lId)
117 restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
118 returnA -< ( _cnng_context_id row
119 , _cnng_ngrams_id row
120 , _cnng_doc_count row )
121 --returnA -< row
122 -- returnA -< ( _cnng_context_id row
123 -- , _cnng_node_id row
124 -- , _cnng_ngrams_id row
125 -- , _cnng_ngramsType row
126 -- , _cnng_weight row
127 -- , _cnng_doc_count row)
128
129
130 ------------------------------------------------------------------------
131 -- | todo add limit and offset and order
132 searchInCorpus :: HasDBid NodeType
133 => CorpusId
134 -> IsTrash
135 -> [Text]
136 -> Maybe Offset
137 -> Maybe Limit
138 -> Maybe OrderBy
139 -> Cmd err [FacetDoc]
140 searchInCorpus cId t q o l order = runOpaQuery
141 $ filterWith o l order
142 $ queryInCorpus cId t
143 $ intercalate " | "
144 $ map stemIt q
145
146 searchCountInCorpus :: HasDBid NodeType
147 => CorpusId
148 -> IsTrash
149 -> [Text]
150 -> Cmd err Int
151 searchCountInCorpus cId t q = runCountOpaQuery
152 $ queryInCorpus cId t
153 $ intercalate " | "
154 $ map stemIt q
155
156 queryInCorpus :: HasDBid NodeType
157 => CorpusId
158 -> IsTrash
159 -> Text
160 -> O.Select FacetDocRead
161 queryInCorpus cId t q = proc () -> do
162 c <- queryContextSearchTable -< ()
163 nc <- optionalRestrict queryNodeContextTable -<
164 \nc' -> (nc' ^. nc_context_id) .== _cs_id c
165 restrict -< (view nc_node_id <$> nc) .=== justFields (pgNodeId cId)
166 restrict -< if t
167 then (view nc_category <$> nc) .=== justFields (sqlInt4 0)
168 else matchMaybe (view nc_category <$> nc) $ \case
169 Nothing -> toFields False
170 Just c' -> c' .>= sqlInt4 1
171 restrict -< (c ^. cs_search) @@ sqlTSQuery (unpack q)
172 restrict -< (c ^. cs_typename ) .== sqlInt4 (toDBid NodeDocument)
173 returnA -< FacetDoc { facetDoc_id = c^.cs_id
174 , facetDoc_created = c^.cs_date
175 , facetDoc_title = c^.cs_name
176 , facetDoc_hyperdata = c^.cs_hyperdata
177 , facetDoc_category = maybeFieldsToNullable (view nc_category <$> nc)
178 , facetDoc_ngramCount = maybeFieldsToNullable (view nc_score <$> nc)
179 , facetDoc_score = maybeFieldsToNullable (view nc_score <$> nc)
180 }
181
182 ------------------------------------------------------------------------
183 searchInCorpusWithContacts
184 :: HasDBid NodeType
185 => CorpusId
186 -> AnnuaireId
187 -> [Text]
188 -> Maybe Offset
189 -> Maybe Limit
190 -> Maybe OrderBy
191 -> Cmd err [FacetPaired Int UTCTime HyperdataContact Int]
192 searchInCorpusWithContacts cId aId q o l _order =
193 runOpaQuery $ limit' l
194 $ offset' o
195 $ orderBy (desc _fp_score)
196 $ selectGroup cId aId
197 $ intercalate " | "
198 $ map stemIt q
199
200 selectGroup :: HasDBid NodeType
201 => CorpusId
202 -> AnnuaireId
203 -> Text
204 -> Select FacetPairedRead
205 selectGroup cId aId q = proc () -> do
206 (a, b, c, d) <- aggregate (p4 (groupBy, groupBy, groupBy, O.sum))
207 (selectContactViaDoc cId aId q) -< ()
208 returnA -< FacetPaired a b c d
209
210
211 selectContactViaDoc
212 :: HasDBid NodeType
213 => CorpusId
214 -> AnnuaireId
215 -> Text
216 -> SelectArr ()
217 ( Field SqlInt4
218 , Field SqlTimestamptz
219 , Field SqlJsonb
220 , Field SqlInt4
221 )
222 selectContactViaDoc cId aId query = proc () -> do
223 --(doc, (corpus, (_nodeContext_nodeContext, (annuaire, contact)))) <- queryContactViaDoc -< ()
224 (contact, annuaire, _, corpus, doc) <- queryContactViaDoc -< ()
225 restrict -< matchMaybe (view cs_search <$> doc) $ \case
226 Nothing -> toFields False
227 Just s -> s @@ sqlTSQuery (unpack query)
228 restrict -< (view cs_typename <$> doc) .=== justFields (sqlInt4 (toDBid NodeDocument))
229 restrict -< (view nc_node_id <$> corpus) .=== justFields (pgNodeId cId)
230 restrict -< (view nc_node_id <$> annuaire) .=== justFields (pgNodeId aId)
231 restrict -< (contact ^. context_typename) .== sqlInt4 (toDBid NodeContact)
232 returnA -< ( contact ^. context_id
233 , contact ^. context_date
234 , contact ^. context_hyperdata
235 , sqlInt4 1
236 )
237
238 queryContactViaDoc :: O.Select ( ContextRead
239 , MaybeFields NodeContextRead
240 , MaybeFields NodeContext_NodeContextRead
241 , MaybeFields NodeContextRead
242 , MaybeFields ContextSearchRead )
243 queryContactViaDoc = proc () -> do
244 contact <- queryContextTable -< ()
245 annuaire <- optionalRestrict queryNodeContextTable -<
246 \annuaire' -> (annuaire' ^. nc_context_id) .== (contact ^. context_id)
247 nodeContext_nodeContext <- optionalRestrict queryNodeContext_NodeContextTable -<
248 \ncnc' -> justFields (ncnc' ^. ncnc_nodecontext2) .=== (view nc_id <$> annuaire)
249 corpus <- optionalRestrict queryNodeContextTable -<
250 \corpus' -> justFields (corpus' ^. nc_id) .=== (view ncnc_nodecontext1 <$> nodeContext_nodeContext)
251 doc <- optionalRestrict queryContextSearchTable -<
252 \doc' -> justFields (doc' ^. cs_id) .=== (view nc_context_id <$> corpus)
253
254 returnA -< (contact, annuaire, nodeContext_nodeContext, corpus, doc)