]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Search.hs
[FEAT] Graph O2 bridgeness improvement and FrameWrite Extraction
[gargantext.git] / src / Gargantext / Database / Action / Search.hs
1 {-|
2 Module : Gargantext.Database.TextSearch
3 Description : Postgres text search experimentation
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE Arrows #-}
12 {-# LANGUAGE LambdaCase #-}
13
14 module Gargantext.Database.Action.Search where
15
16 import Control.Arrow (returnA)
17 import Control.Lens ((^.), view)
18 import qualified Data.List as List
19 import qualified Data.Map.Strict as Map
20 import Data.Maybe
21 import qualified Data.Set as Set
22 import Data.Text (Text, unpack, intercalate)
23 import Data.Time (UTCTime)
24 import Gargantext.Core
25 import Gargantext.Core.Types
26 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..))
27 import Gargantext.Database.Prelude (Cmd, runOpaQuery, runCountOpaQuery)
28 import Gargantext.Database.Query.Facet
29 import Gargantext.Database.Query.Filter
30 import Gargantext.Database.Query.Table.Node
31 import Gargantext.Database.Query.Table.Context
32 import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
33 import Gargantext.Database.Query.Table.NodeContext
34 import Gargantext.Database.Query.Table.NodeContext_NodeContext
35 import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
36 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
37 import Gargantext.Database.Schema.Node
38 import Gargantext.Database.Schema.Context
39 import Gargantext.Prelude
40 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
41 import Opaleye hiding (Order)
42 import Data.Profunctor.Product (p4)
43 import qualified Opaleye as O hiding (Order)
44
45 ------------------------------------------------------------------------
46 searchDocInDatabase :: HasDBid NodeType
47 => ParentId
48 -> Text
49 -> Cmd err [(NodeId, HyperdataDocument)]
50 searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
51 where
52 -- | Global search query where ParentId is Master Node Corpus Id
53 queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
54 queryDocInDatabase _p q = proc () -> do
55 row <- queryNodeSearchTable -< ()
56 restrict -< (_ns_search row) @@ (sqlTSQuery (unpack q))
57 restrict -< (_ns_typename row) .== (sqlInt4 $ toDBid NodeDocument)
58 returnA -< (_ns_id row, _ns_hyperdata row)
59
60 ------------------------------------------------------------------------
61 -- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
62 -- search only to map/candidate terms.
63 searchInCorpusWithNgrams :: HasDBid NodeType
64 => CorpusId
65 -> ListId
66 -> IsTrash
67 -> NgramsType
68 -> [[Text]]
69 -> Maybe Offset
70 -> Maybe Limit
71 -> Maybe OrderBy
72 -> Cmd err [FacetDoc]
73 searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
74
75 -- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
76 -- case only the "TF" part makes sense and so we only compute the
77 -- ratio of "number of times our terms appear in given document" and
78 -- "number of all terms in document" and return a sorted list of
79 -- document ids
80 tfidfAll :: CorpusId -> [Int] -> Cmd err [Int]
81 tfidfAll cId ngramIds = do
82 let ngramIdsSet = Set.fromList ngramIds
83 docsWithNgrams <- runOpaQuery (queryCorpusWithNgrams cId ngramIds) :: Cmd err [(Int, Int, Int)]
84 -- NOTE The query returned docs with ANY ngramIds. We need to further
85 -- restrict to ALL ngramIds.
86 let docsNgramsM =
87 Map.fromListWith (Set.union)
88 [ (ctxId, Set.singleton ngrams_id)
89 | (ctxId, ngrams_id, _) <- docsWithNgrams]
90 let docsWithAllNgramsS = Set.fromList $ List.map fst $
91 List.filter (\(_, docNgrams) ->
92 ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
93 let docsWithAllNgrams =
94 List.filter (\(ctxId, _, _) ->
95 Set.member ctxId docsWithAllNgramsS) docsWithNgrams
96 -- printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
97 let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
98 | (ctxId, _, doc_count) <- docsWithAllNgrams]
99 -- printDebug "[tfidfAll] docsWithCounts" docsWithCounts
100 let totals = [ ( ctxId
101 , ngrams_id
102 , fromIntegral doc_count :: Double
103 , fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
104 | (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
105 let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
106 | (ctxId, _, doc_count, s) <- totals]
107 pure $ List.map fst $ List.reverse tfidf_sorted
108
109 -- | Query for searching the 'context_node_ngrams' table so that we
110 -- find docs with ANY given 'ngramIds'.
111 queryCorpusWithNgrams :: CorpusId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
112 queryCorpusWithNgrams cId ngramIds = proc () -> do
113 row <- queryContextNodeNgramsTable -< ()
114 restrict -< (_cnng_node_id row) .== (pgNodeId cId)
115 restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
116 returnA -< ( _cnng_context_id row
117 , _cnng_ngrams_id row
118 , _cnng_doc_count row)
119 --returnA -< row
120 -- returnA -< ( _cnng_context_id row
121 -- , _cnng_node_id row
122 -- , _cnng_ngrams_id row
123 -- , _cnng_ngramsType row
124 -- , _cnng_weight row
125 -- , _cnng_doc_count row)
126
127
128 ------------------------------------------------------------------------
129 -- | todo add limit and offset and order
130 searchInCorpus :: HasDBid NodeType
131 => CorpusId
132 -> IsTrash
133 -> [Text]
134 -> Maybe Offset
135 -> Maybe Limit
136 -> Maybe OrderBy
137 -> Cmd err [FacetDoc]
138 searchInCorpus cId t q o l order = runOpaQuery
139 $ filterWith o l order
140 $ queryInCorpus cId t
141 $ intercalate " | "
142 $ map stemIt q
143
144 searchCountInCorpus :: HasDBid NodeType
145 => CorpusId
146 -> IsTrash
147 -> [Text]
148 -> Cmd err Int
149 searchCountInCorpus cId t q = runCountOpaQuery
150 $ queryInCorpus cId t
151 $ intercalate " | "
152 $ map stemIt q
153
154 queryInCorpus :: HasDBid NodeType
155 => CorpusId
156 -> IsTrash
157 -> Text
158 -> O.Select FacetDocRead
159 queryInCorpus cId t q = proc () -> do
160 c <- queryContextSearchTable -< ()
161 nc <- optionalRestrict queryNodeContextTable -<
162 \nc' -> (nc' ^. nc_context_id) .== _cs_id c
163 restrict -< (view nc_node_id <$> nc) .=== justFields (pgNodeId cId)
164 restrict -< if t
165 then (view nc_category <$> nc) .=== justFields (sqlInt4 0)
166 else matchMaybe (view nc_category <$> nc) $ \case
167 Nothing -> toFields False
168 Just c' -> c' .>= sqlInt4 1
169 restrict -< (c ^. cs_search) @@ sqlTSQuery (unpack q)
170 restrict -< (c ^. cs_typename ) .== sqlInt4 (toDBid NodeDocument)
171 returnA -< FacetDoc { facetDoc_id = c^.cs_id
172 , facetDoc_created = c^.cs_date
173 , facetDoc_title = c^.cs_name
174 , facetDoc_hyperdata = c^.cs_hyperdata
175 , facetDoc_category = maybeFieldsToNullable (view nc_category <$> nc)
176 , facetDoc_ngramCount = maybeFieldsToNullable (view nc_score <$> nc)
177 , facetDoc_score = maybeFieldsToNullable (view nc_score <$> nc)
178 }
179
180 ------------------------------------------------------------------------
181 searchInCorpusWithContacts
182 :: HasDBid NodeType
183 => CorpusId
184 -> AnnuaireId
185 -> [Text]
186 -> Maybe Offset
187 -> Maybe Limit
188 -> Maybe OrderBy
189 -> Cmd err [FacetPaired Int UTCTime HyperdataContact Int]
190 searchInCorpusWithContacts cId aId q o l _order =
191 runOpaQuery $ limit' l
192 $ offset' o
193 $ orderBy (desc _fp_score)
194 $ selectGroup cId aId
195 $ intercalate " | "
196 $ map stemIt q
197
198 selectGroup :: HasDBid NodeType
199 => CorpusId
200 -> AnnuaireId
201 -> Text
202 -> Select FacetPairedRead
203 selectGroup cId aId q = proc () -> do
204 (a, b, c, d) <- aggregate (p4 (groupBy, groupBy, groupBy, O.sum))
205 (selectContactViaDoc cId aId q) -< ()
206 returnA -< FacetPaired a b c d
207
208
209 selectContactViaDoc
210 :: HasDBid NodeType
211 => CorpusId
212 -> AnnuaireId
213 -> Text
214 -> SelectArr ()
215 ( Field SqlInt4
216 , Field SqlTimestamptz
217 , Field SqlJsonb
218 , Field SqlInt4
219 )
220 selectContactViaDoc cId aId query = proc () -> do
221 --(doc, (corpus, (_nodeContext_nodeContext, (annuaire, contact)))) <- queryContactViaDoc -< ()
222 (contact, annuaire, _, corpus, doc) <- queryContactViaDoc -< ()
223 restrict -< matchMaybe (view cs_search <$> doc) $ \case
224 Nothing -> toFields False
225 Just s -> s @@ sqlTSQuery (unpack query)
226 restrict -< (view cs_typename <$> doc) .=== justFields (sqlInt4 (toDBid NodeDocument))
227 restrict -< (view nc_node_id <$> corpus) .=== justFields (pgNodeId cId)
228 restrict -< (view nc_node_id <$> annuaire) .=== justFields (pgNodeId aId)
229 restrict -< (contact ^. context_typename) .== sqlInt4 (toDBid NodeContact)
230 returnA -< ( contact ^. context_id
231 , contact ^. context_date
232 , contact ^. context_hyperdata
233 , sqlInt4 1
234 )
235
236 queryContactViaDoc :: O.Select ( ContextRead
237 , MaybeFields NodeContextRead
238 , MaybeFields NodeContext_NodeContextRead
239 , MaybeFields NodeContextRead
240 , MaybeFields ContextSearchRead )
241 queryContactViaDoc = proc () -> do
242 contact <- queryContextTable -< ()
243 annuaire <- optionalRestrict queryNodeContextTable -<
244 \annuaire' -> (annuaire' ^. nc_context_id) .== (contact ^. context_id)
245 nodeContext_nodeContext <- optionalRestrict queryNodeContext_NodeContextTable -<
246 \ncnc' -> justFields (ncnc' ^. ncnc_nodecontext2) .=== (view nc_id <$> annuaire)
247 corpus <- optionalRestrict queryNodeContextTable -<
248 \corpus' -> justFields (corpus' ^. nc_id) .=== (view ncnc_nodecontext1 <$> nodeContext_nodeContext)
249 doc <- optionalRestrict queryContextSearchTable -<
250 \doc' -> justFields (doc' ^. cs_id) .=== (view nc_context_id <$> corpus)
251
252 returnA -< (contact, annuaire, nodeContext_nodeContext, corpus, doc)