]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Search.hs
[FEAT] Iramuteq parser (WIP)
[gargantext.git] / src / Gargantext / Database / Action / Search.hs
1 {-|
2 Module : Gargantext.Database.TextSearch
3 Description : Postgres text search experimentation
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE Arrows #-}
12 {-# LANGUAGE LambdaCase #-}
13
14 module Gargantext.Database.Action.Search where
15
16 import Control.Arrow (returnA)
17 import Control.Lens ((^.), view)
18 import qualified Data.List as List
19 import qualified Data.Map.Strict as Map
20 import Data.Maybe
21 import qualified Data.Set as Set
22 import Data.Text (Text, unpack, intercalate)
23 import Data.Time (UTCTime)
24 import Gargantext.Core
25 import Gargantext.Core.Types
26 import Gargantext.Core.Types.Query (IsTrash, Limit, Offset)
27 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..))
28 import Gargantext.Database.Prelude (Cmd, runOpaQuery, runCountOpaQuery)
29 import Gargantext.Database.Query.Facet
30 import Gargantext.Database.Query.Filter
31 import Gargantext.Database.Query.Table.Node
32 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
33 import Gargantext.Database.Query.Table.Context
34 import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
35 import Gargantext.Database.Query.Table.NodeContext
36 import Gargantext.Database.Query.Table.NodeContext_NodeContext
37 import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
38 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
39 import Gargantext.Database.Schema.Node
40 import Gargantext.Database.Schema.Context
41 import Gargantext.Prelude
42 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
43 import Opaleye hiding (Order)
44 import Data.Profunctor.Product (p4)
45 import qualified Opaleye as O hiding (Order)
46
47 ------------------------------------------------------------------------
48 searchDocInDatabase :: HasDBid NodeType
49 => ParentId
50 -> Text
51 -> Cmd err [(NodeId, HyperdataDocument)]
52 searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
53 where
54 -- | Global search query where ParentId is Master Node Corpus Id
55 queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
56 queryDocInDatabase _p q = proc () -> do
57 row <- queryNodeSearchTable -< ()
58 restrict -< (_ns_search row) @@ (sqlTSQuery (unpack q))
59 restrict -< (_ns_typename row) .== (sqlInt4 $ toDBid NodeDocument)
60 returnA -< (_ns_id row, _ns_hyperdata row)
61
62 ------------------------------------------------------------------------
63 -- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
64 -- search only to map/candidate terms.
65 searchInCorpusWithNgrams :: HasDBid NodeType
66 => CorpusId
67 -> ListId
68 -> IsTrash
69 -> NgramsType
70 -> [[Text]]
71 -> Maybe Offset
72 -> Maybe Limit
73 -> Maybe OrderBy
74 -> Cmd err [FacetDoc]
75 searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
76
77 -- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
78 -- case only the "TF" part makes sense and so we only compute the
79 -- ratio of "number of times our terms appear in given document" and
80 -- "number of all terms in document" and return a sorted list of
81 -- document ids
82 tfidfAll :: (HasDBid NodeType, HasNodeError err) => CorpusId -> [Int] -> Cmd err [Int]
83 tfidfAll cId ngramIds = do
84 let ngramIdsSet = Set.fromList ngramIds
85 lId <- defaultList cId
86 docsWithNgrams <- runOpaQuery (queryListWithNgrams lId ngramIds) :: Cmd err [(Int, Int, Int)]
87 -- NOTE The query returned docs with ANY ngramIds. We need to further
88 -- restrict to ALL ngramIds.
89 let docsNgramsM =
90 Map.fromListWith (Set.union)
91 [ (ctxId, Set.singleton ngrams_id)
92 | (ctxId, ngrams_id, _) <- docsWithNgrams]
93 let docsWithAllNgramsS = Set.fromList $ List.map fst $
94 List.filter (\(_, docNgrams) ->
95 ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
96 let docsWithAllNgrams =
97 List.filter (\(ctxId, _, _) ->
98 Set.member ctxId docsWithAllNgramsS) docsWithNgrams
99 -- printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
100 let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
101 | (ctxId, _, doc_count) <- docsWithAllNgrams]
102 -- printDebug "[tfidfAll] docsWithCounts" docsWithCounts
103 let totals = [ ( ctxId
104 , ngrams_id
105 , fromIntegral doc_count :: Double
106 , fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
107 | (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
108 let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
109 | (ctxId, _, doc_count, s) <- totals]
110 pure $ List.map fst $ List.reverse tfidf_sorted
111
112 -- | Query for searching the 'context_node_ngrams' table so that we
113 -- find docs with ANY given 'ngramIds'.
114 queryListWithNgrams :: ListId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
115 queryListWithNgrams lId ngramIds = proc () -> do
116 row <- queryContextNodeNgramsTable -< ()
117 restrict -< (_cnng_node_id row) .== (pgNodeId lId)
118 restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
119 returnA -< ( _cnng_context_id row
120 , _cnng_ngrams_id row
121 , _cnng_doc_count row )
122 --returnA -< row
123 -- returnA -< ( _cnng_context_id row
124 -- , _cnng_node_id row
125 -- , _cnng_ngrams_id row
126 -- , _cnng_ngramsType row
127 -- , _cnng_weight row
128 -- , _cnng_doc_count row)
129
130
131 ------------------------------------------------------------------------
132 -- | todo add limit and offset and order
133 searchInCorpus :: HasDBid NodeType
134 => CorpusId
135 -> IsTrash
136 -> [Text]
137 -> Maybe Offset
138 -> Maybe Limit
139 -> Maybe OrderBy
140 -> Cmd err [FacetDoc]
141 searchInCorpus cId t q o l order = runOpaQuery
142 $ filterWith o l order
143 $ queryInCorpus cId t
144 $ intercalate " | "
145 $ map stemIt q
146
147 searchCountInCorpus :: HasDBid NodeType
148 => CorpusId
149 -> IsTrash
150 -> [Text]
151 -> Cmd err Int
152 searchCountInCorpus cId t q = runCountOpaQuery
153 $ queryInCorpus cId t
154 $ intercalate " | "
155 $ map stemIt q
156
157 queryInCorpus :: HasDBid NodeType
158 => CorpusId
159 -> IsTrash
160 -> Text
161 -> O.Select FacetDocRead
162 queryInCorpus cId t q = proc () -> do
163 c <- queryContextSearchTable -< ()
164 nc <- optionalRestrict queryNodeContextTable -<
165 \nc' -> (nc' ^. nc_context_id) .== _cs_id c
166 restrict -< (view nc_node_id <$> nc) .=== justFields (pgNodeId cId)
167 restrict -< if t
168 then (view nc_category <$> nc) .=== justFields (sqlInt4 0)
169 else matchMaybe (view nc_category <$> nc) $ \case
170 Nothing -> toFields False
171 Just c' -> c' .>= sqlInt4 1
172 restrict -< (c ^. cs_search) @@ sqlTSQuery (unpack q)
173 restrict -< (c ^. cs_typename ) .== sqlInt4 (toDBid NodeDocument)
174 returnA -< FacetDoc { facetDoc_id = c^.cs_id
175 , facetDoc_created = c^.cs_date
176 , facetDoc_title = c^.cs_name
177 , facetDoc_hyperdata = c^.cs_hyperdata
178 , facetDoc_category = maybeFieldsToNullable (view nc_category <$> nc)
179 , facetDoc_ngramCount = maybeFieldsToNullable (view nc_score <$> nc)
180 , facetDoc_score = maybeFieldsToNullable (view nc_score <$> nc)
181 }
182
183 ------------------------------------------------------------------------
184 searchInCorpusWithContacts
185 :: HasDBid NodeType
186 => CorpusId
187 -> AnnuaireId
188 -> [Text]
189 -> Maybe Offset
190 -> Maybe Limit
191 -> Maybe OrderBy
192 -> Cmd err [FacetPaired Int UTCTime HyperdataContact Int]
193 searchInCorpusWithContacts cId aId q o l _order =
194 runOpaQuery $ limit' l
195 $ offset' o
196 $ orderBy (desc _fp_score)
197 $ selectGroup cId aId
198 $ intercalate " | "
199 $ map stemIt q
200
201 selectGroup :: HasDBid NodeType
202 => CorpusId
203 -> AnnuaireId
204 -> Text
205 -> Select FacetPairedRead
206 selectGroup cId aId q = proc () -> do
207 (a, b, c, d) <- aggregate (p4 (groupBy, groupBy, groupBy, O.sum))
208 (selectContactViaDoc cId aId q) -< ()
209 returnA -< FacetPaired a b c d
210
211
212 selectContactViaDoc
213 :: HasDBid NodeType
214 => CorpusId
215 -> AnnuaireId
216 -> Text
217 -> SelectArr ()
218 ( Field SqlInt4
219 , Field SqlTimestamptz
220 , Field SqlJsonb
221 , Field SqlInt4
222 )
223 selectContactViaDoc cId aId query = proc () -> do
224 --(doc, (corpus, (_nodeContext_nodeContext, (annuaire, contact)))) <- queryContactViaDoc -< ()
225 (contact, annuaire, _, corpus, doc) <- queryContactViaDoc -< ()
226 restrict -< matchMaybe (view cs_search <$> doc) $ \case
227 Nothing -> toFields False
228 Just s -> s @@ sqlTSQuery (unpack query)
229 restrict -< (view cs_typename <$> doc) .=== justFields (sqlInt4 (toDBid NodeDocument))
230 restrict -< (view nc_node_id <$> corpus) .=== justFields (pgNodeId cId)
231 restrict -< (view nc_node_id <$> annuaire) .=== justFields (pgNodeId aId)
232 restrict -< (contact ^. context_typename) .== sqlInt4 (toDBid NodeContact)
233 returnA -< ( contact ^. context_id
234 , contact ^. context_date
235 , contact ^. context_hyperdata
236 , sqlInt4 1
237 )
238
239 queryContactViaDoc :: O.Select ( ContextRead
240 , MaybeFields NodeContextRead
241 , MaybeFields NodeContext_NodeContextRead
242 , MaybeFields NodeContextRead
243 , MaybeFields ContextSearchRead )
244 queryContactViaDoc = proc () -> do
245 contact <- queryContextTable -< ()
246 annuaire <- optionalRestrict queryNodeContextTable -<
247 \annuaire' -> (annuaire' ^. nc_context_id) .== (contact ^. context_id)
248 nodeContext_nodeContext <- optionalRestrict queryNodeContext_NodeContextTable -<
249 \ncnc' -> justFields (ncnc' ^. ncnc_nodecontext2) .=== (view nc_id <$> annuaire)
250 corpus <- optionalRestrict queryNodeContextTable -<
251 \corpus' -> justFields (corpus' ^. nc_id) .=== (view ncnc_nodecontext1 <$> nodeContext_nodeContext)
252 doc <- optionalRestrict queryContextSearchTable -<
253 \doc' -> justFields (doc' ^. cs_id) .=== (view nc_context_id <$> corpus)
254
255 returnA -< (contact, annuaire, nodeContext_nodeContext, corpus, doc)