]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/TextSearch.hs
Merge branch 'dev-ngrams-repo' of ssh://delanoe.org/haskell-gargantext into dev-ngram...
[gargantext.git] / src / Gargantext / Database / TextSearch.hs
1 {-|
2 Module : Gargantext.Database.TextSearch
3 Description : Postgres text search experimentation
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE Arrows #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15
16 module Gargantext.Database.TextSearch where
17
18 import Data.Aeson
19 import Data.Map.Strict hiding (map, drop, take)
20 import Data.Maybe
21 import Data.List (intersperse, take, drop)
22 import Data.String (IsString(..))
23 import Data.Text (Text, words, unpack, intercalate)
24 import Data.Time (UTCTime)
25 import Database.PostgreSQL.Simple (Query)
26 import Database.PostgreSQL.Simple.ToField
27 import Gargantext.Database.Config (nodeTypeId)
28 import Gargantext.Database.Types.Node (NodeType(..))
29 import Gargantext.Prelude
30 --import Gargantext.Database.Node.Contact
31 import Gargantext.Database.Facet
32 import Gargantext.Database.Schema.Node
33 import Gargantext.Database.Schema.Ngrams
34 import Gargantext.Database.Schema.NodeNode
35 import Gargantext.Database.Schema.NodeNgram
36 import Gargantext.Database.Queries.Join (leftJoin6)
37 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery)
38 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
39 import Gargantext.Core.Types
40 import Control.Arrow (returnA)
41 import qualified Opaleye as O hiding (Order)
42 import Opaleye hiding (Query, Order)
43
44
45 ------------------------------------------------------------------------
46 searchInDatabase :: ParentId -> Text -> Cmd err [(NodeId, HyperdataDocument)]
47 searchInDatabase p t = runOpaQuery (queryInDatabase p t)
48
49 -- | Global search query where ParentId is Master Node Corpus Id
50 queryInDatabase :: ParentId -> Text -> O.Query (Column PGInt4, Column PGJsonb)
51 queryInDatabase _ q = proc () -> do
52 row <- queryNodeSearchTable -< ()
53 restrict -< (_ns_search row) @@ (pgTSQuery (unpack q))
54 restrict -< (_ns_typename row) .== (pgInt4 $ nodeTypeId NodeDocument)
55 returnA -< (_ns_id row, _ns_hyperdata row)
56
57 ------------------------------------------------------------------------
58 -- | todo add limit and offset and order
59 searchInCorpus :: CorpusId -> [Text] -> Maybe Offset -> Maybe Limit -> Maybe OrderBy -> Cmd err [FacetDoc]
60 searchInCorpus cId q o l order = runOpaQuery (filterWith o l order $ queryInCorpus cId q')
61 where
62 q' = intercalate " | " $ map stemIt q
63
64 queryInCorpus :: CorpusId -> Text -> O.Query FacetDocRead
65 queryInCorpus cId q = proc () -> do
66 (n, nn) <- joinInCorpus -< ()
67 restrict -< ( nodeNode_node1_id nn) .== (toNullable $ pgNodeId cId)
68 restrict -< (_ns_search n) @@ (pgTSQuery (unpack q))
69 restrict -< (_ns_typename n) .== (pgInt4 $ nodeTypeId NodeDocument)
70 returnA -< FacetDoc (_ns_id n) (_ns_date n) (_ns_name n) (_ns_hyperdata n) (pgBool True) (pgInt4 1)
71
72 joinInCorpus :: O.Query (NodeSearchRead, NodeNodeReadNull)
73 joinInCorpus = leftJoin queryNodeSearchTable queryNodeNodeTable cond
74 where
75 cond :: (NodeSearchRead, NodeNodeRead) -> Column PGBool
76 cond (n, nn) = nodeNode_node2_id nn .== _ns_id n
77
78 ------------------------------------------------------------------------
79 type AuthorName = Text
80
81 -- | TODO Optim: Offset and Limit in the Query
82 searchInCorpusWithContacts :: CorpusId -> [Text] -> Maybe Offset -> Maybe Limit -> Maybe OrderBy -> Cmd err [FacetPaired Int UTCTime HyperdataDocument Int [Pair Int Text]]
83 searchInCorpusWithContacts cId q o l order = take (maybe 5 identity l) <$> drop (maybe 0 identity o)
84 <$> map (\((i,u,h,s), ps) -> FacetPaired i u h s (catMaybes ps))
85 <$> toList <$> fromListWith (<>)
86 <$> map (\(FacetPaired i u h s p) -> ((i,u,h,s), [maybePair p]))
87 <$> searchInCorpusWithContacts' cId q o l order
88 where
89 maybePair (Pair Nothing Nothing) = Nothing
90 maybePair (Pair _ Nothing) = Nothing
91 maybePair (Pair Nothing _) = Nothing
92 maybePair (Pair (Just p_id) (Just p_label)) = Just $ Pair p_id p_label
93
94 searchInCorpusWithContacts' :: CorpusId -> [Text] -> Maybe Offset -> Maybe Limit -> Maybe OrderBy -> Cmd err [(FacetPaired Int UTCTime HyperdataDocument Int (Pair (Maybe Int) (Maybe Text)))]
95 searchInCorpusWithContacts' cId q o l order = runOpaQuery $ queryInCorpusWithContacts cId q' o l order
96 where
97 q' = intercalate " | " $ map stemIt q
98
99
100
101 queryInCorpusWithContacts :: CorpusId -> Text -> Maybe Offset -> Maybe Limit -> Maybe OrderBy -> O.Query FacetPairedRead
102 queryInCorpusWithContacts cId q _ _ _ = proc () -> do
103 (docs, (corpusDoc, (docNgrams, (ngrams', (_, contacts))))) <- joinInCorpusWithContacts -< ()
104 restrict -< (_ns_search docs) @@ (pgTSQuery $ unpack q )
105 restrict -< (_ns_typename docs) .== (pgInt4 $ nodeTypeId NodeDocument)
106 restrict -< (nodeNode_node1_id corpusDoc) .== (toNullable $ pgNodeId cId)
107 restrict -< (_nn_listType docNgrams) .== (toNullable $ pgNgramsType Authors)
108 restrict -< (_node_typename contacts) .== (toNullable $ pgInt4 $ nodeTypeId NodeContact)
109 -- let contact_id = ifThenElse (isNull $ _node_id contacts) (toNullable $ pgInt4 0) (_node_id contacts)
110 returnA -< FacetPaired (_ns_id docs) (_ns_date docs) (_ns_hyperdata docs) (pgInt4 0) (Pair (_node_id contacts) (ngrams_terms ngrams'))
111
112 joinInCorpusWithContacts :: O.Query (NodeSearchRead, (NodeNodeReadNull, (NodeNgramReadNull, (NgramsReadNull, (NodeNgramReadNull, NodeReadNull)))))
113 joinInCorpusWithContacts = leftJoin6 queryNodeTable queryNodeNgramTable queryNgramsTable queryNodeNgramTable queryNodeNodeTable queryNodeSearchTable cond12 cond23 cond34 cond45 cond56
114 where
115 cond12 :: (NodeNgramRead, NodeRead) -> Column PGBool
116 cond12 (ng3, n2) = _node_id n2 .== _nn_node_id ng3
117 ---------
118 cond23 :: (NgramsRead, (NodeNgramRead, NodeReadNull)) -> Column PGBool
119 cond23 (ng2, (nng2, _)) = _nn_ngrams_id nng2 .== ngrams_id ng2
120
121 cond34 :: (NodeNgramRead, (NgramsRead, (NodeNgramReadNull, NodeReadNull))) -> Column PGBool
122 cond34 (nng, (ng, (_,_))) = ngrams_id ng .== _nn_ngrams_id nng
123
124 cond45 :: (NodeNodeRead, (NodeNgramRead, (NgramsReadNull, (NodeNgramReadNull, NodeReadNull)))) -> Column PGBool
125 cond45 (nn, (nng, (_,(_,_)))) = _nn_node_id nng .== nodeNode_node2_id nn
126
127 cond56 :: (NodeSearchRead, (NodeNodeRead, (NodeNgramReadNull, (NgramsReadNull, (NodeNgramReadNull, NodeReadNull))))) -> Column PGBool
128 cond56 (n, (nn, (_,(_,(_,_))))) = _ns_id n .== nodeNode_node2_id nn
129
130
131 {-
132 queryGraphCorpusAuthors' :: O.Query (NodeSearchRead, (NodeNodeReadNull, (NodeNgramReadNull, NgramsReadNull)))
133 queryGraphCorpusAuthors' = leftJoin4 queryNgramsTable queryNodeNgramTable queryNodeNodeTable queryNodeSearchTable cond12 cond23 cond34
134 where
135 cond23 :: (NgramsRead, (NodeNgramRead, NodeReadNull)) -> Column PGBool
136 cond23 (ng2, (nng2, _)) = nodeNgram_ngrams_id nng2 .== ngrams_id ng2
137
138 cond34 :: (NodeNgramRead, (NgramsRead, (NodeNgramReadNull, NodeReadNull))) -> Column PGBool
139 cond34 (nng, (ng, (_,_))) = ngrams_id ng .== nodeNgram_ngrams_id nng
140
141 cond45 :: (NodeNodeRead, (NodeNgramRead, (NgramsReadNull, (NodeNgramReadNull, NodeReadNull)))) -> Column PGBool
142 cond45 (nn, (nng, (_,(_,_)))) = nodeNgram_node_id nng .== nodeNode_node2_id nn
143
144 cond56 :: (NodeSearchRead, (NodeNodeRead, (NodeNgramReadNull, (NgramsReadNull, (NodeNgramReadNull, NodeReadNull))))) -> Column PGBool
145 cond56 (n, (nn, (_,(_,(_,_))))) = _ns_id n .== nodeNode_node2_id nn
146 -}
147
148
149
150 newtype TSQuery = UnsafeTSQuery [Text]
151
152 -- | TODO [""] -> panic "error"
153 toTSQuery :: [Text] -> TSQuery
154 toTSQuery txt = UnsafeTSQuery $ map stemIt txt
155
156
157 instance IsString TSQuery
158 where
159 fromString = UnsafeTSQuery . words . cs
160
161
162 instance ToField TSQuery
163 where
164 toField (UnsafeTSQuery xs)
165 = Many $ intersperse (Plain " && ")
166 $ map (\q -> Many [ Plain "plainto_tsquery("
167 , Escape (cs q)
168 , Plain ")"
169 ]
170 ) xs
171
172 data Order = Asc | Desc
173
174 instance ToField Order
175 where
176 toField Asc = Plain "ASC"
177 toField Desc = Plain "DESC"
178
179 -- TODO
180 -- FIX fav
181 -- ADD ngrams count
182 -- TESTS
183 textSearchQuery :: Query
184 textSearchQuery = "SELECT n.id, n.hyperdata->'publication_year' \
185 \ , n.hyperdata->'title' \
186 \ , n.hyperdata->'source' \
187 \ , n.hyperdata->'authors' \
188 \ , COALESCE(nn.score,null) \
189 \ FROM nodes n \
190 \ LEFT JOIN nodes_nodes nn ON nn.node2_id = n.id \
191 \ WHERE \
192 \ n.search @@ (?::tsquery) \
193 \ AND (n.parent_id = ? OR nn.node1_id = ?) \
194 \ AND n.typename = ? \
195 \ ORDER BY n.hyperdata -> 'publication_date' ? \
196 \ offset ? limit ?;"
197
198 -- | Text Search Function for Master Corpus
199 -- TODO : text search for user corpus
200 -- Example:
201 -- textSearchTest :: ParentId -> TSQuery -> Cmd err [(Int, Value, Value, Value, Value, Maybe Int)]
202 -- textSearchTest pId q = textSearch q pId 5 0 Asc
203 textSearch :: TSQuery -> ParentId
204 -> Limit -> Offset -> Order
205 -> Cmd err [(Int,Value,Value,Value, Value, Maybe Int)]
206 textSearch q p l o ord = runPGSQuery textSearchQuery (q,p,p,typeId,ord,o,l)
207 where
208 typeId = nodeTypeId NodeDocument
209
210