]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/Searx.hs
Merge remote-tracking branch 'origin/175-dev-doc-table-count' into dev-merge
[gargantext.git] / src / Gargantext / API / Node / Corpus / Searx.hs
1 {-# LANGUAGE TemplateHaskell #-}
2
3 module Gargantext.API.Node.Corpus.Searx where
4
5
6
7 import Control.Lens (view)
8 import qualified Data.Aeson as Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import Data.Either (Either(..))
11 import qualified Data.HashMap.Strict as HashMap
12 import qualified Data.Text as T
13 import Data.Time.Calendar (Day, toGregorian)
14 import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
15 import Data.Tuple.Select (sel1, sel2, sel3)
16 import GHC.Generics (Generic)
17 import Network.HTTP.Client
18 import Network.HTTP.Client.TLS
19
20 import qualified Prelude
21 import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
22 import Gargantext.Prelude
23 import Gargantext.Prelude.Config
24
25 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
26 import Gargantext.API.Job (jobLogSuccess)
27 import Gargantext.Core (Lang(..))
28 import Gargantext.Core.NLP (nlpServerGet)
29 import qualified Gargantext.Core.Text.Corpus.API as API
30 import Gargantext.Core.Text.List (buildNgramsLists)
31 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
32 import Gargantext.Core.Text.Terms (TermType(..))
33 import Gargantext.Core.Types.Individu (User(..))
34 import Gargantext.Core.Utils.Prefix (unPrefix)
35 import Gargantext.Database.Action.Flow (insertMasterDocs) --, DataText(..))
36 import Gargantext.Database.Action.Flow.List (flowList_DbRepo)
37 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
38 import Gargantext.Database.Action.User (getUserId)
39 import Gargantext.Database.Admin.Config (userMaster)
40 import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
42 import Gargantext.Database.Admin.Types.Node (CorpusId, ListId)
43 import Gargantext.Database.Prelude (hasConfig)
44 import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
45 import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
46 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
47
48 langToSearx :: Lang -> Text
49 langToSearx EN = "en-US"
50 langToSearx FR = "fr-FR"
51 langToSearx All = "en-US"
52
53 data SearxResult = SearxResult
54 { _sr_url :: Text
55 , _sr_title :: Text
56 , _sr_content :: Maybe Text
57 , _sr_engine :: Text
58 , _sr_score :: Double
59 , _sr_category :: Text
60 , _sr_pretty_url :: Text
61 , _sr_publishedDate :: Text -- "Nov 19, 2021"
62 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
63 }
64 deriving (Show, Eq, Generic)
65 -- , _sr_parsed_url
66 -- , _sr_engines
67 -- , _sr_positions
68
69 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
70
71 data SearxResponse = SearxResponse
72 { _srs_query :: Text
73 , _srs_number_of_results :: Int
74 , _srs_results :: [SearxResult] }
75 deriving (Show, Eq, Generic)
76 -- , _srs_answers
77 -- , _srs_corrections
78 -- , _srs_infoboxes
79 -- , _srs_suggestions :: [Text]
80 -- , _srs_unresponsive_engines :: [Text] }
81
82 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
83
84 data FetchSearxParams = FetchSearxParams
85 { _fsp_language :: Lang
86 , _fsp_manager :: Manager
87 , _fsp_pageno :: Int
88 , _fsp_query :: Text
89 , _fsp_url :: Text
90 }
91
92 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
93 fetchSearxPage (FetchSearxParams { _fsp_language
94 , _fsp_manager
95 , _fsp_pageno
96 , _fsp_query
97 , _fsp_url }) = do
98 -- searx search API:
99 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
100 req <- parseRequest $ T.unpack _fsp_url
101 let request = urlEncodedBody
102 [ --("category_general", "1")
103 ("q", encodeUtf8 _fsp_query)
104 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
105 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
106 --, ("time_range", "None")
107 , ("language", encodeUtf8 $ langToSearx _fsp_language)
108 , ("format", "json")
109 ] req
110 res <- httpLbs request _fsp_manager
111 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
112 pure dec
113
114 insertSearxResponse :: (MonadBase IO m, FlowCmdM env err m)
115 => User
116 -> CorpusId
117 -> ListId
118 -> Lang
119 -> Either Prelude.String SearxResponse
120 -> m ()
121 insertSearxResponse _ _ _ _ (Left _) = pure ()
122 insertSearxResponse user cId listId l (Right (SearxResponse { _srs_results })) = do
123 server <- view (nlpServerGet l)
124 -- docs :: [Either Text HyperdataDocument]
125 let docs = hyperdataDocumentFromSearxResult l <$> _srs_results
126 --printDebug "[triggerSearxSearch] docs" docs
127 let docs' = catMaybes $ rightToMaybe <$> docs
128 {-
129 Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
130 printDebug "[triggerSearxSearch] doc time" $
131 "[title] " <> (show _hd_title) <>
132 " :: [publication_year] " <> (show _hd_publication_year) <>
133 " :: [publication_date] " <> (show _hd_publication_date)
134 ) docs'
135 -}
136 --_ <- flowDataText user (DataNew [docs']) (Multi l) cId Nothing logStatus
137 let mCorpus = Nothing :: Maybe HyperdataCorpus
138 ids <- insertMasterDocs mCorpus (Multi l) docs'
139 _ <- Doc.add cId ids
140 (_masterUserId, _masterRootId, masterCorpusId)
141 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") mCorpus
142 let gp = GroupWithPosTag l server HashMap.empty
143 -- gp = case l of
144 -- FR -> GroupWithPosTag l Spacy HashMap.empty
145 -- _ -> GroupWithPosTag l CoreNLP HashMap.empty
146 ngs <- buildNgramsLists user cId masterCorpusId Nothing gp
147 _userListId <- flowList_DbRepo listId ngs
148
149 pure ()
150
151 -- TODO Make an async task out of this?
152 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
153 => User
154 -> CorpusId
155 -> API.Query
156 -> Lang
157 -> (JobLog -> m ())
158 -> m JobLog
159 triggerSearxSearch user cId q l logStatus = do
160 let numPages = 100
161 let jobLog = JobLog { _scst_succeeded = Just 0
162 , _scst_failed = Just 0
163 , _scst_remaining = Just numPages
164 , _scst_events = Just []
165 }
166 logStatus jobLog
167
168 -- printDebug "[triggerSearxSearch] cId" cId
169 -- printDebug "[triggerSearxSearch] q" q
170 -- printDebug "[triggerSearxSearch] l" l
171 cfg <- view hasConfig
172 uId <- getUserId user
173 let surl = _gc_frame_searx_url cfg
174 -- printDebug "[triggerSearxSearch] surl" surl
175 mListId <- defaultListMaybe cId
176 listId <- case mListId of
177 Nothing -> do
178 listId <- getOrMkList cId uId
179 pure listId
180 Just listId -> pure listId
181
182 -- printDebug "[triggerSearxSearch] listId" listId
183
184 manager <- liftBase $ newManager tlsManagerSettings
185 _ <- mapM (\page -> do
186 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
187 , _fsp_manager = manager
188 , _fsp_pageno = page
189 , _fsp_query = q
190 , _fsp_url = surl }
191
192 insertSearxResponse user cId listId l res
193
194 logStatus $ JobLog { _scst_succeeded = Just page
195 , _scst_failed = Just 0
196 , _scst_remaining = Just (numPages - page)
197 , _scst_events = Just [] }
198 ) [1..numPages]
199 --printDebug "[triggerSearxSearch] res" res
200
201 pure $ jobLogSuccess jobLog
202
203 hyperdataDocumentFromSearxResult :: Lang -> SearxResult -> Either T.Text HyperdataDocument
204 hyperdataDocumentFromSearxResult l (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
205 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
206 let mGregorian = toGregorian <$> mDate
207 Right HyperdataDocument { _hd_bdd = Just "Searx"
208 , _hd_doi = Nothing
209 , _hd_url = Nothing
210 , _hd_uniqId = Nothing
211 , _hd_uniqIdBdd = Nothing
212 , _hd_page = Nothing
213 , _hd_title = Just _sr_title
214 , _hd_authors = Nothing
215 , _hd_institutes = Nothing
216 , _hd_source = Just _sr_engine
217 , _hd_abstract = _sr_content
218 , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
219 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
220 , _hd_publication_month = sel2 <$> mGregorian
221 , _hd_publication_day = sel3 <$> mGregorian
222 , _hd_publication_hour = Nothing
223 , _hd_publication_minute = Nothing
224 , _hd_publication_second = Nothing
225 , _hd_language_iso2 = Just $ T.pack $ show l }