1 {-# LANGUAGE TemplateHaskell #-}
3 module Gargantext.API.Node.Corpus.Searx where
7 import Control.Lens (view)
8 import qualified Data.Aeson as Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import Data.Either (Either(..))
11 import qualified Data.Text as T
12 import Data.Time.Calendar (Day, toGregorian)
13 import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
14 import Data.Tuple.Select (sel1, sel2, sel3)
15 import GHC.Generics (Generic)
16 import Network.HTTP.Client
17 import Network.HTTP.Client.TLS
19 import qualified Prelude as Prelude
20 import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
21 import Gargantext.Prelude
22 import Gargantext.Prelude.Config
24 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
25 --import Gargantext.API.Admin.Types (HasSettings)
26 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotalWithMessage)
27 import Gargantext.Core (Lang(..))
28 import qualified Gargantext.Core.Text.Corpus.API as API
29 import Gargantext.Core.Text.Terms (TermType(..))
30 import Gargantext.Core.Types.Individu (User(..))
31 import Gargantext.Core.Utils.Prefix (unPrefix)
32 import Gargantext.Database.Action.Flow (flowDataText, DataText(..))
33 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
34 import Gargantext.Database.Admin.Config ()
35 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
36 import Gargantext.Database.Admin.Types.Node (CorpusId)
37 import Gargantext.Database.Prelude (hasConfig)
38 import Gargantext.Database.Query.Table.Node (defaultListMaybe)
41 langToSearx :: Lang -> Text
42 langToSearx EN = "en-US"
43 langToSearx FR = "fr-FR"
44 langToSearx All = "en-US"
46 data SearxResult = SearxResult
49 , _sr_content :: Maybe Text
52 , _sr_category :: Text
53 , _sr_pretty_url :: Text
54 , _sr_publishedDate :: Text -- "Nov 19, 2021"
55 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
57 deriving (Show, Eq, Generic)
62 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
64 data SearxResponse = SearxResponse
66 , _srs_number_of_results :: Int
67 , _srs_results :: [SearxResult] }
68 deriving (Show, Eq, Generic)
72 -- , _srs_suggestions :: [Text]
73 -- , _srs_unresponsive_engines :: [Text] }
75 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
77 data FetchSearxParams = FetchSearxParams
78 { _fsp_language :: Lang
79 , _fsp_manager :: Manager
85 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
86 fetchSearxPage (FetchSearxParams { _fsp_language
92 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
93 req <- parseRequest $ T.unpack _fsp_url
94 let request = urlEncodedBody
95 [ --("category_general", "1")
96 ("q", encodeUtf8 _fsp_query)
97 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
98 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
99 --, ("time_range", "None")
100 , ("language", encodeUtf8 $ langToSearx _fsp_language)
103 res <- httpLbs request _fsp_manager
104 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
107 -- TODO Make an async task out of this?
108 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
115 triggerSearxSearch user cId q l logStatus = do
116 let jobLog = JobLog { _scst_succeeded = Just 1
117 , _scst_failed = Just 0
118 , _scst_remaining = Just 1
119 , _scst_events = Just []
123 printDebug "[triggerSearxSearch] cId" cId
124 printDebug "[triggerSearxSearch] q" q
125 printDebug "[triggerSearxSearch] l" l
126 cfg <- view hasConfig
127 let surl = _gc_frame_searx_url cfg
128 printDebug "[triggerSearxSearch] surl" surl
129 mListId <- defaultListMaybe cId
132 let failedJobLog = jobLogFailTotalWithMessage "[triggerSearxSearch] no list id" jobLog
133 logStatus failedJobLog
136 printDebug "[triggerSearxSearch] listId" listId
138 manager <- liftBase $ newManager tlsManagerSettings
139 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
140 , _fsp_manager = manager
145 --printDebug "[triggerSearxSearch] res" res
149 Right (SearxResponse { _srs_results }) -> do
150 let docs = hyperdataDocumentFromSearxResult <$> _srs_results
151 --printDebug "[triggerSearxSearch] docs" docs
152 -- docs :: [Either Text HyperdataDocument]
153 let docs' = catMaybes $ rightToMaybe <$> docs
154 Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
155 printDebug "[triggerSearxSearch] doc time" $
156 "[title] " <> (show _hd_title) <>
157 " :: [publication_year] " <> (show _hd_publication_year) <>
158 " :: [publication_date] " <> (show _hd_publication_date)
160 _ <- flowDataText user (DataNew [docs']) (Multi EN) cId Nothing logStatus
163 pure $ jobLogSuccess jobLog
165 hyperdataDocumentFromSearxResult :: SearxResult -> Either T.Text HyperdataDocument
166 hyperdataDocumentFromSearxResult (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
167 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
168 let mGregorian = toGregorian <$> mDate
169 Right HyperdataDocument { _hd_bdd = Just "Searx"
172 , _hd_uniqId = Nothing
173 , _hd_uniqIdBdd = Nothing
175 , _hd_title = Just $ ("[" <> _sr_pubdate <> "] ") <> _sr_title
176 , _hd_authors = Nothing
177 , _hd_institutes = Nothing
178 , _hd_source = Just _sr_engine
179 , _hd_abstract = _sr_content
180 , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
181 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
182 , _hd_publication_month = sel2 <$> mGregorian
183 , _hd_publication_day = sel3 <$> mGregorian
184 , _hd_publication_hour = Nothing
185 , _hd_publication_minute = Nothing
186 , _hd_publication_second = Nothing
187 , _hd_language_iso2 = Just $ T.pack $ show EN }