1 {-# LANGUAGE TemplateHaskell #-}
3 module Gargantext.API.Node.Corpus.Searx where
7 import Control.Lens (view)
8 import qualified Data.Aeson as Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import Data.Either (Either(..))
11 import qualified Data.Text as T
12 import Data.Time.Calendar (Day, toGregorian)
13 import Data.Time.Format (defaultTimeLocale, parseTimeM)
14 import Data.Tuple.Select (sel1, sel2, sel3)
15 import GHC.Generics (Generic)
16 import Network.HTTP.Client
17 import Network.HTTP.Client.TLS
19 import qualified Prelude as Prelude
20 import Protolude (encodeUtf8, Text)
21 import Gargantext.Prelude
22 import Gargantext.Prelude.Config
24 import Gargantext.Core (Lang(..))
25 import qualified Gargantext.Core.Text.Corpus.API as API
26 import Gargantext.Core.Utils.Prefix (unPrefix)
27 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
28 import Gargantext.Database.Admin.Config ()
29 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
30 import Gargantext.Database.Admin.Types.Node (CorpusId)
31 import Gargantext.Database.Prelude (hasConfig)
32 import Gargantext.Database.Query.Table.Node (defaultList)
35 langToSearx :: Lang -> Text
36 langToSearx EN = "en-US"
37 langToSearx FR = "fr-FR"
38 langToSearx All = "en-US"
40 data SearxResult = SearxResult
43 , _sr_content :: Maybe Text
46 , _sr_category :: Text
47 , _sr_pretty_url :: Text
48 , _sr_publishedDate :: Text -- "Nov 19, 2021"
49 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
51 deriving (Show, Eq, Generic)
56 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
58 data SearxResponse = SearxResponse
60 , _srs_number_of_results :: Int
61 , _srs_results :: [SearxResult] }
62 deriving (Show, Eq, Generic)
66 -- , _srs_suggestions :: [Text]
67 -- , _srs_unresponsive_engines :: [Text] }
69 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
71 data FetchSearxParams = FetchSearxParams
72 { _fsp_language :: Lang
73 , _fsp_manager :: Manager
79 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
80 fetchSearxPage (FetchSearxParams { _fsp_language
86 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
87 req <- parseRequest $ T.unpack _fsp_url
88 let request = urlEncodedBody
89 [ --("category_general", "1")
90 ("q", encodeUtf8 _fsp_query)
91 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
92 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
93 --, ("time_range", "None")
94 , ("language", encodeUtf8 $ langToSearx _fsp_language)
97 res <- httpLbs request _fsp_manager
98 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
101 -- TODO Make an async task out of this?
102 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
107 triggerSearxSearch cId q l = do
108 printDebug "[triggerSearxSearch] cId" cId
109 printDebug "[triggerSearxSearch] q" q
110 printDebug "[triggerSearxSearch] l" l
111 cfg <- view hasConfig
112 let surl = _gc_frame_searx_url cfg
113 printDebug "[triggerSearxSearch] surl" surl
114 listId <- defaultList cId
115 printDebug "[triggerSearxSearch] listId" listId
117 manager <- liftBase $ newManager tlsManagerSettings
118 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
119 , _fsp_manager = manager
124 printDebug "[triggerSearxSearch] res" res
128 Right (SearxResponse { _srs_results }) -> do
129 let docs = hyperdataDocumentFromSearxResult <$> _srs_results
130 printDebug "[triggerSearxSearch] docs" docs
134 hyperdataDocumentFromSearxResult :: SearxResult -> Either T.Text HyperdataDocument
135 hyperdataDocumentFromSearxResult (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
136 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S" (T.unpack _sr_pubdate) :: Maybe Day
137 let mGregorian = toGregorian <$> mDate
138 Right HyperdataDocument { _hd_bdd = Just "Searx"
141 , _hd_uniqId = Nothing
142 , _hd_uniqIdBdd = Nothing
144 , _hd_title = Just _sr_title
145 , _hd_authors = Nothing
146 , _hd_institutes = Nothing
147 , _hd_source = Just _sr_engine
148 , _hd_abstract = _sr_content
149 , _hd_publication_date = Just _sr_pubdate
150 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
151 , _hd_publication_month = sel2 <$> mGregorian
152 , _hd_publication_day = sel3 <$> mGregorian
153 , _hd_publication_hour = Nothing
154 , _hd_publication_minute = Nothing
155 , _hd_publication_second = Nothing
156 , _hd_language_iso2 = Just $ T.pack $ show EN }