{-# LANGUAGE TemplateHaskell #-} module Gargantext.API.Node.Corpus.Searx where import Control.Lens (view) import qualified Data.Aeson as Aeson import Data.Aeson.TH (deriveJSON) import Data.Either (Either(..)) import qualified Data.Text as T import Data.Time.Calendar (Day, toGregorian) import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM) import Data.Tuple.Select (sel1, sel2, sel3) import GHC.Generics (Generic) import Network.HTTP.Client import Network.HTTP.Client.TLS import qualified Prelude as Prelude import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text) import Gargantext.Prelude import Gargantext.Prelude.Config import Gargantext.API.Admin.Orchestrator.Types (JobLog(..)) --import Gargantext.API.Admin.Types (HasSettings) import Gargantext.API.Job (jobLogSuccess, jobLogFailTotalWithMessage) import Gargantext.Core (Lang(..)) import qualified Gargantext.Core.Text.Corpus.API as API import Gargantext.Core.Text.Terms (TermType(..)) import Gargantext.Core.Types.Individu (User(..)) import Gargantext.Core.Utils.Prefix (unPrefix) import Gargantext.Database.Action.Flow (flowDataText, DataText(..)) import Gargantext.Database.Action.Flow.Types (FlowCmdM) import Gargantext.Database.Admin.Config () import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Node (CorpusId) import Gargantext.Database.Prelude (hasConfig) import Gargantext.Database.Query.Table.Node (defaultListMaybe) langToSearx :: Lang -> Text langToSearx EN = "en-US" langToSearx FR = "fr-FR" langToSearx All = "en-US" data SearxResult = SearxResult { _sr_url :: Text , _sr_title :: Text , _sr_content :: Maybe Text , _sr_engine :: Text , _sr_score :: Double , _sr_category :: Text , _sr_pretty_url :: Text , _sr_publishedDate :: Text -- "Nov 19, 2021" , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000" } deriving (Show, Eq, Generic) -- , _sr_parsed_url -- , _sr_engines -- , _sr_positions $(deriveJSON (unPrefix "_sr_") ''SearxResult) data SearxResponse = SearxResponse { _srs_query :: Text , _srs_number_of_results :: Int , _srs_results :: [SearxResult] } deriving (Show, Eq, Generic) -- , _srs_answers -- , _srs_corrections -- , _srs_infoboxes -- , _srs_suggestions :: [Text] -- , _srs_unresponsive_engines :: [Text] } $(deriveJSON (unPrefix "_srs_") ''SearxResponse) data FetchSearxParams = FetchSearxParams { _fsp_language :: Lang , _fsp_manager :: Manager , _fsp_pageno :: Int , _fsp_query :: Text , _fsp_url :: Text } fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse) fetchSearxPage (FetchSearxParams { _fsp_language , _fsp_manager , _fsp_pageno , _fsp_query , _fsp_url }) = do -- searx search API: -- https://searx.github.io/searx/dev/search_api.html?highlight=json req <- parseRequest $ T.unpack _fsp_url let request = urlEncodedBody [ --("category_general", "1") ("q", encodeUtf8 _fsp_query) , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno) --, ("time_range", "None") , ("language", encodeUtf8 $ langToSearx _fsp_language) , ("format", "json") ] req res <- httpLbs request _fsp_manager let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse) pure dec -- TODO Make an async task out of this? triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m) => User -> CorpusId -> API.Query -> Lang -> (JobLog -> m ()) -> m JobLog triggerSearxSearch user cId q l logStatus = do let jobLog = JobLog { _scst_succeeded = Just 1 , _scst_failed = Just 0 , _scst_remaining = Just 1 , _scst_events = Just [] } logStatus jobLog printDebug "[triggerSearxSearch] cId" cId printDebug "[triggerSearxSearch] q" q printDebug "[triggerSearxSearch] l" l cfg <- view hasConfig let surl = _gc_frame_searx_url cfg printDebug "[triggerSearxSearch] surl" surl mListId <- defaultListMaybe cId case mListId of Nothing -> do let failedJobLog = jobLogFailTotalWithMessage "[triggerSearxSearch] no list id" jobLog logStatus failedJobLog pure failedJobLog Just listId -> do printDebug "[triggerSearxSearch] listId" listId manager <- liftBase $ newManager tlsManagerSettings res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l , _fsp_manager = manager , _fsp_pageno = 1 , _fsp_query = q , _fsp_url = surl } --printDebug "[triggerSearxSearch] res" res case res of Left _ -> pure () Right (SearxResponse { _srs_results }) -> do let docs = hyperdataDocumentFromSearxResult <$> _srs_results --printDebug "[triggerSearxSearch] docs" docs -- docs :: [Either Text HyperdataDocument] let docs' = catMaybes $ rightToMaybe <$> docs Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do printDebug "[triggerSearxSearch] doc time" $ "[title] " <> (show _hd_title) <> " :: [publication_year] " <> (show _hd_publication_year) <> " :: [publication_date] " <> (show _hd_publication_date) ) docs' _ <- flowDataText user (DataNew [docs']) (Multi EN) cId Nothing logStatus pure () pure $ jobLogSuccess jobLog hyperdataDocumentFromSearxResult :: SearxResult -> Either T.Text HyperdataDocument hyperdataDocumentFromSearxResult (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day let mGregorian = toGregorian <$> mDate Right HyperdataDocument { _hd_bdd = Just "Searx" , _hd_doi = Nothing , _hd_url = Nothing , _hd_uniqId = Nothing , _hd_uniqIdBdd = Nothing , _hd_page = Nothing , _hd_title = Just $ ("[" <> _sr_pubdate <> "] ") <> _sr_title , _hd_authors = Nothing , _hd_institutes = Nothing , _hd_source = Just _sr_engine , _hd_abstract = _sr_content , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian , _hd_publication_month = sel2 <$> mGregorian , _hd_publication_day = sel3 <$> mGregorian , _hd_publication_hour = Nothing , _hd_publication_minute = Nothing , _hd_publication_second = Nothing , _hd_language_iso2 = Just $ T.pack $ show EN }