1 {-# LANGUAGE TemplateHaskell #-}
3 module Gargantext.API.Node.Corpus.Searx where
7 import Control.Lens (view)
8 import qualified Data.Aeson as Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import Data.Either (Either(..))
11 import qualified Data.HashMap.Strict as HashMap
12 import qualified Data.Text as T
13 import Data.Time.Calendar (Day, toGregorian)
14 import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
15 import Data.Tuple.Select (sel1, sel2, sel3)
16 import GHC.Generics (Generic)
17 import Network.HTTP.Client
18 import Network.HTTP.Client.TLS
20 import qualified Prelude as Prelude
21 import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
22 import Gargantext.Prelude
23 import Gargantext.Prelude.Config
25 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
26 --import Gargantext.API.Admin.Types (HasSettings)
27 import Gargantext.API.Job (jobLogSuccess)
28 import Gargantext.Core (Lang(..), PosTagAlgo(..))
29 import qualified Gargantext.Core.Text.Corpus.API as API
30 import Gargantext.Core.Text.List (buildNgramsLists)
31 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
32 import Gargantext.Core.Text.Terms (TermType(..))
33 import Gargantext.Core.Types.Individu (User(..))
34 import Gargantext.Core.Utils.Prefix (unPrefix)
35 import Gargantext.Database.Action.Flow (insertMasterDocs) --, DataText(..))
36 import Gargantext.Database.Action.Flow.List (flowList_DbRepo)
37 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
38 import Gargantext.Database.Action.User (getUserId)
39 import Gargantext.Database.Admin.Config (userMaster)
40 import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
42 import Gargantext.Database.Admin.Types.Node (CorpusId)
43 import Gargantext.Database.Prelude (hasConfig)
44 import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
45 import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
46 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
48 langToSearx :: Lang -> Text
49 langToSearx EN = "en-US"
50 langToSearx FR = "fr-FR"
51 langToSearx All = "en-US"
53 data SearxResult = SearxResult
56 , _sr_content :: Maybe Text
59 , _sr_category :: Text
60 , _sr_pretty_url :: Text
61 , _sr_publishedDate :: Text -- "Nov 19, 2021"
62 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
64 deriving (Show, Eq, Generic)
69 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
71 data SearxResponse = SearxResponse
73 , _srs_number_of_results :: Int
74 , _srs_results :: [SearxResult] }
75 deriving (Show, Eq, Generic)
79 -- , _srs_suggestions :: [Text]
80 -- , _srs_unresponsive_engines :: [Text] }
82 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
84 data FetchSearxParams = FetchSearxParams
85 { _fsp_language :: Lang
86 , _fsp_manager :: Manager
92 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
93 fetchSearxPage (FetchSearxParams { _fsp_language
99 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
100 req <- parseRequest $ T.unpack _fsp_url
101 let request = urlEncodedBody
102 [ --("category_general", "1")
103 ("q", encodeUtf8 _fsp_query)
104 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
105 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
106 --, ("time_range", "None")
107 , ("language", encodeUtf8 $ langToSearx _fsp_language)
110 res <- httpLbs request _fsp_manager
111 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
114 -- TODO Make an async task out of this?
115 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
122 triggerSearxSearch user cId q l logStatus = do
123 let jobLog = JobLog { _scst_succeeded = Just 1
124 , _scst_failed = Just 0
125 , _scst_remaining = Just 1
126 , _scst_events = Just []
130 printDebug "[triggerSearxSearch] cId" cId
131 printDebug "[triggerSearxSearch] q" q
132 printDebug "[triggerSearxSearch] l" l
133 cfg <- view hasConfig
134 uId <- getUserId user
135 let surl = _gc_frame_searx_url cfg
136 printDebug "[triggerSearxSearch] surl" surl
137 mListId <- defaultListMaybe cId
138 listId <- case mListId of
140 --let failedJobLog = jobLogFailTotalWithMessage "[triggerSearxSearch] no list id" jobLog
141 --logStatus failedJobLog
143 listId <- getOrMkList cId uId
145 Just listId -> pure listId
147 printDebug "[triggerSearxSearch] listId" listId
149 manager <- liftBase $ newManager tlsManagerSettings
150 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
151 , _fsp_manager = manager
156 --printDebug "[triggerSearxSearch] res" res
160 Right (SearxResponse { _srs_results }) -> do
161 let docs = hyperdataDocumentFromSearxResult <$> _srs_results
162 --printDebug "[triggerSearxSearch] docs" docs
163 -- docs :: [Either Text HyperdataDocument]
164 let docs' = catMaybes $ rightToMaybe <$> docs
165 Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
166 printDebug "[triggerSearxSearch] doc time" $
167 "[title] " <> (show _hd_title) <>
168 " :: [publication_year] " <> (show _hd_publication_year) <>
169 " :: [publication_date] " <> (show _hd_publication_date)
171 --_ <- flowDataText user (DataNew [docs']) (Multi EN) cId Nothing logStatus
172 let mCorpus = Nothing :: Maybe HyperdataCorpus
173 ids <- insertMasterDocs mCorpus (Multi EN) docs'
175 (_masterUserId, _masterRootId, masterCorpusId)
176 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") mCorpus
177 let gp = GroupWithPosTag l CoreNLP HashMap.empty
178 ngs <- buildNgramsLists user cId masterCorpusId Nothing gp
179 _userListId <- flowList_DbRepo listId ngs
183 pure $ jobLogSuccess jobLog
185 hyperdataDocumentFromSearxResult :: SearxResult -> Either T.Text HyperdataDocument
186 hyperdataDocumentFromSearxResult (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
187 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
188 let mGregorian = toGregorian <$> mDate
189 Right HyperdataDocument { _hd_bdd = Just "Searx"
192 , _hd_uniqId = Nothing
193 , _hd_uniqIdBdd = Nothing
195 , _hd_title = Just $ ("[" <> _sr_pubdate <> "] ") <> _sr_title
196 , _hd_authors = Nothing
197 , _hd_institutes = Nothing
198 , _hd_source = Just _sr_engine
199 , _hd_abstract = _sr_content
200 , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
201 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
202 , _hd_publication_month = sel2 <$> mGregorian
203 , _hd_publication_day = sel3 <$> mGregorian
204 , _hd_publication_hour = Nothing
205 , _hd_publication_minute = Nothing
206 , _hd_publication_second = Nothing
207 , _hd_language_iso2 = Just $ T.pack $ show EN }