1 {-# LANGUAGE TemplateHaskell #-}
3 module Gargantext.API.Node.Corpus.Searx where
7 import Control.Lens (view)
8 import qualified Data.Aeson as Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import Data.Either (Either(..))
11 import qualified Data.HashMap.Strict as HashMap
12 import qualified Data.Text as T
13 import Data.Time.Calendar (Day, toGregorian)
14 import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
15 import Data.Tuple.Select (sel1, sel2, sel3)
16 import GHC.Generics (Generic)
17 import Network.HTTP.Client
18 import Network.HTTP.Client.TLS
20 import qualified Prelude as Prelude
21 import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
22 import Gargantext.Prelude
23 import Gargantext.Prelude.Config
25 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
26 --import Gargantext.API.Admin.Types (HasSettings)
27 import Gargantext.API.Job (jobLogSuccess)
28 import Gargantext.Core (Lang(..), PosTagAlgo(..))
29 import qualified Gargantext.Core.Text.Corpus.API as API
30 import Gargantext.Core.Text.List (buildNgramsLists)
31 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
32 import Gargantext.Core.Text.Terms (TermType(..))
33 import Gargantext.Core.Types.Individu (User(..))
34 import Gargantext.Core.Utils.Prefix (unPrefix)
35 import Gargantext.Database.Action.Flow (insertMasterDocs) --, DataText(..))
36 import Gargantext.Database.Action.Flow.List (flowList_DbRepo)
37 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
38 import Gargantext.Database.Action.User (getUserId)
39 import Gargantext.Database.Admin.Config (userMaster)
40 import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
42 import Gargantext.Database.Admin.Types.Node (CorpusId, ListId)
43 import Gargantext.Database.Prelude (hasConfig)
44 import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
45 import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
46 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
48 langToSearx :: Lang -> Text
49 langToSearx EN = "en-US"
50 langToSearx FR = "fr-FR"
51 langToSearx All = "en-US"
53 data SearxResult = SearxResult
56 , _sr_content :: Maybe Text
59 , _sr_category :: Text
60 , _sr_pretty_url :: Text
61 , _sr_publishedDate :: Text -- "Nov 19, 2021"
62 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
64 deriving (Show, Eq, Generic)
69 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
71 data SearxResponse = SearxResponse
73 , _srs_number_of_results :: Int
74 , _srs_results :: [SearxResult] }
75 deriving (Show, Eq, Generic)
79 -- , _srs_suggestions :: [Text]
80 -- , _srs_unresponsive_engines :: [Text] }
82 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
84 data FetchSearxParams = FetchSearxParams
85 { _fsp_language :: Lang
86 , _fsp_manager :: Manager
92 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
93 fetchSearxPage (FetchSearxParams { _fsp_language
99 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
100 req <- parseRequest $ T.unpack _fsp_url
101 let request = urlEncodedBody
102 [ --("category_general", "1")
103 ("q", encodeUtf8 _fsp_query)
104 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
105 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
106 --, ("time_range", "None")
107 , ("language", encodeUtf8 $ langToSearx _fsp_language)
110 res <- httpLbs request _fsp_manager
111 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
114 insertSearxResponse :: (MonadBase IO m, FlowCmdM env err m)
119 -> Either Prelude.String SearxResponse
121 insertSearxResponse _ _ _ _ (Left _) = pure ()
122 insertSearxResponse user cId listId l (Right (SearxResponse { _srs_results })) = do
123 let docs = hyperdataDocumentFromSearxResult <$> _srs_results
124 --printDebug "[triggerSearxSearch] docs" docs
125 -- docs :: [Either Text HyperdataDocument]
126 let docs' = catMaybes $ rightToMaybe <$> docs
127 Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
128 printDebug "[triggerSearxSearch] doc time" $
129 "[title] " <> (show _hd_title) <>
130 " :: [publication_year] " <> (show _hd_publication_year) <>
131 " :: [publication_date] " <> (show _hd_publication_date)
133 --_ <- flowDataText user (DataNew [docs']) (Multi EN) cId Nothing logStatus
134 let mCorpus = Nothing :: Maybe HyperdataCorpus
135 ids <- insertMasterDocs mCorpus (Multi EN) docs'
137 (_masterUserId, _masterRootId, masterCorpusId)
138 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") mCorpus
139 let gp = GroupWithPosTag l CoreNLP HashMap.empty
140 ngs <- buildNgramsLists user cId masterCorpusId Nothing gp
141 _userListId <- flowList_DbRepo listId ngs
145 -- TODO Make an async task out of this?
146 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
153 triggerSearxSearch user cId q l logStatus = do
155 let jobLog = JobLog { _scst_succeeded = Just 0
156 , _scst_failed = Just 0
157 , _scst_remaining = Just numPages
158 , _scst_events = Just []
162 printDebug "[triggerSearxSearch] cId" cId
163 printDebug "[triggerSearxSearch] q" q
164 printDebug "[triggerSearxSearch] l" l
165 cfg <- view hasConfig
166 uId <- getUserId user
167 let surl = _gc_frame_searx_url cfg
168 printDebug "[triggerSearxSearch] surl" surl
169 mListId <- defaultListMaybe cId
170 listId <- case mListId of
172 --let failedJobLog = jobLogFailTotalWithMessage "[triggerSearxSearch] no list id" jobLog
173 --logStatus failedJobLog
175 listId <- getOrMkList cId uId
177 Just listId -> pure listId
179 printDebug "[triggerSearxSearch] listId" listId
181 manager <- liftBase $ newManager tlsManagerSettings
182 _ <- mapM (\page -> do
183 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
184 , _fsp_manager = manager
189 insertSearxResponse user cId listId l res
191 logStatus $ JobLog { _scst_succeeded = Just page
192 , _scst_failed = Just 0
193 , _scst_remaining = Just (numPages - page)
194 , _scst_events = Just [] }
196 --printDebug "[triggerSearxSearch] res" res
198 pure $ jobLogSuccess jobLog
200 hyperdataDocumentFromSearxResult :: SearxResult -> Either T.Text HyperdataDocument
201 hyperdataDocumentFromSearxResult (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
202 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
203 let mGregorian = toGregorian <$> mDate
204 Right HyperdataDocument { _hd_bdd = Just "Searx"
207 , _hd_uniqId = Nothing
208 , _hd_uniqIdBdd = Nothing
210 , _hd_title = Just $ ("[" <> _sr_pubdate <> "] ") <> _sr_title
211 , _hd_authors = Nothing
212 , _hd_institutes = Nothing
213 , _hd_source = Just _sr_engine
214 , _hd_abstract = _sr_content
215 , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
216 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
217 , _hd_publication_month = sel2 <$> mGregorian
218 , _hd_publication_day = sel3 <$> mGregorian
219 , _hd_publication_hour = Nothing
220 , _hd_publication_minute = Nothing
221 , _hd_publication_second = Nothing
222 , _hd_language_iso2 = Just $ T.pack $ show EN }