1 {-# LANGUAGE TemplateHaskell #-}
3 module Gargantext.API.Node.Corpus.Searx where
7 import Control.Lens (view)
8 import qualified Data.Aeson as Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import Data.Either (Either(..))
11 import qualified Data.HashMap.Strict as HashMap
12 import qualified Data.Text as T
13 import Data.Time.Calendar (Day, toGregorian)
14 import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
15 import Data.Tuple.Select (sel1, sel2, sel3)
16 import GHC.Generics (Generic)
17 import Network.HTTP.Client
18 import Network.HTTP.Client.TLS
20 import qualified Prelude
21 import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
22 import Gargantext.Prelude
23 import Gargantext.Prelude.Config
25 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
26 --import Gargantext.API.Admin.Types (HasSettings)
27 import Gargantext.API.Job (jobLogSuccess)
28 import Gargantext.Core (Lang(..), PosTagAlgo(..))
29 import qualified Gargantext.Core.Text.Corpus.API as API
30 import Gargantext.Core.Text.List (buildNgramsLists)
31 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
32 import Gargantext.Core.Text.Terms (TermType(..))
33 import Gargantext.Core.Types.Individu (User(..))
34 import Gargantext.Core.Utils.Prefix (unPrefix)
35 import Gargantext.Database.Action.Flow (insertMasterDocs) --, DataText(..))
36 import Gargantext.Database.Action.Flow.List (flowList_DbRepo)
37 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
38 import Gargantext.Database.Action.User (getUserId)
39 import Gargantext.Database.Admin.Config (userMaster)
40 import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
42 import Gargantext.Database.Admin.Types.Node (CorpusId, ListId)
43 import Gargantext.Database.Prelude (hasConfig)
44 import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
45 import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
46 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
48 langToSearx :: Lang -> Text
49 langToSearx EN = "en-US"
50 langToSearx FR = "fr-FR"
51 langToSearx All = "en-US"
53 data SearxResult = SearxResult
56 , _sr_content :: Maybe Text
59 , _sr_category :: Text
60 , _sr_pretty_url :: Text
61 , _sr_publishedDate :: Text -- "Nov 19, 2021"
62 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
64 deriving (Show, Eq, Generic)
69 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
71 data SearxResponse = SearxResponse
73 , _srs_number_of_results :: Int
74 , _srs_results :: [SearxResult] }
75 deriving (Show, Eq, Generic)
79 -- , _srs_suggestions :: [Text]
80 -- , _srs_unresponsive_engines :: [Text] }
82 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
84 data FetchSearxParams = FetchSearxParams
85 { _fsp_language :: Lang
86 , _fsp_manager :: Manager
92 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
93 fetchSearxPage (FetchSearxParams { _fsp_language
99 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
100 req <- parseRequest $ T.unpack _fsp_url
101 let request = urlEncodedBody
102 [ --("category_general", "1")
103 ("q", encodeUtf8 _fsp_query)
104 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
105 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
106 --, ("time_range", "None")
107 , ("language", encodeUtf8 $ langToSearx _fsp_language)
110 res <- httpLbs request _fsp_manager
111 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
114 insertSearxResponse :: (MonadBase IO m, FlowCmdM env err m)
119 -> Either Prelude.String SearxResponse
121 insertSearxResponse _ _ _ _ (Left _) = pure ()
122 insertSearxResponse user cId listId l (Right (SearxResponse { _srs_results })) = do
123 -- docs :: [Either Text HyperdataDocument]
124 let docs = hyperdataDocumentFromSearxResult l <$> _srs_results
125 --printDebug "[triggerSearxSearch] docs" docs
126 let docs' = catMaybes $ rightToMaybe <$> docs
128 Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
129 printDebug "[triggerSearxSearch] doc time" $
130 "[title] " <> (show _hd_title) <>
131 " :: [publication_year] " <> (show _hd_publication_year) <>
132 " :: [publication_date] " <> (show _hd_publication_date)
135 --_ <- flowDataText user (DataNew [docs']) (Multi l) cId Nothing logStatus
136 let mCorpus = Nothing :: Maybe HyperdataCorpus
137 ids <- insertMasterDocs mCorpus (Multi l) docs'
139 (_masterUserId, _masterRootId, masterCorpusId)
140 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") mCorpus
143 FR -> GroupWithPosTag l Spacy HashMap.empty
144 _ -> GroupWithPosTag l CoreNLP HashMap.empty
145 ngs <- buildNgramsLists user cId masterCorpusId Nothing gp
146 _userListId <- flowList_DbRepo listId ngs
150 -- TODO Make an async task out of this?
151 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
158 triggerSearxSearch user cId q l logStatus = do
160 let jobLog = JobLog { _scst_succeeded = Just 0
161 , _scst_failed = Just 0
162 , _scst_remaining = Just numPages
163 , _scst_events = Just []
167 -- printDebug "[triggerSearxSearch] cId" cId
168 -- printDebug "[triggerSearxSearch] q" q
169 -- printDebug "[triggerSearxSearch] l" l
170 cfg <- view hasConfig
171 uId <- getUserId user
172 let surl = _gc_frame_searx_url cfg
173 -- printDebug "[triggerSearxSearch] surl" surl
174 mListId <- defaultListMaybe cId
175 listId <- case mListId of
177 listId <- getOrMkList cId uId
179 Just listId -> pure listId
181 -- printDebug "[triggerSearxSearch] listId" listId
183 manager <- liftBase $ newManager tlsManagerSettings
184 _ <- mapM (\page -> do
185 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
186 , _fsp_manager = manager
191 insertSearxResponse user cId listId l res
193 logStatus $ JobLog { _scst_succeeded = Just page
194 , _scst_failed = Just 0
195 , _scst_remaining = Just (numPages - page)
196 , _scst_events = Just [] }
198 --printDebug "[triggerSearxSearch] res" res
200 pure $ jobLogSuccess jobLog
202 hyperdataDocumentFromSearxResult :: Lang -> SearxResult -> Either T.Text HyperdataDocument
203 hyperdataDocumentFromSearxResult l (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
204 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
205 let mGregorian = toGregorian <$> mDate
206 Right HyperdataDocument { _hd_bdd = Just "Searx"
209 , _hd_uniqId = Nothing
210 , _hd_uniqIdBdd = Nothing
212 , _hd_title = Just _sr_title
213 , _hd_authors = Nothing
214 , _hd_institutes = Nothing
215 , _hd_source = Just _sr_engine
216 , _hd_abstract = _sr_content
217 , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
218 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
219 , _hd_publication_month = sel2 <$> mGregorian
220 , _hd_publication_day = sel3 <$> mGregorian
221 , _hd_publication_hour = Nothing
222 , _hd_publication_minute = Nothing
223 , _hd_publication_second = Nothing
224 , _hd_language_iso2 = Just $ T.pack $ show l }