]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/Searx.hs
[searx] parser works now
[gargantext.git] / src / Gargantext / API / Node / Corpus / Searx.hs
1 {-# LANGUAGE TemplateHaskell #-}
2
3 module Gargantext.API.Node.Corpus.Searx where
4
5
6
7 import Control.Lens (view)
8 import qualified Data.Aeson as Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import Data.Either (Either(..))
11 import qualified Data.Text as T
12 import Data.Time.Calendar (Day, toGregorian)
13 import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
14 import Data.Tuple.Select (sel1, sel2, sel3)
15 import GHC.Generics (Generic)
16 import Network.HTTP.Client
17 import Network.HTTP.Client.TLS
18
19 import qualified Prelude as Prelude
20 import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
21 import Gargantext.Prelude
22 import Gargantext.Prelude.Config
23
24 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
25 --import Gargantext.API.Admin.Types (HasSettings)
26 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotalWithMessage)
27 import Gargantext.Core (Lang(..))
28 import qualified Gargantext.Core.Text.Corpus.API as API
29 import Gargantext.Core.Text.Terms (TermType(..))
30 import Gargantext.Core.Types.Individu (User(..))
31 import Gargantext.Core.Utils.Prefix (unPrefix)
32 import Gargantext.Database.Action.Flow (flowDataText, DataText(..))
33 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
34 import Gargantext.Database.Admin.Config ()
35 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
36 import Gargantext.Database.Admin.Types.Node (CorpusId)
37 import Gargantext.Database.Prelude (hasConfig)
38 import Gargantext.Database.Query.Table.Node (defaultListMaybe)
39
40
41 langToSearx :: Lang -> Text
42 langToSearx EN = "en-US"
43 langToSearx FR = "fr-FR"
44 langToSearx All = "en-US"
45
46 data SearxResult = SearxResult
47 { _sr_url :: Text
48 , _sr_title :: Text
49 , _sr_content :: Maybe Text
50 , _sr_engine :: Text
51 , _sr_score :: Double
52 , _sr_category :: Text
53 , _sr_pretty_url :: Text
54 , _sr_publishedDate :: Text -- "Nov 19, 2021"
55 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
56 }
57 deriving (Show, Eq, Generic)
58 -- , _sr_parsed_url
59 -- , _sr_engines
60 -- , _sr_positions
61
62 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
63
64 data SearxResponse = SearxResponse
65 { _srs_query :: Text
66 , _srs_number_of_results :: Int
67 , _srs_results :: [SearxResult] }
68 deriving (Show, Eq, Generic)
69 -- , _srs_answers
70 -- , _srs_corrections
71 -- , _srs_infoboxes
72 -- , _srs_suggestions :: [Text]
73 -- , _srs_unresponsive_engines :: [Text] }
74
75 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
76
77 data FetchSearxParams = FetchSearxParams
78 { _fsp_language :: Lang
79 , _fsp_manager :: Manager
80 , _fsp_pageno :: Int
81 , _fsp_query :: Text
82 , _fsp_url :: Text
83 }
84
85 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
86 fetchSearxPage (FetchSearxParams { _fsp_language
87 , _fsp_manager
88 , _fsp_pageno
89 , _fsp_query
90 , _fsp_url }) = do
91 -- searx search API:
92 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
93 req <- parseRequest $ T.unpack _fsp_url
94 let request = urlEncodedBody
95 [ --("category_general", "1")
96 ("q", encodeUtf8 _fsp_query)
97 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
98 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
99 --, ("time_range", "None")
100 , ("language", encodeUtf8 $ langToSearx _fsp_language)
101 , ("format", "json")
102 ] req
103 res <- httpLbs request _fsp_manager
104 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
105 pure dec
106
107 -- TODO Make an async task out of this?
108 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
109 => User
110 -> CorpusId
111 -> API.Query
112 -> Lang
113 -> (JobLog -> m ())
114 -> m JobLog
115 triggerSearxSearch user cId q l logStatus = do
116 let jobLog = JobLog { _scst_succeeded = Just 1
117 , _scst_failed = Just 0
118 , _scst_remaining = Just 1
119 , _scst_events = Just []
120 }
121 logStatus jobLog
122
123 printDebug "[triggerSearxSearch] cId" cId
124 printDebug "[triggerSearxSearch] q" q
125 printDebug "[triggerSearxSearch] l" l
126 cfg <- view hasConfig
127 let surl = _gc_frame_searx_url cfg
128 printDebug "[triggerSearxSearch] surl" surl
129 mListId <- defaultListMaybe cId
130 case mListId of
131 Nothing -> do
132 let failedJobLog = jobLogFailTotalWithMessage "[triggerSearxSearch] no list id" jobLog
133 logStatus failedJobLog
134 pure failedJobLog
135 Just listId -> do
136 printDebug "[triggerSearxSearch] listId" listId
137
138 manager <- liftBase $ newManager tlsManagerSettings
139 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
140 , _fsp_manager = manager
141 , _fsp_pageno = 1
142 , _fsp_query = q
143 , _fsp_url = surl }
144
145 --printDebug "[triggerSearxSearch] res" res
146
147 case res of
148 Left _ -> pure ()
149 Right (SearxResponse { _srs_results }) -> do
150 let docs = hyperdataDocumentFromSearxResult <$> _srs_results
151 --printDebug "[triggerSearxSearch] docs" docs
152 -- docs :: [Either Text HyperdataDocument]
153 let docs' = catMaybes $ rightToMaybe <$> docs
154 Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
155 printDebug "[triggerSearxSearch] doc time" $
156 "[title] " <> (show _hd_title) <>
157 " :: [publication_year] " <> (show _hd_publication_year) <>
158 " :: [publication_date] " <> (show _hd_publication_date)
159 ) docs'
160 _ <- flowDataText user (DataNew [docs']) (Multi EN) cId Nothing logStatus
161 pure ()
162
163 pure $ jobLogSuccess jobLog
164
165 hyperdataDocumentFromSearxResult :: SearxResult -> Either T.Text HyperdataDocument
166 hyperdataDocumentFromSearxResult (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
167 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
168 let mGregorian = toGregorian <$> mDate
169 Right HyperdataDocument { _hd_bdd = Just "Searx"
170 , _hd_doi = Nothing
171 , _hd_url = Nothing
172 , _hd_uniqId = Nothing
173 , _hd_uniqIdBdd = Nothing
174 , _hd_page = Nothing
175 , _hd_title = Just $ ("[" <> _sr_pubdate <> "] ") <> _sr_title
176 , _hd_authors = Nothing
177 , _hd_institutes = Nothing
178 , _hd_source = Just _sr_engine
179 , _hd_abstract = _sr_content
180 , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
181 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
182 , _hd_publication_month = sel2 <$> mGregorian
183 , _hd_publication_day = sel3 <$> mGregorian
184 , _hd_publication_hour = Nothing
185 , _hd_publication_minute = Nothing
186 , _hd_publication_second = Nothing
187 , _hd_language_iso2 = Just $ T.pack $ show EN }
188