]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/Searx.hs
[searx] add HyperdataDocument generation to triggerSearxSearch
[gargantext.git] / src / Gargantext / API / Node / Corpus / Searx.hs
1 {-# LANGUAGE TemplateHaskell #-}
2
3 module Gargantext.API.Node.Corpus.Searx where
4
5
6
7 import Control.Lens (view)
8 import qualified Data.Aeson as Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import Data.Either (Either(..))
11 import qualified Data.Text as T
12 import Data.Time.Calendar (Day, toGregorian)
13 import Data.Time.Format (defaultTimeLocale, parseTimeM)
14 import Data.Tuple.Select (sel1, sel2, sel3)
15 import GHC.Generics (Generic)
16 import Network.HTTP.Client
17 import Network.HTTP.Client.TLS
18
19 import qualified Prelude as Prelude
20 import Protolude (encodeUtf8, Text)
21 import Gargantext.Prelude
22 import Gargantext.Prelude.Config
23
24 import Gargantext.Core (Lang(..))
25 import qualified Gargantext.Core.Text.Corpus.API as API
26 import Gargantext.Core.Utils.Prefix (unPrefix)
27 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
28 import Gargantext.Database.Admin.Config ()
29 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
30 import Gargantext.Database.Admin.Types.Node (CorpusId)
31 import Gargantext.Database.Prelude (hasConfig)
32 import Gargantext.Database.Query.Table.Node (defaultList)
33
34
35 langToSearx :: Lang -> Text
36 langToSearx EN = "en-US"
37 langToSearx FR = "fr-FR"
38 langToSearx All = "en-US"
39
40 data SearxResult = SearxResult
41 { _sr_url :: Text
42 , _sr_title :: Text
43 , _sr_content :: Maybe Text
44 , _sr_engine :: Text
45 , _sr_score :: Double
46 , _sr_category :: Text
47 , _sr_pretty_url :: Text
48 , _sr_publishedDate :: Text -- "Nov 19, 2021"
49 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
50 }
51 deriving (Show, Eq, Generic)
52 -- , _sr_parsed_url
53 -- , _sr_engines
54 -- , _sr_positions
55
56 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
57
58 data SearxResponse = SearxResponse
59 { _srs_query :: Text
60 , _srs_number_of_results :: Int
61 , _srs_results :: [SearxResult] }
62 deriving (Show, Eq, Generic)
63 -- , _srs_answers
64 -- , _srs_corrections
65 -- , _srs_infoboxes
66 -- , _srs_suggestions :: [Text]
67 -- , _srs_unresponsive_engines :: [Text] }
68
69 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
70
71 data FetchSearxParams = FetchSearxParams
72 { _fsp_language :: Lang
73 , _fsp_manager :: Manager
74 , _fsp_pageno :: Int
75 , _fsp_query :: Text
76 , _fsp_url :: Text
77 }
78
79 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
80 fetchSearxPage (FetchSearxParams { _fsp_language
81 , _fsp_manager
82 , _fsp_pageno
83 , _fsp_query
84 , _fsp_url }) = do
85 -- searx search API:
86 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
87 req <- parseRequest $ T.unpack _fsp_url
88 let request = urlEncodedBody
89 [ --("category_general", "1")
90 ("q", encodeUtf8 _fsp_query)
91 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
92 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
93 --, ("time_range", "None")
94 , ("language", encodeUtf8 $ langToSearx _fsp_language)
95 , ("format", "json")
96 ] req
97 res <- httpLbs request _fsp_manager
98 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
99 pure dec
100
101 -- TODO Make an async task out of this?
102 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
103 => CorpusId
104 -> API.Query
105 -> Lang
106 -> m ()
107 triggerSearxSearch cId q l = do
108 printDebug "[triggerSearxSearch] cId" cId
109 printDebug "[triggerSearxSearch] q" q
110 printDebug "[triggerSearxSearch] l" l
111 cfg <- view hasConfig
112 let surl = _gc_frame_searx_url cfg
113 printDebug "[triggerSearxSearch] surl" surl
114 listId <- defaultList cId
115 printDebug "[triggerSearxSearch] listId" listId
116
117 manager <- liftBase $ newManager tlsManagerSettings
118 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
119 , _fsp_manager = manager
120 , _fsp_pageno = 1
121 , _fsp_query = q
122 , _fsp_url = surl }
123
124 printDebug "[triggerSearxSearch] res" res
125
126 _ <- case res of
127 Left _ -> pure ()
128 Right (SearxResponse { _srs_results }) -> do
129 let docs = hyperdataDocumentFromSearxResult <$> _srs_results
130 printDebug "[triggerSearxSearch] docs" docs
131
132 pure ()
133
134 hyperdataDocumentFromSearxResult :: SearxResult -> Either T.Text HyperdataDocument
135 hyperdataDocumentFromSearxResult (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
136 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S" (T.unpack _sr_pubdate) :: Maybe Day
137 let mGregorian = toGregorian <$> mDate
138 Right HyperdataDocument { _hd_bdd = Just "Searx"
139 , _hd_doi = Nothing
140 , _hd_url = Nothing
141 , _hd_uniqId = Nothing
142 , _hd_uniqIdBdd = Nothing
143 , _hd_page = Nothing
144 , _hd_title = Just _sr_title
145 , _hd_authors = Nothing
146 , _hd_institutes = Nothing
147 , _hd_source = Just _sr_engine
148 , _hd_abstract = _sr_content
149 , _hd_publication_date = Just _sr_pubdate
150 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
151 , _hd_publication_month = sel2 <$> mGregorian
152 , _hd_publication_day = sel3 <$> mGregorian
153 , _hd_publication_hour = Nothing
154 , _hd_publication_minute = Nothing
155 , _hd_publication_second = Nothing
156 , _hd_language_iso2 = Just $ T.pack $ show EN }
157