1 {-# LANGUAGE TemplateHaskell #-}
3 module Gargantext.API.Node.Corpus.Searx where
5 import Control.Lens (view)
6 import Data.Aeson.TH (deriveJSON)
7 import Data.Either (Either(..))
8 import Data.Time.Calendar (Day, toGregorian)
9 import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
10 import Data.Tuple.Select (sel1, sel2, sel3)
11 import GHC.Generics (Generic)
12 import Gargantext.Core (Lang(..))
13 import Gargantext.Core.NLP (nlpServerGet)
14 import Gargantext.Core.Text.List (buildNgramsLists)
15 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
16 import Gargantext.Core.Text.Terms (TermType(..))
17 import Gargantext.Core.Types.Individu (User(..))
18 import Gargantext.Core.Utils.Prefix (unPrefix)
19 import Gargantext.Database.Action.Flow (insertMasterDocs) --, DataText(..))
20 import Gargantext.Database.Action.Flow.List (flowList_DbRepo)
21 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
22 import Gargantext.Database.Action.User (getUserId)
23 import Gargantext.Database.Admin.Config (userMaster)
24 import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
25 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
26 import Gargantext.Database.Admin.Types.Node (CorpusId, ListId)
27 import Gargantext.Database.Prelude (hasConfig)
28 import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
29 import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
30 import Gargantext.Prelude
31 import Gargantext.Prelude.Config
32 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
33 import Network.HTTP.Client
34 import Network.HTTP.Client.TLS
35 import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
36 import qualified Data.Aeson as Aeson
37 import qualified Data.HashMap.Strict as HashMap
38 import qualified Data.Text as T
39 import qualified Data.Text as Text
40 import qualified Gargantext.Core.Text.Corpus.API as API
41 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
42 import qualified Prelude
44 langToSearx :: Lang -> Text
45 langToSearx All = "en-US"
46 langToSearx x = (Text.toLower acronym) <> "-" <> acronym
48 acronym = (cs $ show x)
50 data SearxResult = SearxResult
53 , _sr_content :: Maybe Text
56 , _sr_category :: Text
57 , _sr_pretty_url :: Text
58 , _sr_publishedDate :: Text -- "Nov 19, 2021"
59 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
61 deriving (Show, Eq, Generic)
66 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
68 data SearxResponse = SearxResponse
70 , _srs_number_of_results :: Int
71 , _srs_results :: [SearxResult] }
72 deriving (Show, Eq, Generic)
76 -- , _srs_suggestions :: [Text]
77 -- , _srs_unresponsive_engines :: [Text] }
79 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
81 data FetchSearxParams = FetchSearxParams
82 { _fsp_language :: Lang
83 , _fsp_manager :: Manager
89 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
90 fetchSearxPage (FetchSearxParams { _fsp_language
96 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
97 req <- parseRequest $ T.unpack _fsp_url
98 let request = urlEncodedBody
99 [ --("category_general", "1")
100 ("q", encodeUtf8 _fsp_query)
101 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
102 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
103 --, ("time_range", "None")
104 , ("language", encodeUtf8 $ langToSearx _fsp_language)
107 res <- httpLbs request _fsp_manager
108 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
111 insertSearxResponse :: (MonadBase IO m, FlowCmdM env err m)
116 -> Either Prelude.String SearxResponse
118 insertSearxResponse _ _ _ _ (Left _) = pure ()
119 insertSearxResponse user cId listId l (Right (SearxResponse { _srs_results })) = do
120 server <- view (nlpServerGet l)
121 -- docs :: [Either Text HyperdataDocument]
122 let docs = hyperdataDocumentFromSearxResult l <$> _srs_results
123 --printDebug "[triggerSearxSearch] docs" docs
124 let docs' = catMaybes $ rightToMaybe <$> docs
126 Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
127 printDebug "[triggerSearxSearch] doc time" $
128 "[title] " <> (show _hd_title) <>
129 " :: [publication_year] " <> (show _hd_publication_year) <>
130 " :: [publication_date] " <> (show _hd_publication_date)
133 --_ <- flowDataText user (DataNew [docs']) (Multi l) cId Nothing logStatus
134 let mCorpus = Nothing :: Maybe HyperdataCorpus
135 ids <- insertMasterDocs mCorpus (Multi l) docs'
137 (_masterUserId, _masterRootId, masterCorpusId)
138 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") mCorpus
139 let gp = GroupWithPosTag l server HashMap.empty
141 -- FR -> GroupWithPosTag l Spacy HashMap.empty
142 -- _ -> GroupWithPosTag l CoreNLP HashMap.empty
143 ngs <- buildNgramsLists user cId masterCorpusId Nothing gp
144 _userListId <- flowList_DbRepo listId ngs
148 -- TODO Make an async task out of this?
149 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m, MonadJobStatus m)
156 triggerSearxSearch user cId q l jobHandle = do
158 markStarted numPages jobHandle
160 -- printDebug "[triggerSearxSearch] cId" cId
161 -- printDebug "[triggerSearxSearch] q" q
162 -- printDebug "[triggerSearxSearch] l" l
163 cfg <- view hasConfig
164 uId <- getUserId user
165 let surl = _gc_frame_searx_url cfg
166 -- printDebug "[triggerSearxSearch] surl" surl
167 mListId <- defaultListMaybe cId
168 listId <- case mListId of
170 listId <- getOrMkList cId uId
172 Just listId -> pure listId
174 -- printDebug "[triggerSearxSearch] listId" listId
176 manager <- liftBase $ newManager tlsManagerSettings
177 _ <- mapM (\page -> do
178 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
179 , _fsp_manager = manager
184 insertSearxResponse user cId listId l res
185 markProgress page jobHandle
188 --printDebug "[triggerSearxSearch] res" res
189 markComplete jobHandle
191 hyperdataDocumentFromSearxResult :: Lang -> SearxResult -> Either T.Text HyperdataDocument
192 hyperdataDocumentFromSearxResult l (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
193 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
194 let mGregorian = toGregorian <$> mDate
195 Right HyperdataDocument { _hd_bdd = Just "Searx"
198 , _hd_uniqId = Nothing
199 , _hd_uniqIdBdd = Nothing
201 , _hd_title = Just _sr_title
202 , _hd_authors = Nothing
203 , _hd_institutes = Nothing
204 , _hd_source = Just _sr_engine
205 , _hd_abstract = _sr_content
206 , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
207 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
208 , _hd_publication_month = sel2 <$> mGregorian
209 , _hd_publication_day = sel3 <$> mGregorian
210 , _hd_publication_hour = Nothing
211 , _hd_publication_minute = Nothing
212 , _hd_publication_second = Nothing
213 , _hd_language_iso2 = Just $ T.pack $ show l }