1 {-# LANGUAGE TemplateHaskell #-}
3 module Gargantext.API.Node.Corpus.Searx where
5 import Control.Lens (view)
6 import Data.Aeson.TH (deriveJSON)
7 import Data.Either (Either(..))
8 import Data.Time.Calendar (Day, toGregorian)
9 import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
10 import Data.Tuple.Select (sel1, sel2, sel3)
11 import GHC.Generics (Generic)
12 import Gargantext.Core (Lang(..))
13 import Gargantext.Core.NLP (nlpServerGet)
14 import Gargantext.Core.Text.List (buildNgramsLists)
15 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
16 import Gargantext.Core.Text.Terms (TermType(..))
17 import Gargantext.Core.Types.Individu (User(..))
18 import Gargantext.Core.Utils.Prefix (unPrefix)
19 import Gargantext.Database.Action.Flow (insertMasterDocs) --, DataText(..))
20 import Gargantext.Database.Action.Flow.List (flowList_DbRepo)
21 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
22 import Gargantext.Database.Action.User (getUserId)
23 import Gargantext.Database.Admin.Config (userMaster)
24 import Gargantext.Database.Query.Table.Node (insertDefaultNodeIfNotExists)
25 import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
26 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
27 import Gargantext.Database.Admin.Types.Node (CorpusId, ListId, NodeType(NodeTexts))
28 import Gargantext.Database.Prelude (hasConfig)
29 import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
30 import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
31 import Gargantext.Prelude
32 import Gargantext.Prelude.Config
33 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
34 import Network.HTTP.Client
35 import Network.HTTP.Client.TLS
36 import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
37 import qualified Data.Aeson as Aeson
38 import qualified Data.HashMap.Strict as HashMap
39 import qualified Data.Text as T
40 import qualified Data.Text as Text
41 import qualified Gargantext.Core.Text.Corpus.API as API
42 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
43 import qualified Prelude
45 langToSearx :: Lang -> Text
46 langToSearx All = "en-US"
47 langToSearx x = (Text.toLower acronym) <> "-" <> acronym
49 acronym = (cs $ show x)
51 data SearxResult = SearxResult
54 , _sr_content :: Maybe Text
57 , _sr_category :: Text
58 , _sr_pretty_url :: Text
59 , _sr_publishedDate :: Text -- "Nov 19, 2021"
60 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
62 deriving (Show, Eq, Generic)
67 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
69 data SearxResponse = SearxResponse
71 , _srs_number_of_results :: Int
72 , _srs_results :: [SearxResult] }
73 deriving (Show, Eq, Generic)
77 -- , _srs_suggestions :: [Text]
78 -- , _srs_unresponsive_engines :: [Text] }
80 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
82 data FetchSearxParams = FetchSearxParams
83 { _fsp_language :: Lang
84 , _fsp_manager :: Manager
90 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
91 fetchSearxPage (FetchSearxParams { _fsp_language
97 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
98 req <- parseRequest $ T.unpack _fsp_url
99 let request = urlEncodedBody
100 [ --("category_general", "1")
101 ("q", encodeUtf8 _fsp_query)
102 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
103 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
104 --, ("time_range", "None")
105 , ("language", encodeUtf8 $ langToSearx _fsp_language)
108 res <- httpLbs request _fsp_manager
109 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
112 insertSearxResponse :: (MonadBase IO m, FlowCmdM env err m)
117 -> Either Prelude.String SearxResponse
119 insertSearxResponse _ _ _ _ (Left _) = pure ()
120 insertSearxResponse user cId listId l (Right (SearxResponse { _srs_results })) = do
121 server <- view (nlpServerGet l)
122 -- docs :: [Either Text HyperdataDocument]
123 let docs = hyperdataDocumentFromSearxResult l <$> _srs_results
124 --printDebug "[triggerSearxSearch] docs" docs
125 let docs' = catMaybes $ rightToMaybe <$> docs
127 Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
128 printDebug "[triggerSearxSearch] doc time" $
129 "[title] " <> (show _hd_title) <>
130 " :: [publication_year] " <> (show _hd_publication_year) <>
131 " :: [publication_date] " <> (show _hd_publication_date)
134 --_ <- flowDataText user (DataNew [docs']) (Multi l) cId Nothing logStatus
135 let mCorpus = Nothing :: Maybe HyperdataCorpus
136 ids <- insertMasterDocs mCorpus (Multi l) docs'
138 (_masterUserId, _masterRootId, masterCorpusId)
139 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") mCorpus
140 let gp = GroupWithPosTag l server HashMap.empty
142 -- FR -> GroupWithPosTag l Spacy HashMap.empty
143 -- _ -> GroupWithPosTag l CoreNLP HashMap.empty
144 ngs <- buildNgramsLists user cId masterCorpusId Nothing gp
145 _userListId <- flowList_DbRepo listId ngs
149 -- TODO Make an async task out of this?
150 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m, MonadJobStatus m)
157 triggerSearxSearch user cId q l jobHandle = do
158 userId <- getUserId user
160 _tId <- insertDefaultNodeIfNotExists NodeTexts cId userId
163 markStarted numPages jobHandle
165 -- printDebug "[triggerSearxSearch] cId" cId
166 -- printDebug "[triggerSearxSearch] q" q
167 -- printDebug "[triggerSearxSearch] l" l
168 cfg <- view hasConfig
169 uId <- getUserId user
170 let surl = _gc_frame_searx_url cfg
171 -- printDebug "[triggerSearxSearch] surl" surl
172 mListId <- defaultListMaybe cId
173 listId <- case mListId of
175 listId <- getOrMkList cId uId
177 Just listId -> pure listId
179 -- printDebug "[triggerSearxSearch] listId" listId
181 manager <- liftBase $ newManager tlsManagerSettings
182 _ <- mapM (\page -> do
183 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
184 , _fsp_manager = manager
189 insertSearxResponse user cId listId l res
190 markProgress page jobHandle
193 --printDebug "[triggerSearxSearch] res" res
194 markComplete jobHandle
196 hyperdataDocumentFromSearxResult :: Lang -> SearxResult -> Either T.Text HyperdataDocument
197 hyperdataDocumentFromSearxResult l (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
198 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
199 let mGregorian = toGregorian <$> mDate
200 Right HyperdataDocument { _hd_bdd = Just "Searx"
203 , _hd_uniqId = Nothing
204 , _hd_uniqIdBdd = Nothing
206 , _hd_title = Just _sr_title
207 , _hd_authors = Nothing
208 , _hd_institutes = Nothing
209 , _hd_source = Just _sr_engine
210 , _hd_abstract = _sr_content
211 , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
212 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
213 , _hd_publication_month = sel2 <$> mGregorian
214 , _hd_publication_day = sel3 <$> mGregorian
215 , _hd_publication_hour = Nothing
216 , _hd_publication_minute = Nothing
217 , _hd_publication_second = Nothing
218 , _hd_language_iso2 = Just $ T.pack $ show l }