]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/Searx.hs
[searx] custom flow for searx docs insert
[gargantext.git] / src / Gargantext / API / Node / Corpus / Searx.hs
1 {-# LANGUAGE TemplateHaskell #-}
2
3 module Gargantext.API.Node.Corpus.Searx where
4
5
6
7 import Control.Lens (view)
8 import qualified Data.Aeson as Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import Data.Either (Either(..))
11 import qualified Data.HashMap.Strict as HashMap
12 import qualified Data.Text as T
13 import Data.Time.Calendar (Day, toGregorian)
14 import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
15 import Data.Tuple.Select (sel1, sel2, sel3)
16 import GHC.Generics (Generic)
17 import Network.HTTP.Client
18 import Network.HTTP.Client.TLS
19
20 import qualified Prelude as Prelude
21 import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
22 import Gargantext.Prelude
23 import Gargantext.Prelude.Config
24
25 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
26 --import Gargantext.API.Admin.Types (HasSettings)
27 import Gargantext.API.Job (jobLogSuccess)
28 import Gargantext.Core (Lang(..), PosTagAlgo(..))
29 import qualified Gargantext.Core.Text.Corpus.API as API
30 import Gargantext.Core.Text.List (buildNgramsLists)
31 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
32 import Gargantext.Core.Text.Terms (TermType(..))
33 import Gargantext.Core.Types.Individu (User(..))
34 import Gargantext.Core.Utils.Prefix (unPrefix)
35 import Gargantext.Database.Action.Flow (insertMasterDocs) --, DataText(..))
36 import Gargantext.Database.Action.Flow.List (flowList_DbRepo)
37 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
38 import Gargantext.Database.Action.User (getUserId)
39 import Gargantext.Database.Admin.Config (userMaster)
40 import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
41 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
42 import Gargantext.Database.Admin.Types.Node (CorpusId)
43 import Gargantext.Database.Prelude (hasConfig)
44 import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
45 import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
46 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
47
48 langToSearx :: Lang -> Text
49 langToSearx EN = "en-US"
50 langToSearx FR = "fr-FR"
51 langToSearx All = "en-US"
52
53 data SearxResult = SearxResult
54 { _sr_url :: Text
55 , _sr_title :: Text
56 , _sr_content :: Maybe Text
57 , _sr_engine :: Text
58 , _sr_score :: Double
59 , _sr_category :: Text
60 , _sr_pretty_url :: Text
61 , _sr_publishedDate :: Text -- "Nov 19, 2021"
62 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
63 }
64 deriving (Show, Eq, Generic)
65 -- , _sr_parsed_url
66 -- , _sr_engines
67 -- , _sr_positions
68
69 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
70
71 data SearxResponse = SearxResponse
72 { _srs_query :: Text
73 , _srs_number_of_results :: Int
74 , _srs_results :: [SearxResult] }
75 deriving (Show, Eq, Generic)
76 -- , _srs_answers
77 -- , _srs_corrections
78 -- , _srs_infoboxes
79 -- , _srs_suggestions :: [Text]
80 -- , _srs_unresponsive_engines :: [Text] }
81
82 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
83
84 data FetchSearxParams = FetchSearxParams
85 { _fsp_language :: Lang
86 , _fsp_manager :: Manager
87 , _fsp_pageno :: Int
88 , _fsp_query :: Text
89 , _fsp_url :: Text
90 }
91
92 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
93 fetchSearxPage (FetchSearxParams { _fsp_language
94 , _fsp_manager
95 , _fsp_pageno
96 , _fsp_query
97 , _fsp_url }) = do
98 -- searx search API:
99 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
100 req <- parseRequest $ T.unpack _fsp_url
101 let request = urlEncodedBody
102 [ --("category_general", "1")
103 ("q", encodeUtf8 _fsp_query)
104 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
105 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
106 --, ("time_range", "None")
107 , ("language", encodeUtf8 $ langToSearx _fsp_language)
108 , ("format", "json")
109 ] req
110 res <- httpLbs request _fsp_manager
111 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
112 pure dec
113
114 -- TODO Make an async task out of this?
115 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
116 => User
117 -> CorpusId
118 -> API.Query
119 -> Lang
120 -> (JobLog -> m ())
121 -> m JobLog
122 triggerSearxSearch user cId q l logStatus = do
123 let jobLog = JobLog { _scst_succeeded = Just 1
124 , _scst_failed = Just 0
125 , _scst_remaining = Just 1
126 , _scst_events = Just []
127 }
128 logStatus jobLog
129
130 printDebug "[triggerSearxSearch] cId" cId
131 printDebug "[triggerSearxSearch] q" q
132 printDebug "[triggerSearxSearch] l" l
133 cfg <- view hasConfig
134 uId <- getUserId user
135 let surl = _gc_frame_searx_url cfg
136 printDebug "[triggerSearxSearch] surl" surl
137 mListId <- defaultListMaybe cId
138 listId <- case mListId of
139 Nothing -> do
140 --let failedJobLog = jobLogFailTotalWithMessage "[triggerSearxSearch] no list id" jobLog
141 --logStatus failedJobLog
142 --pure failedJobLog
143 listId <- getOrMkList cId uId
144 pure listId
145 Just listId -> pure listId
146
147 printDebug "[triggerSearxSearch] listId" listId
148
149 manager <- liftBase $ newManager tlsManagerSettings
150 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
151 , _fsp_manager = manager
152 , _fsp_pageno = 1
153 , _fsp_query = q
154 , _fsp_url = surl }
155
156 --printDebug "[triggerSearxSearch] res" res
157
158 case res of
159 Left _ -> pure ()
160 Right (SearxResponse { _srs_results }) -> do
161 let docs = hyperdataDocumentFromSearxResult <$> _srs_results
162 --printDebug "[triggerSearxSearch] docs" docs
163 -- docs :: [Either Text HyperdataDocument]
164 let docs' = catMaybes $ rightToMaybe <$> docs
165 Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
166 printDebug "[triggerSearxSearch] doc time" $
167 "[title] " <> (show _hd_title) <>
168 " :: [publication_year] " <> (show _hd_publication_year) <>
169 " :: [publication_date] " <> (show _hd_publication_date)
170 ) docs'
171 --_ <- flowDataText user (DataNew [docs']) (Multi EN) cId Nothing logStatus
172 let mCorpus = Nothing :: Maybe HyperdataCorpus
173 ids <- insertMasterDocs mCorpus (Multi EN) docs'
174 _ <- Doc.add cId ids
175 (_masterUserId, _masterRootId, masterCorpusId)
176 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") mCorpus
177 let gp = GroupWithPosTag l CoreNLP HashMap.empty
178 ngs <- buildNgramsLists user cId masterCorpusId Nothing gp
179 _userListId <- flowList_DbRepo listId ngs
180
181 pure ()
182
183 pure $ jobLogSuccess jobLog
184
185 hyperdataDocumentFromSearxResult :: SearxResult -> Either T.Text HyperdataDocument
186 hyperdataDocumentFromSearxResult (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
187 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
188 let mGregorian = toGregorian <$> mDate
189 Right HyperdataDocument { _hd_bdd = Just "Searx"
190 , _hd_doi = Nothing
191 , _hd_url = Nothing
192 , _hd_uniqId = Nothing
193 , _hd_uniqIdBdd = Nothing
194 , _hd_page = Nothing
195 , _hd_title = Just $ ("[" <> _sr_pubdate <> "] ") <> _sr_title
196 , _hd_authors = Nothing
197 , _hd_institutes = Nothing
198 , _hd_source = Just _sr_engine
199 , _hd_abstract = _sr_content
200 , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
201 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
202 , _hd_publication_month = sel2 <$> mGregorian
203 , _hd_publication_day = sel3 <$> mGregorian
204 , _hd_publication_hour = Nothing
205 , _hd_publication_minute = Nothing
206 , _hd_publication_second = Nothing
207 , _hd_language_iso2 = Just $ T.pack $ show EN }
208