]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/Searx.hs
Merge remote-tracking branch 'origin/adinapoli/issue-incorrect-pagination' into dev
[gargantext.git] / src / Gargantext / API / Node / Corpus / Searx.hs
1 {-# LANGUAGE TemplateHaskell #-}
2
3 module Gargantext.API.Node.Corpus.Searx where
4
5 import Control.Lens (view)
6 import Data.Aeson.TH (deriveJSON)
7 import Data.Either (Either(..))
8 import Data.Time.Calendar (Day, toGregorian)
9 import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
10 import Data.Tuple.Select (sel1, sel2, sel3)
11 import GHC.Generics (Generic)
12 import Gargantext.Core (Lang(..))
13 import Gargantext.Core.NLP (nlpServerGet)
14 import Gargantext.Core.Text.List (buildNgramsLists)
15 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
16 import Gargantext.Core.Text.Terms (TermType(..))
17 import Gargantext.Core.Types.Individu (User(..))
18 import Gargantext.Core.Utils.Prefix (unPrefix)
19 import Gargantext.Database.Action.Flow (insertMasterDocs) --, DataText(..))
20 import Gargantext.Database.Action.Flow.List (flowList_DbRepo)
21 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
22 import Gargantext.Database.Action.User (getUserId)
23 import Gargantext.Database.Admin.Config (userMaster)
24 import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
25 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
26 import Gargantext.Database.Admin.Types.Node (CorpusId, ListId)
27 import Gargantext.Database.Prelude (hasConfig)
28 import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
29 import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
30 import Gargantext.Prelude
31 import Gargantext.Prelude.Config
32 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
33 import Network.HTTP.Client
34 import Network.HTTP.Client.TLS
35 import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
36 import qualified Data.Aeson as Aeson
37 import qualified Data.HashMap.Strict as HashMap
38 import qualified Data.Text as T
39 import qualified Data.Text as Text
40 import qualified Gargantext.Core.Text.Corpus.API as API
41 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
42 import qualified Prelude
43
44 langToSearx :: Lang -> Text
45 langToSearx All = "en-US"
46 langToSearx x = (Text.toLower acronym) <> "-" <> acronym
47 where
48 acronym = (cs $ show x)
49
50 data SearxResult = SearxResult
51 { _sr_url :: Text
52 , _sr_title :: Text
53 , _sr_content :: Maybe Text
54 , _sr_engine :: Text
55 , _sr_score :: Double
56 , _sr_category :: Text
57 , _sr_pretty_url :: Text
58 , _sr_publishedDate :: Text -- "Nov 19, 2021"
59 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
60 }
61 deriving (Show, Eq, Generic)
62 -- , _sr_parsed_url
63 -- , _sr_engines
64 -- , _sr_positions
65
66 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
67
68 data SearxResponse = SearxResponse
69 { _srs_query :: Text
70 , _srs_number_of_results :: Int
71 , _srs_results :: [SearxResult] }
72 deriving (Show, Eq, Generic)
73 -- , _srs_answers
74 -- , _srs_corrections
75 -- , _srs_infoboxes
76 -- , _srs_suggestions :: [Text]
77 -- , _srs_unresponsive_engines :: [Text] }
78
79 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
80
81 data FetchSearxParams = FetchSearxParams
82 { _fsp_language :: Lang
83 , _fsp_manager :: Manager
84 , _fsp_pageno :: Int
85 , _fsp_query :: Text
86 , _fsp_url :: Text
87 }
88
89 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
90 fetchSearxPage (FetchSearxParams { _fsp_language
91 , _fsp_manager
92 , _fsp_pageno
93 , _fsp_query
94 , _fsp_url }) = do
95 -- searx search API:
96 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
97 req <- parseRequest $ T.unpack _fsp_url
98 let request = urlEncodedBody
99 [ --("category_general", "1")
100 ("q", encodeUtf8 _fsp_query)
101 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
102 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
103 --, ("time_range", "None")
104 , ("language", encodeUtf8 $ langToSearx _fsp_language)
105 , ("format", "json")
106 ] req
107 res <- httpLbs request _fsp_manager
108 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
109 pure dec
110
111 insertSearxResponse :: (MonadBase IO m, FlowCmdM env err m)
112 => User
113 -> CorpusId
114 -> ListId
115 -> Lang
116 -> Either Prelude.String SearxResponse
117 -> m ()
118 insertSearxResponse _ _ _ _ (Left _) = pure ()
119 insertSearxResponse user cId listId l (Right (SearxResponse { _srs_results })) = do
120 server <- view (nlpServerGet l)
121 -- docs :: [Either Text HyperdataDocument]
122 let docs = hyperdataDocumentFromSearxResult l <$> _srs_results
123 --printDebug "[triggerSearxSearch] docs" docs
124 let docs' = catMaybes $ rightToMaybe <$> docs
125 {-
126 Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
127 printDebug "[triggerSearxSearch] doc time" $
128 "[title] " <> (show _hd_title) <>
129 " :: [publication_year] " <> (show _hd_publication_year) <>
130 " :: [publication_date] " <> (show _hd_publication_date)
131 ) docs'
132 -}
133 --_ <- flowDataText user (DataNew [docs']) (Multi l) cId Nothing logStatus
134 let mCorpus = Nothing :: Maybe HyperdataCorpus
135 ids <- insertMasterDocs mCorpus (Multi l) docs'
136 _ <- Doc.add cId ids
137 (_masterUserId, _masterRootId, masterCorpusId)
138 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") mCorpus
139 let gp = GroupWithPosTag l server HashMap.empty
140 -- gp = case l of
141 -- FR -> GroupWithPosTag l Spacy HashMap.empty
142 -- _ -> GroupWithPosTag l CoreNLP HashMap.empty
143 ngs <- buildNgramsLists user cId masterCorpusId Nothing gp
144 _userListId <- flowList_DbRepo listId ngs
145
146 pure ()
147
148 -- TODO Make an async task out of this?
149 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m, MonadJobStatus m)
150 => User
151 -> CorpusId
152 -> API.Query
153 -> Lang
154 -> JobHandle m
155 -> m ()
156 triggerSearxSearch user cId q l jobHandle = do
157 let numPages = 100
158 markStarted numPages jobHandle
159
160 -- printDebug "[triggerSearxSearch] cId" cId
161 -- printDebug "[triggerSearxSearch] q" q
162 -- printDebug "[triggerSearxSearch] l" l
163 cfg <- view hasConfig
164 uId <- getUserId user
165 let surl = _gc_frame_searx_url cfg
166 -- printDebug "[triggerSearxSearch] surl" surl
167 mListId <- defaultListMaybe cId
168 listId <- case mListId of
169 Nothing -> do
170 listId <- getOrMkList cId uId
171 pure listId
172 Just listId -> pure listId
173
174 -- printDebug "[triggerSearxSearch] listId" listId
175
176 manager <- liftBase $ newManager tlsManagerSettings
177 _ <- mapM (\page -> do
178 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
179 , _fsp_manager = manager
180 , _fsp_pageno = page
181 , _fsp_query = q
182 , _fsp_url = surl }
183
184 insertSearxResponse user cId listId l res
185 markProgress page jobHandle
186
187 ) [1..numPages]
188 --printDebug "[triggerSearxSearch] res" res
189 markComplete jobHandle
190
191 hyperdataDocumentFromSearxResult :: Lang -> SearxResult -> Either T.Text HyperdataDocument
192 hyperdataDocumentFromSearxResult l (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
193 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
194 let mGregorian = toGregorian <$> mDate
195 Right HyperdataDocument { _hd_bdd = Just "Searx"
196 , _hd_doi = Nothing
197 , _hd_url = Nothing
198 , _hd_uniqId = Nothing
199 , _hd_uniqIdBdd = Nothing
200 , _hd_page = Nothing
201 , _hd_title = Just _sr_title
202 , _hd_authors = Nothing
203 , _hd_institutes = Nothing
204 , _hd_source = Just _sr_engine
205 , _hd_abstract = _sr_content
206 , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
207 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
208 , _hd_publication_month = sel2 <$> mGregorian
209 , _hd_publication_day = sel3 <$> mGregorian
210 , _hd_publication_hour = Nothing
211 , _hd_publication_minute = Nothing
212 , _hd_publication_second = Nothing
213 , _hd_language_iso2 = Just $ T.pack $ show l }