]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/Searx.hs
Merge remote-tracking branch 'origin/adinapoli/investigate-issue-192' into dev
[gargantext.git] / src / Gargantext / API / Node / Corpus / Searx.hs
1 {-# LANGUAGE TemplateHaskell #-}
2
3 module Gargantext.API.Node.Corpus.Searx where
4
5
6
7 import Control.Lens (view)
8 import qualified Data.Aeson as Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import Data.Either (Either(..))
11 import qualified Data.HashMap.Strict as HashMap
12 import qualified Data.Text as T
13 import Data.Time.Calendar (Day, toGregorian)
14 import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
15 import Data.Tuple.Select (sel1, sel2, sel3)
16 import GHC.Generics (Generic)
17 import Network.HTTP.Client
18 import Network.HTTP.Client.TLS
19
20 import qualified Prelude
21 import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
22 import Gargantext.Prelude
23 import Gargantext.Prelude.Config
24
25 import Gargantext.Core (Lang(..))
26 import Gargantext.Core.NLP (nlpServerGet)
27 import qualified Gargantext.Core.Text.Corpus.API as API
28 import Gargantext.Core.Text.List (buildNgramsLists)
29 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
30 import Gargantext.Core.Text.Terms (TermType(..))
31 import Gargantext.Core.Types.Individu (User(..))
32 import Gargantext.Core.Utils.Prefix (unPrefix)
33 import Gargantext.Database.Action.Flow (insertMasterDocs) --, DataText(..))
34 import Gargantext.Database.Action.Flow.List (flowList_DbRepo)
35 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
36 import Gargantext.Database.Action.User (getUserId)
37 import Gargantext.Database.Admin.Config (userMaster)
38 import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
39 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
40 import Gargantext.Database.Admin.Types.Node (CorpusId, ListId)
41 import Gargantext.Database.Prelude (hasConfig)
42 import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
43 import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
44 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
45 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
46
47 langToSearx :: Lang -> Text
48 langToSearx EN = "en-US"
49 langToSearx FR = "fr-FR"
50 langToSearx All = "en-US"
51
52 data SearxResult = SearxResult
53 { _sr_url :: Text
54 , _sr_title :: Text
55 , _sr_content :: Maybe Text
56 , _sr_engine :: Text
57 , _sr_score :: Double
58 , _sr_category :: Text
59 , _sr_pretty_url :: Text
60 , _sr_publishedDate :: Text -- "Nov 19, 2021"
61 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
62 }
63 deriving (Show, Eq, Generic)
64 -- , _sr_parsed_url
65 -- , _sr_engines
66 -- , _sr_positions
67
68 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
69
70 data SearxResponse = SearxResponse
71 { _srs_query :: Text
72 , _srs_number_of_results :: Int
73 , _srs_results :: [SearxResult] }
74 deriving (Show, Eq, Generic)
75 -- , _srs_answers
76 -- , _srs_corrections
77 -- , _srs_infoboxes
78 -- , _srs_suggestions :: [Text]
79 -- , _srs_unresponsive_engines :: [Text] }
80
81 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
82
83 data FetchSearxParams = FetchSearxParams
84 { _fsp_language :: Lang
85 , _fsp_manager :: Manager
86 , _fsp_pageno :: Int
87 , _fsp_query :: Text
88 , _fsp_url :: Text
89 }
90
91 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
92 fetchSearxPage (FetchSearxParams { _fsp_language
93 , _fsp_manager
94 , _fsp_pageno
95 , _fsp_query
96 , _fsp_url }) = do
97 -- searx search API:
98 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
99 req <- parseRequest $ T.unpack _fsp_url
100 let request = urlEncodedBody
101 [ --("category_general", "1")
102 ("q", encodeUtf8 _fsp_query)
103 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
104 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
105 --, ("time_range", "None")
106 , ("language", encodeUtf8 $ langToSearx _fsp_language)
107 , ("format", "json")
108 ] req
109 res <- httpLbs request _fsp_manager
110 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
111 pure dec
112
113 insertSearxResponse :: (MonadBase IO m, FlowCmdM env err m)
114 => User
115 -> CorpusId
116 -> ListId
117 -> Lang
118 -> Either Prelude.String SearxResponse
119 -> m ()
120 insertSearxResponse _ _ _ _ (Left _) = pure ()
121 insertSearxResponse user cId listId l (Right (SearxResponse { _srs_results })) = do
122 server <- view (nlpServerGet l)
123 -- docs :: [Either Text HyperdataDocument]
124 let docs = hyperdataDocumentFromSearxResult l <$> _srs_results
125 --printDebug "[triggerSearxSearch] docs" docs
126 let docs' = catMaybes $ rightToMaybe <$> docs
127 {-
128 Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
129 printDebug "[triggerSearxSearch] doc time" $
130 "[title] " <> (show _hd_title) <>
131 " :: [publication_year] " <> (show _hd_publication_year) <>
132 " :: [publication_date] " <> (show _hd_publication_date)
133 ) docs'
134 -}
135 --_ <- flowDataText user (DataNew [docs']) (Multi l) cId Nothing logStatus
136 let mCorpus = Nothing :: Maybe HyperdataCorpus
137 ids <- insertMasterDocs mCorpus (Multi l) docs'
138 _ <- Doc.add cId ids
139 (_masterUserId, _masterRootId, masterCorpusId)
140 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") mCorpus
141 let gp = GroupWithPosTag l server HashMap.empty
142 -- gp = case l of
143 -- FR -> GroupWithPosTag l Spacy HashMap.empty
144 -- _ -> GroupWithPosTag l CoreNLP HashMap.empty
145 ngs <- buildNgramsLists user cId masterCorpusId Nothing gp
146 _userListId <- flowList_DbRepo listId ngs
147
148 pure ()
149
150 -- TODO Make an async task out of this?
151 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m, MonadJobStatus m)
152 => User
153 -> CorpusId
154 -> API.Query
155 -> Lang
156 -> JobHandle m
157 -> m ()
158 triggerSearxSearch user cId q l jobHandle = do
159 let numPages = 100
160 markStarted numPages jobHandle
161
162 -- printDebug "[triggerSearxSearch] cId" cId
163 -- printDebug "[triggerSearxSearch] q" q
164 -- printDebug "[triggerSearxSearch] l" l
165 cfg <- view hasConfig
166 uId <- getUserId user
167 let surl = _gc_frame_searx_url cfg
168 -- printDebug "[triggerSearxSearch] surl" surl
169 mListId <- defaultListMaybe cId
170 listId <- case mListId of
171 Nothing -> do
172 listId <- getOrMkList cId uId
173 pure listId
174 Just listId -> pure listId
175
176 -- printDebug "[triggerSearxSearch] listId" listId
177
178 manager <- liftBase $ newManager tlsManagerSettings
179 _ <- mapM (\page -> do
180 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
181 , _fsp_manager = manager
182 , _fsp_pageno = page
183 , _fsp_query = q
184 , _fsp_url = surl }
185
186 insertSearxResponse user cId listId l res
187 markProgress page jobHandle
188
189 ) [1..numPages]
190 --printDebug "[triggerSearxSearch] res" res
191 markComplete jobHandle
192
193 hyperdataDocumentFromSearxResult :: Lang -> SearxResult -> Either T.Text HyperdataDocument
194 hyperdataDocumentFromSearxResult l (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
195 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
196 let mGregorian = toGregorian <$> mDate
197 Right HyperdataDocument { _hd_bdd = Just "Searx"
198 , _hd_doi = Nothing
199 , _hd_url = Nothing
200 , _hd_uniqId = Nothing
201 , _hd_uniqIdBdd = Nothing
202 , _hd_page = Nothing
203 , _hd_title = Just _sr_title
204 , _hd_authors = Nothing
205 , _hd_institutes = Nothing
206 , _hd_source = Just _sr_engine
207 , _hd_abstract = _sr_content
208 , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
209 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
210 , _hd_publication_month = sel2 <$> mGregorian
211 , _hd_publication_day = sel3 <$> mGregorian
212 , _hd_publication_hour = Nothing
213 , _hd_publication_minute = Nothing
214 , _hd_publication_second = Nothing
215 , _hd_language_iso2 = Just $ T.pack $ show l }