]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/Searx.hs
Merge remote-tracking branch 'origin/513-dev-pin-tree' into dev-merge
[gargantext.git] / src / Gargantext / API / Node / Corpus / Searx.hs
1 {-# LANGUAGE TemplateHaskell #-}
2
3 module Gargantext.API.Node.Corpus.Searx where
4
5
6
7 import Control.Lens (view)
8 import qualified Data.Aeson as Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import Data.Either (Either(..))
11 import qualified Data.HashMap.Strict as HashMap
12 import qualified Data.Text as T
13 import Data.Time.Calendar (Day, toGregorian)
14 import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
15 import Data.Tuple.Select (sel1, sel2, sel3)
16 import GHC.Generics (Generic)
17 import Network.HTTP.Client
18 import Network.HTTP.Client.TLS
19
20 import qualified Prelude
21 import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
22 import Gargantext.Prelude
23 import Gargantext.Prelude.Config
24
25 import Gargantext.Core (Lang(..))
26 import Gargantext.Core.NLP (nlpServerGet)
27 import qualified Gargantext.Core.Text.Corpus.API as API
28 import Gargantext.Core.Text.List (buildNgramsLists)
29 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
30 import Gargantext.Core.Text.Terms (TermType(..))
31 import Gargantext.Core.Types.Individu (User(..))
32 import Gargantext.Core.Utils.Prefix (unPrefix)
33 import Gargantext.Database.Action.Flow (insertMasterDocs) --, DataText(..))
34 import Gargantext.Database.Action.Flow.List (flowList_DbRepo)
35 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
36 import Gargantext.Database.Action.User (getUserId)
37 import Gargantext.Database.Admin.Config (userMaster)
38 import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
39 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
40 import Gargantext.Database.Admin.Types.Node (CorpusId, ListId)
41 import Gargantext.Database.Prelude (hasConfig)
42 import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
43 import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
44 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
45 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
46
47 langToSearx :: Lang -> Text
48 langToSearx EN = "en-US"
49 langToSearx FR = "fr-FR"
50 langToSearx DE = "de-FR"
51 langToSearx ES = "es-FR"
52 langToSearx IT = "it-FR"
53 langToSearx PL = "pl-FR"
54 langToSearx CN = "cn-FR"
55 langToSearx All = "en-US"
56
57 data SearxResult = SearxResult
58 { _sr_url :: Text
59 , _sr_title :: Text
60 , _sr_content :: Maybe Text
61 , _sr_engine :: Text
62 , _sr_score :: Double
63 , _sr_category :: Text
64 , _sr_pretty_url :: Text
65 , _sr_publishedDate :: Text -- "Nov 19, 2021"
66 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
67 }
68 deriving (Show, Eq, Generic)
69 -- , _sr_parsed_url
70 -- , _sr_engines
71 -- , _sr_positions
72
73 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
74
75 data SearxResponse = SearxResponse
76 { _srs_query :: Text
77 , _srs_number_of_results :: Int
78 , _srs_results :: [SearxResult] }
79 deriving (Show, Eq, Generic)
80 -- , _srs_answers
81 -- , _srs_corrections
82 -- , _srs_infoboxes
83 -- , _srs_suggestions :: [Text]
84 -- , _srs_unresponsive_engines :: [Text] }
85
86 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
87
88 data FetchSearxParams = FetchSearxParams
89 { _fsp_language :: Lang
90 , _fsp_manager :: Manager
91 , _fsp_pageno :: Int
92 , _fsp_query :: Text
93 , _fsp_url :: Text
94 }
95
96 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
97 fetchSearxPage (FetchSearxParams { _fsp_language
98 , _fsp_manager
99 , _fsp_pageno
100 , _fsp_query
101 , _fsp_url }) = do
102 -- searx search API:
103 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
104 req <- parseRequest $ T.unpack _fsp_url
105 let request = urlEncodedBody
106 [ --("category_general", "1")
107 ("q", encodeUtf8 _fsp_query)
108 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
109 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
110 --, ("time_range", "None")
111 , ("language", encodeUtf8 $ langToSearx _fsp_language)
112 , ("format", "json")
113 ] req
114 res <- httpLbs request _fsp_manager
115 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
116 pure dec
117
118 insertSearxResponse :: (MonadBase IO m, FlowCmdM env err m)
119 => User
120 -> CorpusId
121 -> ListId
122 -> Lang
123 -> Either Prelude.String SearxResponse
124 -> m ()
125 insertSearxResponse _ _ _ _ (Left _) = pure ()
126 insertSearxResponse user cId listId l (Right (SearxResponse { _srs_results })) = do
127 server <- view (nlpServerGet l)
128 -- docs :: [Either Text HyperdataDocument]
129 let docs = hyperdataDocumentFromSearxResult l <$> _srs_results
130 --printDebug "[triggerSearxSearch] docs" docs
131 let docs' = catMaybes $ rightToMaybe <$> docs
132 {-
133 Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
134 printDebug "[triggerSearxSearch] doc time" $
135 "[title] " <> (show _hd_title) <>
136 " :: [publication_year] " <> (show _hd_publication_year) <>
137 " :: [publication_date] " <> (show _hd_publication_date)
138 ) docs'
139 -}
140 --_ <- flowDataText user (DataNew [docs']) (Multi l) cId Nothing logStatus
141 let mCorpus = Nothing :: Maybe HyperdataCorpus
142 ids <- insertMasterDocs mCorpus (Multi l) docs'
143 _ <- Doc.add cId ids
144 (_masterUserId, _masterRootId, masterCorpusId)
145 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") mCorpus
146 let gp = GroupWithPosTag l server HashMap.empty
147 -- gp = case l of
148 -- FR -> GroupWithPosTag l Spacy HashMap.empty
149 -- _ -> GroupWithPosTag l CoreNLP HashMap.empty
150 ngs <- buildNgramsLists user cId masterCorpusId Nothing gp
151 _userListId <- flowList_DbRepo listId ngs
152
153 pure ()
154
155 -- TODO Make an async task out of this?
156 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m, MonadJobStatus m)
157 => User
158 -> CorpusId
159 -> API.Query
160 -> Lang
161 -> JobHandle m
162 -> m ()
163 triggerSearxSearch user cId q l jobHandle = do
164 let numPages = 100
165 markStarted numPages jobHandle
166
167 -- printDebug "[triggerSearxSearch] cId" cId
168 -- printDebug "[triggerSearxSearch] q" q
169 -- printDebug "[triggerSearxSearch] l" l
170 cfg <- view hasConfig
171 uId <- getUserId user
172 let surl = _gc_frame_searx_url cfg
173 -- printDebug "[triggerSearxSearch] surl" surl
174 mListId <- defaultListMaybe cId
175 listId <- case mListId of
176 Nothing -> do
177 listId <- getOrMkList cId uId
178 pure listId
179 Just listId -> pure listId
180
181 -- printDebug "[triggerSearxSearch] listId" listId
182
183 manager <- liftBase $ newManager tlsManagerSettings
184 _ <- mapM (\page -> do
185 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
186 , _fsp_manager = manager
187 , _fsp_pageno = page
188 , _fsp_query = q
189 , _fsp_url = surl }
190
191 insertSearxResponse user cId listId l res
192 markProgress page jobHandle
193
194 ) [1..numPages]
195 --printDebug "[triggerSearxSearch] res" res
196 markComplete jobHandle
197
198 hyperdataDocumentFromSearxResult :: Lang -> SearxResult -> Either T.Text HyperdataDocument
199 hyperdataDocumentFromSearxResult l (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
200 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
201 let mGregorian = toGregorian <$> mDate
202 Right HyperdataDocument { _hd_bdd = Just "Searx"
203 , _hd_doi = Nothing
204 , _hd_url = Nothing
205 , _hd_uniqId = Nothing
206 , _hd_uniqIdBdd = Nothing
207 , _hd_page = Nothing
208 , _hd_title = Just _sr_title
209 , _hd_authors = Nothing
210 , _hd_institutes = Nothing
211 , _hd_source = Just _sr_engine
212 , _hd_abstract = _sr_content
213 , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
214 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
215 , _hd_publication_month = sel2 <$> mGregorian
216 , _hd_publication_day = sel3 <$> mGregorian
217 , _hd_publication_hour = Nothing
218 , _hd_publication_minute = Nothing
219 , _hd_publication_second = Nothing
220 , _hd_language_iso2 = Just $ T.pack $ show l }