]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/Searx.hs
Merge remote-tracking branch 'origin/506-dev-search-in-tree' into dev
[gargantext.git] / src / Gargantext / API / Node / Corpus / Searx.hs
1 {-# LANGUAGE TemplateHaskell #-}
2
3 module Gargantext.API.Node.Corpus.Searx where
4
5 import Control.Lens (view)
6 import Data.Aeson.TH (deriveJSON)
7 import Data.Either (Either(..))
8 import Data.Time.Calendar (Day, toGregorian)
9 import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
10 import Data.Tuple.Select (sel1, sel2, sel3)
11 import GHC.Generics (Generic)
12 import Gargantext.Core (Lang(..))
13 import Gargantext.Core.NLP (nlpServerGet)
14 import Gargantext.Core.Text.List (buildNgramsLists)
15 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
16 import Gargantext.Core.Text.Terms (TermType(..))
17 import Gargantext.Core.Types.Individu (User(..))
18 import Gargantext.Core.Utils.Prefix (unPrefix)
19 import Gargantext.Database.Action.Flow (insertMasterDocs) --, DataText(..))
20 import Gargantext.Database.Action.Flow.List (flowList_DbRepo)
21 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
22 import Gargantext.Database.Action.User (getUserId)
23 import Gargantext.Database.Admin.Config (userMaster)
24 import Gargantext.Database.Query.Table.Node (insertDefaultNodeIfNotExists)
25 import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
26 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
27 import Gargantext.Database.Admin.Types.Node (CorpusId, ListId, NodeType(NodeTexts))
28 import Gargantext.Database.Prelude (hasConfig)
29 import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
30 import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
31 import Gargantext.Prelude
32 import Gargantext.Prelude.Config
33 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
34 import Network.HTTP.Client
35 import Network.HTTP.Client.TLS
36 import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
37 import qualified Data.Aeson as Aeson
38 import qualified Data.HashMap.Strict as HashMap
39 import qualified Data.Text as T
40 import qualified Data.Text as Text
41 import qualified Gargantext.Core.Text.Corpus.API as API
42 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
43 import qualified Prelude
44
45 langToSearx :: Lang -> Text
46 langToSearx All = "en-US"
47 langToSearx x = (Text.toLower acronym) <> "-" <> acronym
48 where
49 acronym = (cs $ show x)
50
51 data SearxResult = SearxResult
52 { _sr_url :: Text
53 , _sr_title :: Text
54 , _sr_content :: Maybe Text
55 , _sr_engine :: Text
56 , _sr_score :: Double
57 , _sr_category :: Text
58 , _sr_pretty_url :: Text
59 , _sr_publishedDate :: Text -- "Nov 19, 2021"
60 , _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
61 }
62 deriving (Show, Eq, Generic)
63 -- , _sr_parsed_url
64 -- , _sr_engines
65 -- , _sr_positions
66
67 $(deriveJSON (unPrefix "_sr_") ''SearxResult)
68
69 data SearxResponse = SearxResponse
70 { _srs_query :: Text
71 , _srs_number_of_results :: Int
72 , _srs_results :: [SearxResult] }
73 deriving (Show, Eq, Generic)
74 -- , _srs_answers
75 -- , _srs_corrections
76 -- , _srs_infoboxes
77 -- , _srs_suggestions :: [Text]
78 -- , _srs_unresponsive_engines :: [Text] }
79
80 $(deriveJSON (unPrefix "_srs_") ''SearxResponse)
81
82 data FetchSearxParams = FetchSearxParams
83 { _fsp_language :: Lang
84 , _fsp_manager :: Manager
85 , _fsp_pageno :: Int
86 , _fsp_query :: Text
87 , _fsp_url :: Text
88 }
89
90 fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
91 fetchSearxPage (FetchSearxParams { _fsp_language
92 , _fsp_manager
93 , _fsp_pageno
94 , _fsp_query
95 , _fsp_url }) = do
96 -- searx search API:
97 -- https://searx.github.io/searx/dev/search_api.html?highlight=json
98 req <- parseRequest $ T.unpack _fsp_url
99 let request = urlEncodedBody
100 [ --("category_general", "1")
101 ("q", encodeUtf8 _fsp_query)
102 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
103 , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
104 --, ("time_range", "None")
105 , ("language", encodeUtf8 $ langToSearx _fsp_language)
106 , ("format", "json")
107 ] req
108 res <- httpLbs request _fsp_manager
109 let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
110 pure dec
111
112 insertSearxResponse :: (MonadBase IO m, FlowCmdM env err m)
113 => User
114 -> CorpusId
115 -> ListId
116 -> Lang
117 -> Either Prelude.String SearxResponse
118 -> m ()
119 insertSearxResponse _ _ _ _ (Left _) = pure ()
120 insertSearxResponse user cId listId l (Right (SearxResponse { _srs_results })) = do
121 server <- view (nlpServerGet l)
122 -- docs :: [Either Text HyperdataDocument]
123 let docs = hyperdataDocumentFromSearxResult l <$> _srs_results
124 --printDebug "[triggerSearxSearch] docs" docs
125 let docs' = catMaybes $ rightToMaybe <$> docs
126 {-
127 Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
128 printDebug "[triggerSearxSearch] doc time" $
129 "[title] " <> (show _hd_title) <>
130 " :: [publication_year] " <> (show _hd_publication_year) <>
131 " :: [publication_date] " <> (show _hd_publication_date)
132 ) docs'
133 -}
134 --_ <- flowDataText user (DataNew [docs']) (Multi l) cId Nothing logStatus
135 let mCorpus = Nothing :: Maybe HyperdataCorpus
136 ids <- insertMasterDocs mCorpus (Multi l) docs'
137 _ <- Doc.add cId ids
138 (_masterUserId, _masterRootId, masterCorpusId)
139 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") mCorpus
140 let gp = GroupWithPosTag l server HashMap.empty
141 -- gp = case l of
142 -- FR -> GroupWithPosTag l Spacy HashMap.empty
143 -- _ -> GroupWithPosTag l CoreNLP HashMap.empty
144 ngs <- buildNgramsLists user cId masterCorpusId Nothing gp
145 _userListId <- flowList_DbRepo listId ngs
146
147 pure ()
148
149 -- TODO Make an async task out of this?
150 triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m, MonadJobStatus m)
151 => User
152 -> CorpusId
153 -> API.Query
154 -> Lang
155 -> JobHandle m
156 -> m ()
157 triggerSearxSearch user cId q l jobHandle = do
158 userId <- getUserId user
159
160 _tId <- insertDefaultNodeIfNotExists NodeTexts cId userId
161
162 let numPages = 100
163 markStarted numPages jobHandle
164
165 -- printDebug "[triggerSearxSearch] cId" cId
166 -- printDebug "[triggerSearxSearch] q" q
167 -- printDebug "[triggerSearxSearch] l" l
168 cfg <- view hasConfig
169 uId <- getUserId user
170 let surl = _gc_frame_searx_url cfg
171 -- printDebug "[triggerSearxSearch] surl" surl
172 mListId <- defaultListMaybe cId
173 listId <- case mListId of
174 Nothing -> do
175 listId <- getOrMkList cId uId
176 pure listId
177 Just listId -> pure listId
178
179 -- printDebug "[triggerSearxSearch] listId" listId
180
181 manager <- liftBase $ newManager tlsManagerSettings
182 _ <- mapM (\page -> do
183 res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
184 , _fsp_manager = manager
185 , _fsp_pageno = page
186 , _fsp_query = q
187 , _fsp_url = surl }
188
189 insertSearxResponse user cId listId l res
190 markProgress page jobHandle
191
192 ) [1..numPages]
193 --printDebug "[triggerSearxSearch] res" res
194 markComplete jobHandle
195
196 hyperdataDocumentFromSearxResult :: Lang -> SearxResult -> Either T.Text HyperdataDocument
197 hyperdataDocumentFromSearxResult l (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
198 let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
199 let mGregorian = toGregorian <$> mDate
200 Right HyperdataDocument { _hd_bdd = Just "Searx"
201 , _hd_doi = Nothing
202 , _hd_url = Nothing
203 , _hd_uniqId = Nothing
204 , _hd_uniqIdBdd = Nothing
205 , _hd_page = Nothing
206 , _hd_title = Just _sr_title
207 , _hd_authors = Nothing
208 , _hd_institutes = Nothing
209 , _hd_source = Just _sr_engine
210 , _hd_abstract = _sr_content
211 , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
212 , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
213 , _hd_publication_month = sel2 <$> mGregorian
214 , _hd_publication_day = sel3 <$> mGregorian
215 , _hd_publication_hour = Nothing
216 , _hd_publication_minute = Nothing
217 , _hd_publication_second = Nothing
218 , _hd_language_iso2 = Just $ T.pack $ show l }