]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[searx] some more searx parsing work
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Control.Lens hiding (elements, Empty)
22 import Data.Aeson
23 import Data.Aeson.TH (deriveJSON)
24 import qualified Data.ByteString.Base64 as BSB64
25 import Data.Either
26 import Data.Maybe (fromMaybe)
27 import Data.Swagger
28 import Data.Text (Text)
29 import qualified Data.Text as T
30 import GHC.Generics (Generic)
31 import Servant
32 import Servant.Job.Utils (jsonOptions)
33 -- import Servant.Multipart
34 import qualified Data.Text.Encoding as TE
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
37
38 import Gargantext.Prelude
39
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
43 import Gargantext.API.Node.Corpus.New.File
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import qualified Gargantext.Core.Text.Corpus.API as API
50 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
51 import Gargantext.Core.Types.Individu (User(..))
52 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
53 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
54 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
55 import Gargantext.Database.Action.Mail (sendMail)
56 import Gargantext.Database.Action.Node (mkNodeWithParent)
57 import Gargantext.Database.Action.User (getUserId)
58 import Gargantext.Database.Admin.Types.Hyperdata
59 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
60 import Gargantext.Database.Prelude (hasConfig)
61 import Gargantext.Database.Query.Table.Node (getNodeWith)
62 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
63 import Gargantext.Database.Schema.Node (node_hyperdata)
64 import qualified Gargantext.Database.GargDB as GargDB
65 import Gargantext.Prelude.Config (gc_max_docs_parsers)
66 ------------------------------------------------------------------------
67 {-
68 data Query = Query { query_query :: Text
69 , query_node_id :: Int
70 , query_lang :: Lang
71 , query_databases :: [DataOrigin]
72 }
73 deriving (Eq, Generic)
74
75 deriveJSON (unPrefix "query_") 'Query
76
77 instance Arbitrary Query where
78 arbitrary = elements [ Query q n la fs
79 | q <- ["honeybee* AND collapse"
80 ,"covid 19"
81 ]
82 , n <- [0..10]
83 , la <- allLangs
84 , fs <- take 3 $ repeat allDataOrigins
85 ]
86
87 instance ToSchema Query where
88 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
89 -}
90
91 ------------------------------------------------------------------------
92
93 {-
94 type Api = PostApi
95 :<|> GetApi
96
97 type PostApi = Summary "New Corpus endpoint"
98 :> ReqBody '[JSON] Query
99 :> Post '[JSON] CorpusId
100 type GetApi = Get '[JSON] ApiInfo
101 -}
102
103 -- | TODO manage several apis
104 -- TODO-ACCESS
105 -- TODO this is only the POST
106 {-
107 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
108 api uid (Query q _ as) = do
109 cId <- case head as of
110 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
111 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
112 Just a -> do
113 docs <- liftBase $ API.get a q (Just 1000)
114 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
115 pure cId'
116
117 pure cId
118 -}
119
120 ------------------------------------------------
121 -- TODO use this route for Client implementation
122 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
123 deriving (Generic)
124 instance Arbitrary ApiInfo where
125 arbitrary = ApiInfo <$> arbitrary
126
127 deriveJSON (unPrefix "") 'ApiInfo
128
129 instance ToSchema ApiInfo
130
131 info :: FlowCmdM env err m => UserId -> m ApiInfo
132 info _u = pure $ ApiInfo API.externalAPIs
133
134 ------------------------------------------------------------------------
135 ------------------------------------------------------------------------
136 data WithQuery = WithQuery
137 { _wq_query :: !Text
138 , _wq_databases :: !Database
139 , _wq_datafield :: !(Maybe Datafield)
140 , _wq_lang :: !Lang
141 , _wq_node_id :: !Int
142 , _wq_flowListWith :: !FlowSocialListWith
143 }
144 deriving Generic
145
146 makeLenses ''WithQuery
147 instance FromJSON WithQuery where
148 parseJSON = genericParseJSON $ jsonOptions "_wq_"
149 instance ToJSON WithQuery where
150 toJSON = genericToJSON $ jsonOptions "_wq_"
151 instance ToSchema WithQuery where
152 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
153
154 ------------------------------------------------------------------------
155
156 type AddWithQuery = Summary "Add with Query to corpus endpoint"
157 :> "corpus"
158 :> Capture "corpus_id" CorpusId
159 :> "query"
160 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
161
162 {-
163 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
164 :> "corpus"
165 :> Capture "corpus_id" CorpusId
166 :> "add"
167 :> "file"
168 :> MultipartForm Mem (MultipartData Mem)
169 :> QueryParam "fileType" FileType
170 :> "async"
171 :> AsyncJobs JobLog '[JSON] () JobLog
172 -}
173
174
175 ------------------------------------------------------------------------
176 -- TODO WithQuery also has a corpus id
177 addToCorpusWithQuery :: FlowCmdM env err m
178 => User
179 -> CorpusId
180 -> WithQuery
181 -> Maybe Integer
182 -> (JobLog -> m ())
183 -> m JobLog
184 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
185 , _wq_databases = dbs
186 , _wq_datafield = datafield
187 , _wq_lang = l
188 , _wq_flowListWith = flw }) maybeLimit logStatus = do
189 -- TODO ...
190 logStatus JobLog { _scst_succeeded = Just 0
191 , _scst_failed = Just 0
192 , _scst_remaining = Just 3
193 , _scst_events = Just []
194 }
195 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
196 printDebug "[addToCorpusWithQuery] datafield" datafield
197 printDebug "[addToCorpusWithQuery] flowListWith" flw
198
199 case datafield of
200 Just Web -> do
201 printDebug "[addToCorpusWithQuery] processing web request" datafield
202
203 _ <- triggerSearxSearch user cid q l logStatus
204
205 pure JobLog { _scst_succeeded = Just 3
206 , _scst_failed = Just 0
207 , _scst_remaining = Just 0
208 , _scst_events = Just []
209 }
210
211 _ -> do
212 -- TODO add cid
213 -- TODO if cid is folder -> create Corpus
214 -- if cid is corpus -> add to corpus
215 -- if cid is root -> create corpus in Private
216 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
217
218 logStatus JobLog { _scst_succeeded = Just 2
219 , _scst_failed = Just 0
220 , _scst_remaining = Just $ 1 + length txts
221 , _scst_events = Just []
222 }
223
224 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing logStatus) txts
225 printDebug "corpus id" cids
226 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
227 sendMail user
228 -- TODO ...
229 pure JobLog { _scst_succeeded = Just 3
230 , _scst_failed = Just 0
231 , _scst_remaining = Just 0
232 , _scst_events = Just []
233 }
234
235
236 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
237 :> "corpus"
238 :> Capture "corpus_id" CorpusId
239 :> "add"
240 :> "form"
241 :> "async"
242 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
243
244 addToCorpusWithForm :: (FlowCmdM env err m)
245 => User
246 -> CorpusId
247 -> NewWithForm
248 -> (JobLog -> m ())
249 -> JobLog
250 -> m JobLog
251 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
252 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
253 printDebug "[addToCorpusWithForm] fileType" ft
254 logStatus jobLog
255 let
256 parse = case ft of
257 CSV_HAL -> Parser.parseFormat Parser.CsvHal
258 CSV -> Parser.parseFormat Parser.CsvGargV3
259 WOS -> Parser.parseFormat Parser.WOS
260 PresseRIS -> Parser.parseFormat Parser.RisPresse
261 ZIP -> Parser.parseFormat Parser.ZIP
262
263 -- TODO granularity of the logStatus
264 let data' = case ft of
265 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
266 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
267 Right decoded -> decoded
268 _ -> cs d
269 eDocs <- liftBase $ parse data'
270 case eDocs of
271 Right docs' -> do
272 -- TODO Add progress (jobStatus) update for docs - this is a
273 -- long action
274 limit' <- view $ hasConfig . gc_max_docs_parsers
275 let limit = fromIntegral limit'
276 if length docs' > limit then do
277 printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs')
278 let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
279 , show $ length docs'
280 , ") exceeds the MAX_DOCS_PARSERS limit ("
281 , show limit
282 , ")" ]
283 let panicMsg = T.concat $ T.pack <$> panicMsg'
284 logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
285 panic panicMsg
286 else
287 pure ()
288 let docs = splitEvery 500 $ take limit docs'
289
290 printDebug "Parsing corpus finished : " cid
291 logStatus jobLog2
292
293 printDebug "Starting extraction : " cid
294 -- TODO granularity of the logStatus
295 _cid' <- flowCorpus user
296 (Right [cid])
297 (Multi $ fromMaybe EN l)
298 Nothing
299 (map (map toHyperdataDocument) docs)
300 logStatus
301
302 printDebug "Extraction finished : " cid
303 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
304 sendMail user
305
306 logStatus jobLog3
307 pure $ jobLog3
308 Left e -> do
309 printDebug "[addToCorpusWithForm] parse error" e
310
311 let evt = ScraperEvent { _scev_message = Just $ T.pack e
312 , _scev_level = Just "ERROR"
313 , _scev_date = Nothing }
314
315 logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
316 pure jobLogE
317 where
318 jobLog2 = jobLogSuccess jobLog
319 jobLog3 = jobLogSuccess jobLog2
320 jobLogE = jobLogFailTotal jobLog
321
322 {-
323 addToCorpusWithFile :: FlowCmdM env err m
324 => CorpusId
325 -> MultipartData Mem
326 -> Maybe FileType
327 -> (JobLog -> m ())
328 -> m JobLog
329 addToCorpusWithFile cid input filetype logStatus = do
330 logStatus JobLog { _scst_succeeded = Just 10
331 , _scst_failed = Just 2
332 , _scst_remaining = Just 138
333 , _scst_events = Just []
334 }
335 printDebug "addToCorpusWithFile" cid
336 _h <- postUpload cid filetype input
337
338 pure JobLog { _scst_succeeded = Just 137
339 , _scst_failed = Just 13
340 , _scst_remaining = Just 0
341 , _scst_events = Just []
342 }
343 -}
344
345
346
347 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
348 :> "corpus"
349 :> Capture "corpus_id" CorpusId
350 :> "add"
351 :> "file"
352 :> "async"
353 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
354
355 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
356 => User
357 -> CorpusId
358 -> NewWithFile
359 -> (JobLog -> m ())
360 -> m JobLog
361 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
362
363 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
364 logStatus JobLog { _scst_succeeded = Just 0
365 , _scst_failed = Just 0
366 , _scst_remaining = Just 1
367 , _scst_events = Just []
368 }
369
370 fPath <- GargDB.writeFile nwf
371 printDebug "[addToCorpusWithFile] File saved as: " fPath
372
373 uId <- getUserId user
374 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
375
376 _ <- case nIds of
377 [nId] -> do
378 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
379 let hl = node ^. node_hyperdata
380 _ <- updateHyperdata nId $ hl { _hff_name = fName
381 , _hff_path = T.pack fPath }
382
383 printDebug "[addToCorpusWithFile] Created node with id: " nId
384 _ -> pure ()
385
386 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
387
388 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
389 sendMail user
390
391 pure $ JobLog { _scst_succeeded = Just 1
392 , _scst_failed = Just 0
393 , _scst_remaining = Just 0
394 , _scst_events = Just []
395 }
396