{-|
Module      : Gargantext.API.Node.Corpus.New
Description : New corpus API
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

New corpus means either:
- new corpus
- new data in existing corpus
-}

{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators   #-}

module Gargantext.API.Node.Corpus.New
      where


import Conduit
import Control.Lens hiding (elements, Empty)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import qualified Data.ByteString.Base64 as BSB64
import Data.Conduit.Internal (zipSources)
import Data.Either
import Data.Maybe (fromMaybe)
import Data.Swagger
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics (Generic)
import Servant
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
import qualified Data.Text.Encoding as TE
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary

import Gargantext.Prelude

import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal)
import Gargantext.API.Node.Corpus.New.Types
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Mail (sendMail)
import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Gargantext.Database.GargDB as GargDB
import Gargantext.Prelude.Config (gc_max_docs_parsers)
------------------------------------------------------------------------
{-
data Query = Query { query_query     :: Text
                   , query_node_id   :: Int
                   , query_lang      :: Lang
                   , query_databases :: [DataOrigin]
                   }
  deriving (Eq, Generic)

deriveJSON (unPrefix "query_") 'Query

instance Arbitrary Query where
  arbitrary = elements [ Query q n la fs
                       | q  <- ["honeybee* AND collapse"
                               ,"covid 19"
                               ]
                       , n  <- [0..10]
                       , la <- allLangs
                       , fs <- take 3 $ repeat allDataOrigins
                       ]

instance ToSchema Query where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
-}

------------------------------------------------------------------------

{-
type Api = PostApi
      :<|> GetApi

type PostApi = Summary "New Corpus endpoint"
             :> ReqBody '[JSON] Query
             :> Post '[JSON] CorpusId
type GetApi = Get '[JSON] ApiInfo
-}

-- | TODO manage several apis
-- TODO-ACCESS
-- TODO this is only the POST
{-
api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
api uid (Query q _ as) = do
  cId <- case head as of
    Nothing      -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just a       -> do
      docs <- liftBase $ API.get a q (Just 1000)
      cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
      pure cId'

  pure cId
-}

------------------------------------------------
-- TODO use this route for Client implementation
data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
  deriving (Generic)
instance Arbitrary ApiInfo where
  arbitrary = ApiInfo <$> arbitrary

deriveJSON (unPrefix "") 'ApiInfo

instance ToSchema ApiInfo

info :: FlowCmdM env err m => UserId -> m ApiInfo
info _u = pure $ ApiInfo API.externalAPIs
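
-- Illustrative note (a sketch, not derived from the upstream sources): since
-- the JSON instances come from 'deriveJSON (unPrefix "")', the field name is
-- kept as-is, so a response from 'info' should serialise roughly as
--   { "api_info": [ ... ] }
-- where the list is whatever 'API.externalAPIs' currently exposes.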

------------------------------------------------------------------------
------------------------------------------------------------------------
data WithQuery = WithQuery
  { _wq_query        :: !Text
  , _wq_databases    :: !Database
  , _wq_datafield    :: !(Maybe Datafield)
  , _wq_lang         :: !Lang
  , _wq_node_id      :: !Int
  , _wq_flowListWith :: !FlowSocialListWith
  }
  deriving Generic

makeLenses ''WithQuery
instance FromJSON WithQuery where
  parseJSON = genericParseJSON $ jsonOptions "_wq_"
instance ToJSON WithQuery where
  toJSON = genericToJSON $ jsonOptions "_wq_"
instance ToSchema WithQuery where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
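
-- Illustrative payload (a sketch, assuming 'jsonOptions "_wq_"' strips the
-- field prefix just as 'unPrefixSwagger "_wq_"' does for the schema): a client
-- would POST a body shaped roughly like
--   { "query"        : "covid 19"
--   , "databases"    : "PubMed"
--   , "datafield"    : "External"
--   , "lang"         : "EN"
--   , "node_id"      : 42
--   , "flowListWith" : { "type": "MyListsFirst" }
--   }
-- The concrete encodings of Database, Datafield and FlowSocialListWith live in
-- their own modules; the values above are only examples.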

------------------------------------------------------------------------

type AddWithQuery = Summary "Add with Query to corpus endpoint"
                 :> "corpus"
                 :> Capture "corpus_id" CorpusId
                 :> "query"
                 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
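
-- Orientation note (sketch): this declares an asynchronous job attached to the
-- corpus/<corpus_id>/query path, taking a WithQuery JSON body and reporting
-- progress as JobLog values; the exact verbs and sub-routes are supplied by
-- the AsyncJobs machinery imported from Gargantext.API.Admin.Orchestrator.Types.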

{-
type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
                :> "corpus"
                :> Capture "corpus_id" CorpusId
                :> "add"
                :> "file"
                :> MultipartForm Mem (MultipartData Mem)
                :> QueryParam "fileType" FileType
                :> "async"
                :> AsyncJobs JobLog '[JSON] () JobLog
-}


------------------------------------------------------------------------
-- TODO WithQuery also has a corpus id
addToCorpusWithQuery :: FlowCmdM env err m
                     => User
                     -> CorpusId
                     -> WithQuery
                     -> Maybe Integer
                     -> (JobLog -> m ())
                     -> m JobLog
addToCorpusWithQuery user cid (WithQuery { _wq_query = q
                                         , _wq_databases = dbs
                                         , _wq_datafield = datafield
                                         , _wq_lang = l
                                         , _wq_flowListWith = flw }) maybeLimit logStatus = do
  -- TODO ...
  logStatus JobLog { _scst_succeeded = Just 0
                   , _scst_failed    = Just 0
                   , _scst_remaining = Just 3
                   , _scst_events    = Just []
                   }
  printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
  printDebug "[addToCorpusWithQuery] datafield" datafield
  printDebug "[addToCorpusWithQuery] flowListWith" flw

  case datafield of
    Just Web -> do
      printDebug "[addToCorpusWithQuery] processing web request" datafield

      _ <- triggerSearxSearch user cid q l logStatus

      pure JobLog { _scst_succeeded = Just 3
                  , _scst_failed    = Just 0
                  , _scst_remaining = Just 0
                  , _scst_events    = Just []
                  }

    _ -> do
      -- TODO add cid
      -- TODO if cid is folder -> create Corpus
      --      if cid is corpus -> add to corpus
      --      if cid is root   -> create corpus in Private
      eTxts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
      let lTxts = lefts eTxts
      case lTxts of
        [] -> do
          let txts = rights eTxts
          -- TODO Sum lengths of each txt element
          logStatus $ JobLog { _scst_succeeded = Just 2
                             , _scst_failed    = Just 0
                             , _scst_remaining = Just $ 1 + length txts
                             , _scst_events    = Just []
                             }

          cids <- mapM (\txt -> do
                           flowDataText user txt (Multi l) cid Nothing logStatus) txts
          printDebug "corpus id" cids
          printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
          sendMail user
          -- TODO ...
          pure JobLog { _scst_succeeded = Just 3
                      , _scst_failed    = Just 0
                      , _scst_remaining = Just 0
                      , _scst_events    = Just []
                      }

        (err:_) -> do
          pure $ addEvent "ERROR" (T.pack $ show err) $
            JobLog { _scst_succeeded = Just 2
                   , _scst_failed    = Just 1
                   , _scst_remaining = Just 0
                   , _scst_events    = Just []
                   }


type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
                :> "corpus"
                :> Capture "corpus_id" CorpusId
                :> "add"
                :> "form"
                :> "async"
                :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
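
-- Orientation note (sketch, based on how the handler below consumes the body):
-- the NewWithForm payload is matched as (NewWithForm ft ff d l _n), i.e. a
-- parser choice (CSV_HAL, CSV, WOS, PresseRIS), a file format (Plain or ZIP,
-- the latter arriving base64-encoded), the raw payload text, an optional
-- language, and a field that is unused here (_n).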

addToCorpusWithForm :: (FlowCmdM env err m)
                    => User
                    -> CorpusId
                    -> NewWithForm
                    -> (JobLog -> m ())
                    -> JobLog
                    -> m JobLog
addToCorpusWithForm user cid (NewWithForm ft ff d l _n) logStatus jobLog = do
  printDebug "[addToCorpusWithForm] Parsing corpus: " cid
  printDebug "[addToCorpusWithForm] fileType" ft
  printDebug "[addToCorpusWithForm] fileFormat" ff
  logStatus jobLog
  limit' <- view $ hasConfig . gc_max_docs_parsers
  let limit = fromIntegral limit' :: Integer
  let
    parseC = case ft of
      CSV_HAL   -> Parser.parseFormatC Parser.CsvHal
      CSV       -> Parser.parseFormatC Parser.CsvGargV3
      WOS       -> Parser.parseFormatC Parser.WOS
      PresseRIS -> Parser.parseFormatC Parser.RisPresse

  -- TODO granularity of the logStatus
  let data' = case ff of
        Plain -> cs d
        ZIP   -> case BSB64.decode $ TE.encodeUtf8 d of
          Left err      -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
          Right decoded -> decoded
  eDocsC <- liftBase $ parseC ff data'
  case eDocsC of
    Right (mCount, docsC) -> do
      -- TODO Add progress (jobStatus) update for docs - this is a
      -- long action

      let docsC' = zipSources (yieldMany [1..]) docsC
                   .| mapMC (\(idx, doc) ->
                        if idx > limit then do
                          --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
                          let panicMsg' = [ "[addToCorpusWithForm] number of docs "
                                          , "exceeds the MAX_DOCS_PARSERS limit ("
                                          , show limit
                                          , ")" ]
                          let panicMsg = T.concat $ T.pack <$> panicMsg'
                          --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
                          panic panicMsg
                        else
                          pure doc)
                   .| mapC toHyperdataDocument

      --printDebug "Parsing corpus finished : " cid
      --logStatus jobLog2

      --printDebug "Starting extraction : " cid
      -- TODO granularity of the logStatus
      _cid' <- flowCorpus user
                          (Right [cid])
                          (Multi $ fromMaybe EN l)
                          Nothing
                          --(Just $ fromIntegral $ length docs, docsC')
                          (mCount, transPipe liftBase docsC') -- TODO fix number of docs
                          --(map (map toHyperdataDocument) docs)
                          logStatus

      printDebug "Extraction finished : " cid
      printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
      -- TODO uncomment this
      --sendMail user

      logStatus jobLog3
      pure jobLog3
    Left e -> do
      printDebug "[addToCorpusWithForm] parse error" e

      let evt = ScraperEvent { _scev_message = Just $ T.pack e
                             , _scev_level   = Just "ERROR"
                             , _scev_date    = Nothing }

      logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
      pure jobLogE
  where
    jobLog2 = jobLogSuccess jobLog
    jobLog3 = jobLogSuccess jobLog2
    jobLogE = jobLogFailTotal jobLog
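    -- Reading aid (a sketch of the presumed semantics of Gargantext.API.Job):
    -- jobLogSuccess is expected to move one "remaining" step to "succeeded",
    -- so jobLog3 reports two completed steps, while jobLogFailTotal (jobLogE)
    -- marks every remaining step as failed when parsing the upload aborts.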

{-
addToCorpusWithFile :: FlowCmdM env err m
                    => CorpusId
                    -> MultipartData Mem
                    -> Maybe FileType
                    -> (JobLog -> m ())
                    -> m JobLog
addToCorpusWithFile cid input filetype logStatus = do
  logStatus JobLog { _scst_succeeded = Just 10
                   , _scst_failed    = Just 2
                   , _scst_remaining = Just 138
                   , _scst_events    = Just []
                   }
  printDebug "addToCorpusWithFile" cid
  _h <- postUpload cid filetype input

  pure JobLog { _scst_succeeded = Just 137
              , _scst_failed    = Just 13
              , _scst_remaining = Just 0
              , _scst_events    = Just []
              }
-}



type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
                :> "corpus"
                :> Capture "corpus_id" CorpusId
                :> "add"
                :> "file"
                :> "async"
                :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog

addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
                    => User
                    -> CorpusId
                    -> NewWithFile
                    -> (JobLog -> m ())
                    -> m JobLog
addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do

  printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
  logStatus JobLog { _scst_succeeded = Just 0
                   , _scst_failed    = Just 0
                   , _scst_remaining = Just 1
                   , _scst_events    = Just []
                   }

  fPath <- GargDB.writeFile nwf
  printDebug "[addToCorpusWithFile] File saved as: " fPath

  uId <- getUserId user
  nIds <- mkNodeWithParent NodeFile (Just cid) uId fName

  _ <- case nIds of
    [nId] -> do
      node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
      let hl = node ^. node_hyperdata
      _ <- updateHyperdata nId $ hl { _hff_name = fName
                                    , _hff_path = T.pack fPath }

      printDebug "[addToCorpusWithFile] Created node with id: " nId
    _ -> pure ()

  printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid

  printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
  sendMail user

  pure $ JobLog { _scst_succeeded = Just 1
                , _scst_failed    = Just 0
                , _scst_remaining = Just 0
                , _scst_events    = Just []
                }