]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
Simplify ExternalAPIs type
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Conduit
22 import Control.Lens hiding (elements, Empty)
23 import Control.Monad
24 import Data.Aeson
25 import Data.Aeson.TH (deriveJSON)
26 import qualified Data.ByteString.Base64 as BSB64
27 import Data.Conduit.Internal (zipSources)
28 import Data.Either
29 import Data.Maybe (fromMaybe)
30 import Data.Swagger
31 import Data.Text (Text)
32 import qualified Data.Text as T
33 import GHC.Generics (Generic)
34 import Servant
35 import Servant.Job.Utils (jsonOptions)
36 -- import Servant.Multipart
37 import qualified Data.Text.Encoding as TE
38 -- import Test.QuickCheck (elements)
39 import Test.QuickCheck.Arbitrary
40
41 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
42 import Gargantext.API.Admin.Types (HasSettings)
43 import Gargantext.API.Node.Corpus.New.Types
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import Gargantext.Core.Types.Individu (User(..))
50 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
51 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Mail (sendMail)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Action.User (getUserId)
56 import Gargantext.Database.Admin.Types.Hyperdata
57 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..))
58 import Gargantext.Database.Prelude (hasConfig)
59 import Gargantext.Database.Query.Table.Node (getNodeWith, updateCorpusPubmedAPIKey)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import Gargantext.Prelude
63 import Gargantext.Prelude.Config (gc_max_docs_parsers, gc_pubmed_api_key)
64 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
65 import qualified Gargantext.Core.Text.Corpus.API as API
66 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
67 import qualified Gargantext.Database.GargDB as GargDB
68 ------------------------------------------------------------------------
69 {-
70 data Query = Query { query_query :: Text
71 , query_node_id :: Int
72 , query_lang :: Lang
73 , query_databases :: [DataOrigin]
74 }
75 deriving (Eq, Generic)
76
77 deriveJSON (unPrefix "query_") 'Query
78
79 instance Arbitrary Query where
80 arbitrary = elements [ Query q n la fs
81 | q <- ["honeybee* AND collapse"
82 ,"covid 19"
83 ]
84 , n <- [0..10]
85 , la <- allLangs
86 , fs <- take 3 $ repeat allDataOrigins
87 ]
88
89 instance ToSchema Query where
90 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
91 -}
92
93 ------------------------------------------------------------------------
94
95 {-
96 type Api = PostApi
97 :<|> GetApi
98
99 type PostApi = Summary "New Corpus endpoint"
100 :> ReqBody '[JSON] Query
101 :> Post '[JSON] CorpusId
102 type GetApi = Get '[JSON] ApiInfo
103 -}
104
105 -- | TODO manage several apis
106 -- TODO-ACCESS
107 -- TODO this is only the POST
108 {-
109 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
110 api uid (Query q _ as) = do
111 cId <- case head as of
112 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
113 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
114 Just a -> do
115 docs <- liftBase $ API.get a q (Just 1000)
116 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
117 pure cId'
118
119 pure cId
120 -}
121
122 ------------------------------------------------
123 -- TODO use this route for Client implementation
124 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
125 deriving (Generic)
126 instance Arbitrary ApiInfo where
127 arbitrary = ApiInfo <$> arbitrary
128
129 deriveJSON (unPrefix "") 'ApiInfo
130
131 instance ToSchema ApiInfo
132
133 info :: ApiInfo
134 info = ApiInfo API.externalAPIs
135
136 ------------------------------------------------------------------------
137 ------------------------------------------------------------------------
138 data WithQuery = WithQuery
139 { _wq_query :: !Text
140 , _wq_databases :: !Database
141 , _wq_datafield :: !(Maybe Datafield)
142 , _wq_lang :: !Lang
143 , _wq_node_id :: !Int
144 , _wq_flowListWith :: !FlowSocialListWith
145 }
146 deriving Generic
147
148 makeLenses ''WithQuery
149 instance FromJSON WithQuery where
150 parseJSON = genericParseJSON $ jsonOptions "_wq_"
151 instance ToJSON WithQuery where
152 toJSON = genericToJSON $ jsonOptions "_wq_"
153 instance ToSchema WithQuery where
154 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
155
156 ------------------------------------------------------------------------
157
158 type AddWithQuery = Summary "Add with Query to corpus endpoint"
159 :> "corpus"
160 :> Capture "corpus_id" CorpusId
161 :> "query"
162 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
163
164 {-
165 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
166 :> "corpus"
167 :> Capture "corpus_id" CorpusId
168 :> "add"
169 :> "file"
170 :> MultipartForm Mem (MultipartData Mem)
171 :> QueryParam "fileType" FileType
172 :> "async"
173 :> AsyncJobs JobLog '[JSON] () JobLog
174 -}
175
176
177 ------------------------------------------------------------------------
178 -- TODO WithQuery also has a corpus id
179
180
181 addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
182 => User
183 -> CorpusId
184 -> WithQuery
185 -> Maybe Integer
186 -> JobHandle m
187 -> m ()
188 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
189 , _wq_databases = dbs
190 , _wq_datafield = datafield
191 , _wq_lang = l
192 , _wq_flowListWith = flw }) maybeLimit jobHandle = do
193 -- TODO ...
194 -- printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
195 -- printDebug "[addToCorpusWithQuery] datafield" datafield
196 -- printDebug "[addToCorpusWithQuery] flowListWith" flw
197
198 case datafield of
199 Just Web -> do
200 -- printDebug "[addToCorpusWithQuery] processing web request" datafield
201
202 markStarted 1 jobHandle
203
204 _ <- triggerSearxSearch user cid q l jobHandle
205
206 markComplete jobHandle
207
208 _ -> do
209 case datafield of
210 Just (External PubMed) -> do
211 _api_key <- view $ hasConfig . gc_pubmed_api_key
212 printDebug "[addToCorpusWithQuery] pubmed api key" _api_key
213 _ <- updateCorpusPubmedAPIKey cid _api_key
214 pure ()
215 _ -> pure ()
216 markStarted 3 jobHandle
217
218 -- TODO add cid
219 -- TODO if cid is folder -> create Corpus
220 -- if cid is corpus -> add to corpus
221 -- if cid is root -> create corpus in Private
222 -- printDebug "[G.A.N.C.New] getDataText with query" q
223 let db = database2origin dbs
224 eTxt <- getDataText db (Multi l) q maybeLimit
225
226 -- printDebug "[G.A.N.C.New] lTxts" lTxts
227 case eTxt of
228 Right txt -> do
229 -- TODO Sum lenghts of each txt elements
230
231 markProgress 1 jobHandle
232
233 void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle
234 -- printDebug "corpus id" cids
235 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
236 sendMail user
237 -- TODO ...
238 markComplete jobHandle
239
240 Left err -> do
241 -- printDebug "Error: " err
242 markFailed (Just $ T.pack (show err)) jobHandle
243
244 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
245 :> "corpus"
246 :> Capture "corpus_id" CorpusId
247 :> "add"
248 :> "form"
249 :> "async"
250 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
251
252 addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
253 => User
254 -> CorpusId
255 -> NewWithForm
256 -> JobHandle m
257 -> m ()
258 addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
259 -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
260 -- printDebug "[addToCorpusWithForm] fileType" ft
261 -- printDebug "[addToCorpusWithForm] fileFormat" ff
262 limit' <- view $ hasConfig . gc_max_docs_parsers
263 let limit = fromIntegral limit' :: Integer
264 let
265 parseC = case ft of
266 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
267 CSV -> Parser.parseFormatC Parser.CsvGargV3
268 WOS -> Parser.parseFormatC Parser.WOS
269 PresseRIS -> Parser.parseFormatC Parser.RisPresse
270 Iramuteq -> Parser.parseFormatC Parser.Iramuteq
271 JSON -> Parser.parseFormatC Parser.JSON
272
273 -- TODO granularity of the logStatus
274 let data' = case ff of
275 Plain -> cs d
276 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
277 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
278 Right decoded -> decoded
279 eDocsC <- liftBase $ parseC ff data'
280 case eDocsC of
281 Right (mCount, docsC) -> do
282 -- TODO Add progress (jobStatus) update for docs - this is a
283 -- long action
284
285 let docsC' = zipSources (yieldMany [1..]) docsC
286 .| mapMC (\(idx, doc) ->
287 if idx > limit then do
288 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
289 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
290 , "exceeds the MAX_DOCS_PARSERS limit ("
291 , show limit
292 , ")" ]
293 let panicMsg = T.concat $ T.pack <$> panicMsg'
294 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
295 panic panicMsg
296 else
297 pure doc)
298 .| mapC toHyperdataDocument
299
300 --printDebug "Parsing corpus finished : " cid
301 --logStatus jobLog2
302
303 --printDebug "Starting extraction : " cid
304 -- TODO granularity of the logStatus
305 -- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
306
307 _cid' <- flowCorpus user
308 (Right [cid])
309 (Multi $ fromMaybe EN l)
310 (Just sel)
311 --(Just $ fromIntegral $ length docs, docsC')
312 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
313 --(map (map toHyperdataDocument) docs)
314 jobHandle
315
316 -- printDebug "Extraction finished : " cid
317 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
318 -- TODO uncomment this
319 --sendMail user
320
321 markComplete jobHandle
322 Left e -> do
323 printDebug "[addToCorpusWithForm] parse error" e
324 markFailed (Just $ T.pack e) jobHandle
325
326 {-
327 addToCorpusWithFile :: FlowCmdM env err m
328 => CorpusId
329 -> MultipartData Mem
330 -> Maybe FileType
331 -> (JobLog -> m ())
332 -> m JobLog
333 addToCorpusWithFile cid input filetype logStatus = do
334 logStatus JobLog { _scst_succeeded = Just 10
335 , _scst_failed = Just 2
336 , _scst_remaining = Just 138
337 , _scst_events = Just []
338 }
339 printDebug "addToCorpusWithFile" cid
340 _h <- postUpload cid filetype input
341
342 pure JobLog { _scst_succeeded = Just 137
343 , _scst_failed = Just 13
344 , _scst_remaining = Just 0
345 , _scst_events = Just []
346 }
347 -}
348
349
350
351 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
352 :> "corpus"
353 :> Capture "corpus_id" CorpusId
354 :> "add"
355 :> "file"
356 :> "async"
357 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
358
359 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
360 => User
361 -> CorpusId
362 -> NewWithFile
363 -> JobHandle m
364 -> m ()
365 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do
366
367 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
368 markStarted 1 jobHandle
369
370 fPath <- GargDB.writeFile nwf
371 printDebug "[addToCorpusWithFile] File saved as: " fPath
372
373 uId <- getUserId user
374 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
375
376 _ <- case nIds of
377 [nId] -> do
378 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
379 let hl = node ^. node_hyperdata
380 _ <- updateHyperdata nId $ hl { _hff_name = fName
381 , _hff_path = T.pack fPath }
382
383 printDebug "[addToCorpusWithFile] Created node with id: " nId
384 _ -> pure ()
385
386 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
387
388 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
389 sendMail user
390
391 markComplete jobHandle