]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[pubmed] implement pubmed api key in hyperdata user
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Conduit
22 import Control.Lens hiding (elements, Empty)
23 import Control.Monad
24 import Data.Aeson
25 import Data.Aeson.TH (deriveJSON)
26 import qualified Data.ByteString.Base64 as BSB64
27 import Data.Conduit.Internal (zipSources)
28 import Data.Either
29 import Data.Maybe (fromMaybe)
30 import Data.Swagger
31 import Data.Text (Text)
32 import qualified Data.Text as T
33 import GHC.Generics (Generic)
34 import Servant
35 import Servant.Job.Utils (jsonOptions)
36 -- import Servant.Multipart
37 import qualified Data.Text.Encoding as TE
38 -- import Test.QuickCheck (elements)
39 import Test.QuickCheck.Arbitrary
40
41 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
42 import Gargantext.API.Admin.Types (HasSettings)
43 import Gargantext.API.Node.Corpus.New.Types
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import Gargantext.Core.Types.Individu (User(..))
50 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
51 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Mail (sendMail)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Action.User (getUserId)
56 import Gargantext.Database.Admin.Types.Hyperdata
57 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..))
58 import Gargantext.Database.Prelude (hasConfig)
59 import Gargantext.Database.Query.Table.Node (getNodeWith)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Query.Table.User (getUserPubmedAPIKey)
62 import Gargantext.Database.Schema.Node (node_hyperdata)
63 import Gargantext.Prelude
64 import Gargantext.Prelude.Config (gc_max_docs_parsers)
65 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
66 import qualified Gargantext.Core.Text.Corpus.API as API
67 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
68 import qualified Gargantext.Database.GargDB as GargDB
69
70 ------------------------------------------------------------------------
71 {-
72 data Query = Query { query_query :: Text
73 , query_node_id :: Int
74 , query_lang :: Lang
75 , query_databases :: [DataOrigin]
76 }
77 deriving (Eq, Generic)
78
79 deriveJSON (unPrefix "query_") 'Query
80
81 instance Arbitrary Query where
82 arbitrary = elements [ Query q n la fs
83 | q <- ["honeybee* AND collapse"
84 ,"covid 19"
85 ]
86 , n <- [0..10]
87 , la <- allLangs
88 , fs <- take 3 $ repeat allDataOrigins
89 ]
90
91 instance ToSchema Query where
92 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
93 -}
94
95 ------------------------------------------------------------------------
96
97 {-
98 type Api = PostApi
99 :<|> GetApi
100
101 type PostApi = Summary "New Corpus endpoint"
102 :> ReqBody '[JSON] Query
103 :> Post '[JSON] CorpusId
104 type GetApi = Get '[JSON] ApiInfo
105 -}
106
107 -- | TODO manage several apis
108 -- TODO-ACCESS
109 -- TODO this is only the POST
110 {-
111 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
112 api uid (Query q _ as) = do
113 cId <- case head as of
114 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
115 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
116 Just a -> do
117 docs <- liftBase $ API.get a q (Just 1000)
118 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
119 pure cId'
120
121 pure cId
122 -}
123
124 ------------------------------------------------
125 -- TODO use this route for Client implementation
126 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
127 deriving (Generic)
128 instance Arbitrary ApiInfo where
129 arbitrary = ApiInfo <$> arbitrary
130
131 deriveJSON (unPrefix "") 'ApiInfo
132
133 instance ToSchema ApiInfo
134
135 info :: ApiInfo
136 info = ApiInfo API.externalAPIs
137
138 ------------------------------------------------------------------------
139 ------------------------------------------------------------------------
140 data WithQuery = WithQuery
141 { _wq_query :: !API.RawQuery
142 , _wq_databases :: !Database
143 , _wq_datafield :: !(Maybe Datafield)
144 , _wq_lang :: !Lang
145 , _wq_node_id :: !Int
146 , _wq_flowListWith :: !FlowSocialListWith
147 }
148 deriving (Show, Eq, Generic)
149
150 makeLenses ''WithQuery
151 instance FromJSON WithQuery where
152 parseJSON = genericParseJSON $ jsonOptions "_wq_"
153 instance ToJSON WithQuery where
154 toJSON = genericToJSON $ jsonOptions "_wq_"
155 instance ToSchema WithQuery where
156 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
157
158 instance Arbitrary WithQuery where
159 arbitrary = WithQuery <$> arbitrary
160 <*> arbitrary
161 <*> arbitrary
162 <*> arbitrary
163 <*> arbitrary
164 <*> arbitrary
165
166 ------------------------------------------------------------------------
167
168 type AddWithQuery = Summary "Add with Query to corpus endpoint"
169 :> "corpus"
170 :> Capture "corpus_id" CorpusId
171 :> "query"
172 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
173
174 {-
175 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
176 :> "corpus"
177 :> Capture "corpus_id" CorpusId
178 :> "add"
179 :> "file"
180 :> MultipartForm Mem (MultipartData Mem)
181 :> QueryParam "fileType" FileType
182 :> "async"
183 :> AsyncJobs JobLog '[JSON] () JobLog
184 -}
185
186
187 ------------------------------------------------------------------------
188 -- TODO WithQuery also has a corpus id
189
190
191 addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
192 => User
193 -> CorpusId
194 -> WithQuery
195 -> Maybe API.Limit
196 -> JobHandle m
197 -> m ()
198 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
199 , _wq_databases = dbs
200 , _wq_datafield = datafield
201 , _wq_lang = l
202 , _wq_flowListWith = flw }) maybeLimit jobHandle = do
203 -- TODO ...
204 -- printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
205 -- printDebug "[addToCorpusWithQuery] datafield" datafield
206 -- printDebug "[addToCorpusWithQuery] flowListWith" flw
207
208 case datafield of
209 Just Web -> do
210 -- printDebug "[addToCorpusWithQuery] processing web request" datafield
211
212 markStarted 1 jobHandle
213
214 _ <- triggerSearxSearch user cid q l jobHandle
215
216 markComplete jobHandle
217
218 _ -> do
219 markStarted 3 jobHandle
220
221 -- TODO add cid
222 -- TODO if cid is folder -> create Corpus
223 -- if cid is corpus -> add to corpus
224 -- if cid is root -> create corpus in Private
225 -- printDebug "[G.A.N.C.New] getDataText with query" q
226 let db = database2origin dbs
227 mPubmedAPIKey <- getUserPubmedAPIKey user
228 -- printDebug "[addToCorpusWithQuery] mPubmedAPIKey" mPubmedAPIKey
229 eTxt <- getDataText db (Multi l) q mPubmedAPIKey maybeLimit
230
231 -- printDebug "[G.A.N.C.New] lTxts" lTxts
232 case eTxt of
233 Right txt -> do
234 -- TODO Sum lenghts of each txt elements
235
236 markProgress 1 jobHandle
237
238 void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle
239 -- printDebug "corpus id" cids
240 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
241 sendMail user
242 -- TODO ...
243 markComplete jobHandle
244
245 Left err -> do
246 -- printDebug "Error: " err
247 markFailed (Just $ T.pack (show err)) jobHandle
248
249 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
250 :> "corpus"
251 :> Capture "corpus_id" CorpusId
252 :> "add"
253 :> "form"
254 :> "async"
255 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
256
257 addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
258 => User
259 -> CorpusId
260 -> NewWithForm
261 -> JobHandle m
262 -> m ()
263 addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
264 -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
265 -- printDebug "[addToCorpusWithForm] fileType" ft
266 -- printDebug "[addToCorpusWithForm] fileFormat" ff
267 limit' <- view $ hasConfig . gc_max_docs_parsers
268 let limit = fromIntegral limit' :: Integer
269 let
270 parseC = case ft of
271 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
272 CSV -> Parser.parseFormatC Parser.CsvGargV3
273 WOS -> Parser.parseFormatC Parser.WOS
274 PresseRIS -> Parser.parseFormatC Parser.RisPresse
275 Iramuteq -> Parser.parseFormatC Parser.Iramuteq
276 JSON -> Parser.parseFormatC Parser.JSON
277
278 -- TODO granularity of the logStatus
279 let data' = case ff of
280 Plain -> cs d
281 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
282 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
283 Right decoded -> decoded
284 eDocsC <- liftBase $ parseC ff data'
285 case eDocsC of
286 Right (mCount, docsC) -> do
287 -- TODO Add progress (jobStatus) update for docs - this is a
288 -- long action
289
290 let docsC' = zipSources (yieldMany [1..]) docsC
291 .| mapMC (\(idx, doc) ->
292 if idx > limit then do
293 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
294 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
295 , "exceeds the MAX_DOCS_PARSERS limit ("
296 , show limit
297 , ")" ]
298 let panicMsg = T.concat $ T.pack <$> panicMsg'
299 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
300 panic panicMsg
301 else
302 pure doc)
303 .| mapC toHyperdataDocument
304
305 --printDebug "Parsing corpus finished : " cid
306 --logStatus jobLog2
307
308 --printDebug "Starting extraction : " cid
309 -- TODO granularity of the logStatus
310 -- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
311
312 _cid' <- flowCorpus user
313 (Right [cid])
314 (Multi $ fromMaybe EN l)
315 (Just sel)
316 --(Just $ fromIntegral $ length docs, docsC')
317 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
318 --(map (map toHyperdataDocument) docs)
319 jobHandle
320
321 -- printDebug "Extraction finished : " cid
322 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
323 -- TODO uncomment this
324 --sendMail user
325
326 markComplete jobHandle
327 Left e -> do
328 printDebug "[addToCorpusWithForm] parse error" e
329 markFailed (Just $ T.pack e) jobHandle
330
331 {-
332 addToCorpusWithFile :: FlowCmdM env err m
333 => CorpusId
334 -> MultipartData Mem
335 -> Maybe FileType
336 -> (JobLog -> m ())
337 -> m JobLog
338 addToCorpusWithFile cid input filetype logStatus = do
339 logStatus JobLog { _scst_succeeded = Just 10
340 , _scst_failed = Just 2
341 , _scst_remaining = Just 138
342 , _scst_events = Just []
343 }
344 printDebug "addToCorpusWithFile" cid
345 _h <- postUpload cid filetype input
346
347 pure JobLog { _scst_succeeded = Just 137
348 , _scst_failed = Just 13
349 , _scst_remaining = Just 0
350 , _scst_events = Just []
351 }
352 -}
353
354
355
356 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
357 :> "corpus"
358 :> Capture "corpus_id" CorpusId
359 :> "add"
360 :> "file"
361 :> "async"
362 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
363
364 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
365 => User
366 -> CorpusId
367 -> NewWithFile
368 -> JobHandle m
369 -> m ()
370 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do
371
372 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
373 markStarted 1 jobHandle
374
375 fPath <- GargDB.writeFile nwf
376 printDebug "[addToCorpusWithFile] File saved as: " fPath
377
378 uId <- getUserId user
379 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
380
381 _ <- case nIds of
382 [nId] -> do
383 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
384 let hl = node ^. node_hyperdata
385 _ <- updateHyperdata nId $ hl { _hff_name = fName
386 , _hff_path = T.pack fPath }
387
388 printDebug "[addToCorpusWithFile] Created node with id: " nId
389 _ -> pure ()
390
391 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
392
393 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
394 sendMail user
395
396 markComplete jobHandle