]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
Revert "Revert "Merge remote-tracking branch 'origin/201-dev-user-pubmed-api-key...
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Conduit
22 import Control.Lens hiding (elements, Empty)
23 import Control.Monad
24 import Data.Aeson
25 import Data.Aeson.TH (deriveJSON)
26 import qualified Data.ByteString.Base64 as BSB64
27 import Data.Conduit.Internal (zipSources)
28 import Data.Either
29 import Data.Maybe (fromMaybe)
30 import Data.Swagger
31 import Data.Text (Text)
32 import qualified Data.Text as T
33 import GHC.Generics (Generic)
34 import Servant
35 import Servant.Job.Utils (jsonOptions)
36 -- import Servant.Multipart
37 import qualified Data.Text.Encoding as TE
38 -- import Test.QuickCheck (elements)
39 import Test.QuickCheck.Arbitrary
40
41 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
42 import Gargantext.API.Admin.Types (HasSettings)
43 import Gargantext.API.Node.Corpus.New.Types
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import Gargantext.Core.Types.Individu (User(..))
50 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
51 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Mail (sendMail)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Action.User (getUserId)
56 import Gargantext.Database.Admin.Types.Hyperdata
57 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..))
58 import Gargantext.Database.Prelude (hasConfig)
59 import Gargantext.Database.Query.Table.Node (getNodeWith, updateCorpusPubmedAPIKey)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import Gargantext.Prelude
63 import Gargantext.Prelude.Config (gc_max_docs_parsers, gc_pubmed_api_key)
64 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
65 import qualified Gargantext.Core.Text.Corpus.API as API
66 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
67 import qualified Gargantext.Database.GargDB as GargDB
68
69 ------------------------------------------------------------------------
70 {-
71 data Query = Query { query_query :: Text
72 , query_node_id :: Int
73 , query_lang :: Lang
74 , query_databases :: [DataOrigin]
75 }
76 deriving (Eq, Generic)
77
78 deriveJSON (unPrefix "query_") 'Query
79
80 instance Arbitrary Query where
81 arbitrary = elements [ Query q n la fs
82 | q <- ["honeybee* AND collapse"
83 ,"covid 19"
84 ]
85 , n <- [0..10]
86 , la <- allLangs
87 , fs <- take 3 $ repeat allDataOrigins
88 ]
89
90 instance ToSchema Query where
91 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
92 -}
93
94 ------------------------------------------------------------------------
95
96 {-
97 type Api = PostApi
98 :<|> GetApi
99
100 type PostApi = Summary "New Corpus endpoint"
101 :> ReqBody '[JSON] Query
102 :> Post '[JSON] CorpusId
103 type GetApi = Get '[JSON] ApiInfo
104 -}
105
106 -- | TODO manage several apis
107 -- TODO-ACCESS
108 -- TODO this is only the POST
109 {-
110 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
111 api uid (Query q _ as) = do
112 cId <- case head as of
113 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
114 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
115 Just a -> do
116 docs <- liftBase $ API.get a q (Just 1000)
117 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
118 pure cId'
119
120 pure cId
121 -}
122
123 ------------------------------------------------
124 -- TODO use this route for Client implementation
125 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
126 deriving (Generic)
127 instance Arbitrary ApiInfo where
128 arbitrary = ApiInfo <$> arbitrary
129
130 deriveJSON (unPrefix "") 'ApiInfo
131
132 instance ToSchema ApiInfo
133
134 info :: ApiInfo
135 info = ApiInfo API.externalAPIs
136
137 ------------------------------------------------------------------------
138 ------------------------------------------------------------------------
139 data WithQuery = WithQuery
140 { _wq_query :: !API.RawQuery
141 , _wq_databases :: !Database
142 , _wq_datafield :: !(Maybe Datafield)
143 , _wq_lang :: !Lang
144 , _wq_node_id :: !Int
145 , _wq_flowListWith :: !FlowSocialListWith
146 }
147 deriving Generic
148
149 makeLenses ''WithQuery
150 instance FromJSON WithQuery where
151 parseJSON = genericParseJSON $ jsonOptions "_wq_"
152 instance ToJSON WithQuery where
153 toJSON = genericToJSON $ jsonOptions "_wq_"
154 instance ToSchema WithQuery where
155 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
156
157 ------------------------------------------------------------------------
158
159 type AddWithQuery = Summary "Add with Query to corpus endpoint"
160 :> "corpus"
161 :> Capture "corpus_id" CorpusId
162 :> "query"
163 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
164
165 {-
166 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
167 :> "corpus"
168 :> Capture "corpus_id" CorpusId
169 :> "add"
170 :> "file"
171 :> MultipartForm Mem (MultipartData Mem)
172 :> QueryParam "fileType" FileType
173 :> "async"
174 :> AsyncJobs JobLog '[JSON] () JobLog
175 -}
176
177
178 ------------------------------------------------------------------------
179 -- TODO WithQuery also has a corpus id
180
181
182 addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
183 => User
184 -> CorpusId
185 -> WithQuery
186 -> Maybe API.Limit
187 -> JobHandle m
188 -> m ()
189 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
190 , _wq_databases = dbs
191 , _wq_datafield = datafield
192 , _wq_lang = l
193 , _wq_flowListWith = flw }) maybeLimit jobHandle = do
194 -- TODO ...
195 -- printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
196 -- printDebug "[addToCorpusWithQuery] datafield" datafield
197 -- printDebug "[addToCorpusWithQuery] flowListWith" flw
198
199 case datafield of
200 Just Web -> do
201 -- printDebug "[addToCorpusWithQuery] processing web request" datafield
202
203 markStarted 1 jobHandle
204
205 _ <- triggerSearxSearch user cid q l jobHandle
206
207 markComplete jobHandle
208
209 _ -> do
210 case datafield of
211 Just (External PubMed) -> do
212 _api_key <- view $ hasConfig . gc_pubmed_api_key
213 printDebug "[addToCorpusWithQuery] pubmed api key" _api_key
214 _ <- updateCorpusPubmedAPIKey cid _api_key
215 pure ()
216 _ -> pure ()
217 markStarted 3 jobHandle
218
219 -- TODO add cid
220 -- TODO if cid is folder -> create Corpus
221 -- if cid is corpus -> add to corpus
222 -- if cid is root -> create corpus in Private
223 -- printDebug "[G.A.N.C.New] getDataText with query" q
224 let db = database2origin dbs
225 eTxt <- getDataText db (Multi l) q maybeLimit
226
227 -- printDebug "[G.A.N.C.New] lTxts" lTxts
228 case eTxt of
229 Right txt -> do
230 -- TODO Sum lenghts of each txt elements
231
232 markProgress 1 jobHandle
233
234 void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle
235 -- printDebug "corpus id" cids
236 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
237 sendMail user
238 -- TODO ...
239 markComplete jobHandle
240
241 Left err -> do
242 -- printDebug "Error: " err
243 markFailed (Just $ T.pack (show err)) jobHandle
244
245 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
246 :> "corpus"
247 :> Capture "corpus_id" CorpusId
248 :> "add"
249 :> "form"
250 :> "async"
251 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
252
253 addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
254 => User
255 -> CorpusId
256 -> NewWithForm
257 -> JobHandle m
258 -> m ()
259 addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
260 -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
261 -- printDebug "[addToCorpusWithForm] fileType" ft
262 -- printDebug "[addToCorpusWithForm] fileFormat" ff
263 limit' <- view $ hasConfig . gc_max_docs_parsers
264 let limit = fromIntegral limit' :: Integer
265 let
266 parseC = case ft of
267 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
268 CSV -> Parser.parseFormatC Parser.CsvGargV3
269 WOS -> Parser.parseFormatC Parser.WOS
270 PresseRIS -> Parser.parseFormatC Parser.RisPresse
271 Iramuteq -> Parser.parseFormatC Parser.Iramuteq
272 JSON -> Parser.parseFormatC Parser.JSON
273
274 -- TODO granularity of the logStatus
275 let data' = case ff of
276 Plain -> cs d
277 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
278 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
279 Right decoded -> decoded
280 eDocsC <- liftBase $ parseC ff data'
281 case eDocsC of
282 Right (mCount, docsC) -> do
283 -- TODO Add progress (jobStatus) update for docs - this is a
284 -- long action
285
286 let docsC' = zipSources (yieldMany [1..]) docsC
287 .| mapMC (\(idx, doc) ->
288 if idx > limit then do
289 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
290 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
291 , "exceeds the MAX_DOCS_PARSERS limit ("
292 , show limit
293 , ")" ]
294 let panicMsg = T.concat $ T.pack <$> panicMsg'
295 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
296 panic panicMsg
297 else
298 pure doc)
299 .| mapC toHyperdataDocument
300
301 --printDebug "Parsing corpus finished : " cid
302 --logStatus jobLog2
303
304 --printDebug "Starting extraction : " cid
305 -- TODO granularity of the logStatus
306 -- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
307
308 _cid' <- flowCorpus user
309 (Right [cid])
310 (Multi $ fromMaybe EN l)
311 (Just sel)
312 --(Just $ fromIntegral $ length docs, docsC')
313 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
314 --(map (map toHyperdataDocument) docs)
315 jobHandle
316
317 -- printDebug "Extraction finished : " cid
318 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
319 -- TODO uncomment this
320 --sendMail user
321
322 markComplete jobHandle
323 Left e -> do
324 printDebug "[addToCorpusWithForm] parse error" e
325 markFailed (Just $ T.pack e) jobHandle
326
327 {-
328 addToCorpusWithFile :: FlowCmdM env err m
329 => CorpusId
330 -> MultipartData Mem
331 -> Maybe FileType
332 -> (JobLog -> m ())
333 -> m JobLog
334 addToCorpusWithFile cid input filetype logStatus = do
335 logStatus JobLog { _scst_succeeded = Just 10
336 , _scst_failed = Just 2
337 , _scst_remaining = Just 138
338 , _scst_events = Just []
339 }
340 printDebug "addToCorpusWithFile" cid
341 _h <- postUpload cid filetype input
342
343 pure JobLog { _scst_succeeded = Just 137
344 , _scst_failed = Just 13
345 , _scst_remaining = Just 0
346 , _scst_events = Just []
347 }
348 -}
349
350
351
352 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
353 :> "corpus"
354 :> Capture "corpus_id" CorpusId
355 :> "add"
356 :> "file"
357 :> "async"
358 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
359
360 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
361 => User
362 -> CorpusId
363 -> NewWithFile
364 -> JobHandle m
365 -> m ()
366 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do
367
368 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
369 markStarted 1 jobHandle
370
371 fPath <- GargDB.writeFile nwf
372 printDebug "[addToCorpusWithFile] File saved as: " fPath
373
374 uId <- getUserId user
375 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
376
377 _ <- case nIds of
378 [nId] -> do
379 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
380 let hl = node ^. node_hyperdata
381 _ <- updateHyperdata nId $ hl { _hff_name = fName
382 , _hff_path = T.pack fPath }
383
384 printDebug "[addToCorpusWithFile] Created node with id: " nId
385 _ -> pure ()
386
387 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
388
389 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
390 sendMail user
391
392 markComplete jobHandle