]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
Merge remote-tracking branch 'origin/adinapoli/fix-datafield-instance' into dev-merge
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Conduit
22 import Control.Lens hiding (elements, Empty)
23 import Control.Monad
24 import Data.Aeson
25 import Data.Aeson.TH (deriveJSON)
26 import qualified Data.ByteString.Base64 as BSB64
27 import Data.Conduit.Internal (zipSources)
28 import Data.Either
29 import Data.Maybe (fromMaybe)
30 import Data.Swagger
31 import Data.Text (Text)
32 import qualified Data.Text as T
33 import GHC.Generics (Generic)
34 import Servant
35 import Servant.Job.Utils (jsonOptions)
36 -- import Servant.Multipart
37 import qualified Data.Text.Encoding as TE
38 -- import Test.QuickCheck (elements)
39 import Test.QuickCheck.Arbitrary
40
41 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
42 import Gargantext.API.Admin.Types (HasSettings)
43 import Gargantext.API.Node.Corpus.New.Types
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import Gargantext.Core.Types.Individu (User(..))
50 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
51 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Mail (sendMail)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Action.User (getUserId)
56 import Gargantext.Database.Admin.Types.Hyperdata
57 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..))
58 import Gargantext.Database.Prelude (hasConfig)
59 import Gargantext.Database.Query.Table.Node (getNodeWith, updateCorpusPubmedAPIKey)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import Gargantext.Prelude
63 import Gargantext.Prelude.Config (gc_max_docs_parsers, gc_pubmed_api_key)
64 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
65 import qualified Gargantext.Core.Text.Corpus.API as API
66 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
67 import qualified Gargantext.Database.GargDB as GargDB
68
69 ------------------------------------------------------------------------
70 {-
71 data Query = Query { query_query :: Text
72 , query_node_id :: Int
73 , query_lang :: Lang
74 , query_databases :: [DataOrigin]
75 }
76 deriving (Eq, Generic)
77
78 deriveJSON (unPrefix "query_") 'Query
79
80 instance Arbitrary Query where
81 arbitrary = elements [ Query q n la fs
82 | q <- ["honeybee* AND collapse"
83 ,"covid 19"
84 ]
85 , n <- [0..10]
86 , la <- allLangs
87 , fs <- take 3 $ repeat allDataOrigins
88 ]
89
90 instance ToSchema Query where
91 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
92 -}
93
94 ------------------------------------------------------------------------
95
96 {-
97 type Api = PostApi
98 :<|> GetApi
99
100 type PostApi = Summary "New Corpus endpoint"
101 :> ReqBody '[JSON] Query
102 :> Post '[JSON] CorpusId
103 type GetApi = Get '[JSON] ApiInfo
104 -}
105
106 -- | TODO manage several apis
107 -- TODO-ACCESS
108 -- TODO this is only the POST
109 {-
110 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
111 api uid (Query q _ as) = do
112 cId <- case head as of
113 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
114 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
115 Just a -> do
116 docs <- liftBase $ API.get a q (Just 1000)
117 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
118 pure cId'
119
120 pure cId
121 -}
122
123 ------------------------------------------------
124 -- TODO use this route for Client implementation
125 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
126 deriving (Generic)
127 instance Arbitrary ApiInfo where
128 arbitrary = ApiInfo <$> arbitrary
129
130 deriveJSON (unPrefix "") 'ApiInfo
131
132 instance ToSchema ApiInfo
133
134 info :: ApiInfo
135 info = ApiInfo API.externalAPIs
136
137 ------------------------------------------------------------------------
138 ------------------------------------------------------------------------
139 data WithQuery = WithQuery
140 { _wq_query :: !API.RawQuery
141 , _wq_databases :: !Database
142 , _wq_datafield :: !(Maybe Datafield)
143 , _wq_lang :: !Lang
144 , _wq_node_id :: !Int
145 , _wq_flowListWith :: !FlowSocialListWith
146 }
147 deriving (Show, Eq, Generic)
148
149 makeLenses ''WithQuery
150 instance FromJSON WithQuery where
151 parseJSON = genericParseJSON $ jsonOptions "_wq_"
152 instance ToJSON WithQuery where
153 toJSON = genericToJSON $ jsonOptions "_wq_"
154 instance ToSchema WithQuery where
155 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
156
157 instance Arbitrary WithQuery where
158 arbitrary = WithQuery <$> arbitrary
159 <*> arbitrary
160 <*> arbitrary
161 <*> arbitrary
162 <*> arbitrary
163 <*> arbitrary
164
165 ------------------------------------------------------------------------
166
167 type AddWithQuery = Summary "Add with Query to corpus endpoint"
168 :> "corpus"
169 :> Capture "corpus_id" CorpusId
170 :> "query"
171 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
172
173 {-
174 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
175 :> "corpus"
176 :> Capture "corpus_id" CorpusId
177 :> "add"
178 :> "file"
179 :> MultipartForm Mem (MultipartData Mem)
180 :> QueryParam "fileType" FileType
181 :> "async"
182 :> AsyncJobs JobLog '[JSON] () JobLog
183 -}
184
185
186 ------------------------------------------------------------------------
187 -- TODO WithQuery also has a corpus id
188
189
190 addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
191 => User
192 -> CorpusId
193 -> WithQuery
194 -> Maybe API.Limit
195 -> JobHandle m
196 -> m ()
197 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
198 , _wq_databases = dbs
199 , _wq_datafield = datafield
200 , _wq_lang = l
201 , _wq_flowListWith = flw }) maybeLimit jobHandle = do
202 -- TODO ...
203 -- printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
204 -- printDebug "[addToCorpusWithQuery] datafield" datafield
205 -- printDebug "[addToCorpusWithQuery] flowListWith" flw
206
207 case datafield of
208 Just Web -> do
209 -- printDebug "[addToCorpusWithQuery] processing web request" datafield
210
211 markStarted 1 jobHandle
212
213 _ <- triggerSearxSearch user cid q l jobHandle
214
215 markComplete jobHandle
216
217 _ -> do
218 case datafield of
219 Just (External PubMed) -> do
220 _api_key <- view $ hasConfig . gc_pubmed_api_key
221 printDebug "[addToCorpusWithQuery] pubmed api key" _api_key
222 _ <- updateCorpusPubmedAPIKey cid _api_key
223 pure ()
224 _ -> pure ()
225 markStarted 3 jobHandle
226
227 -- TODO add cid
228 -- TODO if cid is folder -> create Corpus
229 -- if cid is corpus -> add to corpus
230 -- if cid is root -> create corpus in Private
231 -- printDebug "[G.A.N.C.New] getDataText with query" q
232 let db = database2origin dbs
233 eTxt <- getDataText db (Multi l) q maybeLimit
234
235 -- printDebug "[G.A.N.C.New] lTxts" lTxts
236 case eTxt of
237 Right txt -> do
238 -- TODO Sum lenghts of each txt elements
239
240 markProgress 1 jobHandle
241
242 void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle
243 -- printDebug "corpus id" cids
244 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
245 sendMail user
246 -- TODO ...
247 markComplete jobHandle
248
249 Left err -> do
250 -- printDebug "Error: " err
251 markFailed (Just $ T.pack (show err)) jobHandle
252
253 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
254 :> "corpus"
255 :> Capture "corpus_id" CorpusId
256 :> "add"
257 :> "form"
258 :> "async"
259 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
260
261 addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
262 => User
263 -> CorpusId
264 -> NewWithForm
265 -> JobHandle m
266 -> m ()
267 addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
268 -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
269 -- printDebug "[addToCorpusWithForm] fileType" ft
270 -- printDebug "[addToCorpusWithForm] fileFormat" ff
271 limit' <- view $ hasConfig . gc_max_docs_parsers
272 let limit = fromIntegral limit' :: Integer
273 let
274 parseC = case ft of
275 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
276 CSV -> Parser.parseFormatC Parser.CsvGargV3
277 WOS -> Parser.parseFormatC Parser.WOS
278 PresseRIS -> Parser.parseFormatC Parser.RisPresse
279 Iramuteq -> Parser.parseFormatC Parser.Iramuteq
280 JSON -> Parser.parseFormatC Parser.JSON
281
282 -- TODO granularity of the logStatus
283 let data' = case ff of
284 Plain -> cs d
285 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
286 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
287 Right decoded -> decoded
288 eDocsC <- liftBase $ parseC ff data'
289 case eDocsC of
290 Right (mCount, docsC) -> do
291 -- TODO Add progress (jobStatus) update for docs - this is a
292 -- long action
293
294 let docsC' = zipSources (yieldMany [1..]) docsC
295 .| mapMC (\(idx, doc) ->
296 if idx > limit then do
297 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
298 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
299 , "exceeds the MAX_DOCS_PARSERS limit ("
300 , show limit
301 , ")" ]
302 let panicMsg = T.concat $ T.pack <$> panicMsg'
303 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
304 panic panicMsg
305 else
306 pure doc)
307 .| mapC toHyperdataDocument
308
309 --printDebug "Parsing corpus finished : " cid
310 --logStatus jobLog2
311
312 --printDebug "Starting extraction : " cid
313 -- TODO granularity of the logStatus
314 -- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
315
316 _cid' <- flowCorpus user
317 (Right [cid])
318 (Multi $ fromMaybe EN l)
319 (Just sel)
320 --(Just $ fromIntegral $ length docs, docsC')
321 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
322 --(map (map toHyperdataDocument) docs)
323 jobHandle
324
325 -- printDebug "Extraction finished : " cid
326 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
327 -- TODO uncomment this
328 --sendMail user
329
330 markComplete jobHandle
331 Left e -> do
332 printDebug "[addToCorpusWithForm] parse error" e
333 markFailed (Just $ T.pack e) jobHandle
334
335 {-
336 addToCorpusWithFile :: FlowCmdM env err m
337 => CorpusId
338 -> MultipartData Mem
339 -> Maybe FileType
340 -> (JobLog -> m ())
341 -> m JobLog
342 addToCorpusWithFile cid input filetype logStatus = do
343 logStatus JobLog { _scst_succeeded = Just 10
344 , _scst_failed = Just 2
345 , _scst_remaining = Just 138
346 , _scst_events = Just []
347 }
348 printDebug "addToCorpusWithFile" cid
349 _h <- postUpload cid filetype input
350
351 pure JobLog { _scst_succeeded = Just 137
352 , _scst_failed = Just 13
353 , _scst_remaining = Just 0
354 , _scst_events = Just []
355 }
356 -}
357
358
359
360 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
361 :> "corpus"
362 :> Capture "corpus_id" CorpusId
363 :> "add"
364 :> "file"
365 :> "async"
366 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
367
368 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
369 => User
370 -> CorpusId
371 -> NewWithFile
372 -> JobHandle m
373 -> m ()
374 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do
375
376 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
377 markStarted 1 jobHandle
378
379 fPath <- GargDB.writeFile nwf
380 printDebug "[addToCorpusWithFile] File saved as: " fPath
381
382 uId <- getUserId user
383 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
384
385 _ <- case nIds of
386 [nId] -> do
387 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
388 let hl = node ^. node_hyperdata
389 _ <- updateHyperdata nId $ hl { _hff_name = fName
390 , _hff_path = T.pack fPath }
391
392 printDebug "[addToCorpusWithFile] Created node with id: " nId
393 _ -> pure ()
394
395 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
396
397 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
398 sendMail user
399
400 markComplete jobHandle