]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
Merge remote-tracking branch 'origin/dev-phyloDebug' into 206-dev-phylo
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Conduit
22 import Control.Lens hiding (elements, Empty)
23 import Control.Monad
24 import Data.Aeson
25 import Data.Aeson.TH (deriveJSON)
26 import qualified Data.ByteString.Base64 as BSB64
27 import Data.Conduit.Internal (zipSources)
28 import Data.Either
29 import Data.Maybe (fromMaybe)
30 import Data.Swagger
31 import Data.Text (Text)
32 import qualified Data.Text as T
33 import GHC.Generics (Generic)
34 import Servant
35 import Servant.Job.Utils (jsonOptions)
36 -- import Servant.Multipart
37 import qualified Data.Text.Encoding as TE
38 -- import Test.QuickCheck (elements)
39 import Test.QuickCheck.Arbitrary
40
41 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
42 import Gargantext.API.Admin.Types (HasSettings)
43 import Gargantext.API.Node.Corpus.New.Types
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import Gargantext.Core.Types.Individu (User(..))
50 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
51 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Mail (sendMail)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Action.User (getUserId)
56 import Gargantext.Database.Admin.Types.Hyperdata
57 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
58 import Gargantext.Database.Prelude (hasConfig)
59 import Gargantext.Database.Query.Table.Node (getNodeWith, updateCorpusPubmedAPIKey)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import Gargantext.Prelude
63 import Gargantext.Prelude.Config (gc_max_docs_parsers)
64 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
65 import qualified Gargantext.Core.Text.Corpus.API as API
66 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
67 import qualified Gargantext.Database.GargDB as GargDB
68 ------------------------------------------------------------------------
69 {-
70 data Query = Query { query_query :: Text
71 , query_node_id :: Int
72 , query_lang :: Lang
73 , query_databases :: [DataOrigin]
74 }
75 deriving (Eq, Generic)
76
77 deriveJSON (unPrefix "query_") 'Query
78
79 instance Arbitrary Query where
80 arbitrary = elements [ Query q n la fs
81 | q <- ["honeybee* AND collapse"
82 ,"covid 19"
83 ]
84 , n <- [0..10]
85 , la <- allLangs
86 , fs <- take 3 $ repeat allDataOrigins
87 ]
88
89 instance ToSchema Query where
90 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
91 -}
92
93 ------------------------------------------------------------------------
94
95 {-
96 type Api = PostApi
97 :<|> GetApi
98
99 type PostApi = Summary "New Corpus endpoint"
100 :> ReqBody '[JSON] Query
101 :> Post '[JSON] CorpusId
102 type GetApi = Get '[JSON] ApiInfo
103 -}
104
105 -- | TODO manage several apis
106 -- TODO-ACCESS
107 -- TODO this is only the POST
108 {-
109 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
110 api uid (Query q _ as) = do
111 cId <- case head as of
112 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
113 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
114 Just a -> do
115 docs <- liftBase $ API.get a q (Just 1000)
116 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
117 pure cId'
118
119 pure cId
120 -}
121
122 ------------------------------------------------
123 -- TODO use this route for Client implementation
124 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
125 deriving (Generic)
126 instance Arbitrary ApiInfo where
127 arbitrary = ApiInfo <$> arbitrary
128
129 deriveJSON (unPrefix "") 'ApiInfo
130
131 instance ToSchema ApiInfo
132
133 info :: FlowCmdM env err m => UserId -> m ApiInfo
134 info _u = do
135 ext <- API.externalAPIs
136
137 pure $ ApiInfo ext
138
139 ------------------------------------------------------------------------
140 ------------------------------------------------------------------------
141 data WithQuery = WithQuery
142 { _wq_query :: !Text
143 , _wq_databases :: !Database
144 , _wq_datafield :: !(Maybe Datafield)
145 , _wq_lang :: !Lang
146 , _wq_node_id :: !Int
147 , _wq_flowListWith :: !FlowSocialListWith
148 }
149 deriving Generic
150
151 makeLenses ''WithQuery
152 instance FromJSON WithQuery where
153 parseJSON = genericParseJSON $ jsonOptions "_wq_"
154 instance ToJSON WithQuery where
155 toJSON = genericToJSON $ jsonOptions "_wq_"
156 instance ToSchema WithQuery where
157 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
158
159 ------------------------------------------------------------------------
160
161 type AddWithQuery = Summary "Add with Query to corpus endpoint"
162 :> "corpus"
163 :> Capture "corpus_id" CorpusId
164 :> "query"
165 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
166
167 {-
168 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
169 :> "corpus"
170 :> Capture "corpus_id" CorpusId
171 :> "add"
172 :> "file"
173 :> MultipartForm Mem (MultipartData Mem)
174 :> QueryParam "fileType" FileType
175 :> "async"
176 :> AsyncJobs JobLog '[JSON] () JobLog
177 -}
178
179
180 ------------------------------------------------------------------------
181 -- TODO WithQuery also has a corpus id
182
183
184 addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
185 => User
186 -> CorpusId
187 -> WithQuery
188 -> Maybe Integer
189 -> JobHandle m
190 -> m ()
191 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
192 , _wq_databases = dbs
193 , _wq_datafield = datafield
194 , _wq_lang = l
195 , _wq_flowListWith = flw }) maybeLimit jobHandle = do
196 -- TODO ...
197 -- printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
198 -- printDebug "[addToCorpusWithQuery] datafield" datafield
199 -- printDebug "[addToCorpusWithQuery] flowListWith" flw
200
201 case datafield of
202 Just Web -> do
203 -- printDebug "[addToCorpusWithQuery] processing web request" datafield
204
205 markStarted 1 jobHandle
206
207 _ <- triggerSearxSearch user cid q l jobHandle
208
209 markComplete jobHandle
210
211 _ -> do
212 case datafield of
213 Just (External (PubMed { _api_key })) -> do
214 printDebug "[addToCorpusWithQuery] pubmed api key" _api_key
215 _ <- updateCorpusPubmedAPIKey cid _api_key
216 pure ()
217 _ -> pure ()
218 markStarted 3 jobHandle
219
220 -- TODO add cid
221 -- TODO if cid is folder -> create Corpus
222 -- if cid is corpus -> add to corpus
223 -- if cid is root -> create corpus in Private
224 -- printDebug "[G.A.N.C.New] getDataText with query" q
225 db <- database2origin dbs
226 eTxt <- getDataText db (Multi l) q maybeLimit
227
228 -- printDebug "[G.A.N.C.New] lTxts" lTxts
229 case eTxt of
230 Right txt -> do
231 -- TODO Sum lenghts of each txt elements
232
233 markProgress 1 jobHandle
234
235 void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle
236 -- printDebug "corpus id" cids
237 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
238 sendMail user
239 -- TODO ...
240 markComplete jobHandle
241
242 Left err -> do
243 -- printDebug "Error: " err
244 markFailed (Just $ T.pack (show err)) jobHandle
245
246 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
247 :> "corpus"
248 :> Capture "corpus_id" CorpusId
249 :> "add"
250 :> "form"
251 :> "async"
252 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
253
254 addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
255 => User
256 -> CorpusId
257 -> NewWithForm
258 -> JobHandle m
259 -> m ()
260 addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
261 -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
262 -- printDebug "[addToCorpusWithForm] fileType" ft
263 -- printDebug "[addToCorpusWithForm] fileFormat" ff
264 limit' <- view $ hasConfig . gc_max_docs_parsers
265 let limit = fromIntegral limit' :: Integer
266 let
267 parseC = case ft of
268 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
269 CSV -> Parser.parseFormatC Parser.CsvGargV3
270 WOS -> Parser.parseFormatC Parser.WOS
271 PresseRIS -> Parser.parseFormatC Parser.RisPresse
272 Iramuteq -> Parser.parseFormatC Parser.Iramuteq
273 JSON -> Parser.parseFormatC Parser.JSON
274
275 -- TODO granularity of the logStatus
276 let data' = case ff of
277 Plain -> cs d
278 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
279 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
280 Right decoded -> decoded
281 eDocsC <- liftBase $ parseC ff data'
282 case eDocsC of
283 Right (mCount, docsC) -> do
284 -- TODO Add progress (jobStatus) update for docs - this is a
285 -- long action
286
287 let docsC' = zipSources (yieldMany [1..]) docsC
288 .| mapMC (\(idx, doc) ->
289 if idx > limit then do
290 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
291 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
292 , "exceeds the MAX_DOCS_PARSERS limit ("
293 , show limit
294 , ")" ]
295 let panicMsg = T.concat $ T.pack <$> panicMsg'
296 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
297 panic panicMsg
298 else
299 pure doc)
300 .| mapC toHyperdataDocument
301
302 --printDebug "Parsing corpus finished : " cid
303 --logStatus jobLog2
304
305 --printDebug "Starting extraction : " cid
306 -- TODO granularity of the logStatus
307 -- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
308
309 _cid' <- flowCorpus user
310 (Right [cid])
311 (Multi $ fromMaybe EN l)
312 (Just sel)
313 --(Just $ fromIntegral $ length docs, docsC')
314 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
315 --(map (map toHyperdataDocument) docs)
316 jobHandle
317
318 -- printDebug "Extraction finished : " cid
319 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
320 -- TODO uncomment this
321 --sendMail user
322
323 markComplete jobHandle
324 Left e -> do
325 printDebug "[addToCorpusWithForm] parse error" e
326 markFailed (Just $ T.pack e) jobHandle
327
328 {-
329 addToCorpusWithFile :: FlowCmdM env err m
330 => CorpusId
331 -> MultipartData Mem
332 -> Maybe FileType
333 -> (JobLog -> m ())
334 -> m JobLog
335 addToCorpusWithFile cid input filetype logStatus = do
336 logStatus JobLog { _scst_succeeded = Just 10
337 , _scst_failed = Just 2
338 , _scst_remaining = Just 138
339 , _scst_events = Just []
340 }
341 printDebug "addToCorpusWithFile" cid
342 _h <- postUpload cid filetype input
343
344 pure JobLog { _scst_succeeded = Just 137
345 , _scst_failed = Just 13
346 , _scst_remaining = Just 0
347 , _scst_events = Just []
348 }
349 -}
350
351
352
353 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
354 :> "corpus"
355 :> Capture "corpus_id" CorpusId
356 :> "add"
357 :> "file"
358 :> "async"
359 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
360
361 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
362 => User
363 -> CorpusId
364 -> NewWithFile
365 -> JobHandle m
366 -> m ()
367 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do
368
369 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
370 markStarted 1 jobHandle
371
372 fPath <- GargDB.writeFile nwf
373 printDebug "[addToCorpusWithFile] File saved as: " fPath
374
375 uId <- getUserId user
376 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
377
378 _ <- case nIds of
379 [nId] -> do
380 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
381 let hl = node ^. node_hyperdata
382 _ <- updateHyperdata nId $ hl { _hff_name = fName
383 , _hff_path = T.pack fPath }
384
385 printDebug "[addToCorpusWithFile] Created node with id: " nId
386 _ -> pure ()
387
388 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
389
390 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
391 sendMail user
392
393 markComplete jobHandle