]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
Merge remote-tracking branch 'origin/571-dev-node-corpus-api-search-fixes' into dev...
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Conduit
22 import Control.Lens hiding (elements, Empty)
23 import Control.Monad
24 import Data.Aeson
25 import Data.Aeson.TH (deriveJSON)
26 import qualified Data.ByteString.Base64 as BSB64
27 import Data.Conduit.Internal (zipSources)
28 import Data.Either
29 import Data.Maybe (fromMaybe)
30 import Data.Swagger
31 import Data.Text (Text)
32 import qualified Data.Text as T
33 import GHC.Generics (Generic)
34 import Servant
35 import Servant.Job.Utils (jsonOptions)
36 -- import Servant.Multipart
37 import qualified Data.Text.Encoding as TE
38 -- import Test.QuickCheck (elements)
39 import Test.QuickCheck.Arbitrary
40
41 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
42 import Gargantext.API.Admin.Types (HasSettings)
43 import Gargantext.API.Node.Corpus.New.Types
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import Gargantext.Core.Types.Individu (User(..))
50 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
51 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Mail (sendMail)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Action.User (getUserId)
56 import Gargantext.Database.Admin.Types.Hyperdata
57 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
58 import Gargantext.Database.Prelude (hasConfig)
59 import Gargantext.Database.Query.Table.Node (getNodeWith, updateCorpusPubmedAPIKey)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import Gargantext.Prelude
63 import Gargantext.Prelude.Config (gc_max_docs_parsers)
64 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
65 import qualified Gargantext.Core.Text.Corpus.API as API
66 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
67 import qualified Gargantext.Database.GargDB as GargDB
68
69 ------------------------------------------------------------------------
70 {-
71 data Query = Query { query_query :: Text
72 , query_node_id :: Int
73 , query_lang :: Lang
74 , query_databases :: [DataOrigin]
75 }
76 deriving (Eq, Generic)
77
78 deriveJSON (unPrefix "query_") 'Query
79
80 instance Arbitrary Query where
81 arbitrary = elements [ Query q n la fs
82 | q <- ["honeybee* AND collapse"
83 ,"covid 19"
84 ]
85 , n <- [0..10]
86 , la <- allLangs
87 , fs <- take 3 $ repeat allDataOrigins
88 ]
89
90 instance ToSchema Query where
91 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
92 -}
93
94 ------------------------------------------------------------------------
95
96 {-
97 type Api = PostApi
98 :<|> GetApi
99
100 type PostApi = Summary "New Corpus endpoint"
101 :> ReqBody '[JSON] Query
102 :> Post '[JSON] CorpusId
103 type GetApi = Get '[JSON] ApiInfo
104 -}
105
106 -- | TODO manage several apis
107 -- TODO-ACCESS
108 -- TODO this is only the POST
109 {-
110 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
111 api uid (Query q _ as) = do
112 cId <- case head as of
113 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
114 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
115 Just a -> do
116 docs <- liftBase $ API.get a q (Just 1000)
117 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
118 pure cId'
119
120 pure cId
121 -}
122
123 ------------------------------------------------
124 -- TODO use this route for Client implementation
125 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
126 deriving (Generic)
127 instance Arbitrary ApiInfo where
128 arbitrary = ApiInfo <$> arbitrary
129
130 deriveJSON (unPrefix "") 'ApiInfo
131
132 instance ToSchema ApiInfo
133
134 info :: FlowCmdM env err m => UserId -> m ApiInfo
135 info _u = do
136 ext <- API.externalAPIs
137
138 pure $ ApiInfo ext
139
140 ------------------------------------------------------------------------
141 ------------------------------------------------------------------------
142 data WithQuery = WithQuery
143 { _wq_query :: !API.RawQuery
144 , _wq_databases :: !Database
145 , _wq_datafield :: !(Maybe Datafield)
146 , _wq_lang :: !Lang
147 , _wq_node_id :: !Int
148 , _wq_flowListWith :: !FlowSocialListWith
149 }
150 deriving (Show, Eq, Generic)
151
152 makeLenses ''WithQuery
153 instance FromJSON WithQuery where
154 parseJSON = genericParseJSON $ jsonOptions "_wq_"
155 instance ToJSON WithQuery where
156 toJSON = genericToJSON $ jsonOptions "_wq_"
157 instance ToSchema WithQuery where
158 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
159
160 instance Arbitrary WithQuery where
161 arbitrary = WithQuery <$> arbitrary
162 <*> arbitrary
163 <*> arbitrary
164 <*> arbitrary
165 <*> arbitrary
166 <*> arbitrary
167
168 ------------------------------------------------------------------------
169
170 type AddWithQuery = Summary "Add with Query to corpus endpoint"
171 :> "corpus"
172 :> Capture "corpus_id" CorpusId
173 :> "query"
174 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
175
176 {-
177 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
178 :> "corpus"
179 :> Capture "corpus_id" CorpusId
180 :> "add"
181 :> "file"
182 :> MultipartForm Mem (MultipartData Mem)
183 :> QueryParam "fileType" FileType
184 :> "async"
185 :> AsyncJobs JobLog '[JSON] () JobLog
186 -}
187
188
189 ------------------------------------------------------------------------
190 -- TODO WithQuery also has a corpus id
191
192
193 addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
194 => User
195 -> CorpusId
196 -> WithQuery
197 -> Maybe API.Limit
198 -> JobHandle m
199 -> m ()
200 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
201 , _wq_databases = dbs
202 , _wq_datafield = datafield
203 , _wq_lang = l
204 , _wq_flowListWith = flw }) maybeLimit jobHandle = do
205 -- TODO ...
206 -- printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
207 -- printDebug "[addToCorpusWithQuery] datafield" datafield
208 -- printDebug "[addToCorpusWithQuery] flowListWith" flw
209
210 case datafield of
211 Just Web -> do
212 -- printDebug "[addToCorpusWithQuery] processing web request" datafield
213
214 markStarted 1 jobHandle
215
216 _ <- triggerSearxSearch user cid q l jobHandle
217
218 markComplete jobHandle
219
220 _ -> do
221 case datafield of
222 Just (External (PubMed { _api_key })) -> do
223 printDebug "[addToCorpusWithQuery] pubmed api key" _api_key
224 _ <- updateCorpusPubmedAPIKey cid _api_key
225 pure ()
226 _ -> pure ()
227 markStarted 3 jobHandle
228
229 -- TODO add cid
230 -- TODO if cid is folder -> create Corpus
231 -- if cid is corpus -> add to corpus
232 -- if cid is root -> create corpus in Private
233 -- printDebug "[G.A.N.C.New] getDataText with query" q
234 db <- database2origin dbs
235 eTxt <- getDataText db (Multi l) q maybeLimit
236
237 -- printDebug "[G.A.N.C.New] lTxts" lTxts
238 case eTxt of
239 Right txt -> do
240 -- TODO Sum lenghts of each txt elements
241
242 markProgress 1 jobHandle
243
244 void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle
245 -- printDebug "corpus id" cids
246 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
247 sendMail user
248 -- TODO ...
249 markComplete jobHandle
250
251 Left err -> do
252 -- printDebug "Error: " err
253 markFailed (Just $ T.pack (show err)) jobHandle
254
255 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
256 :> "corpus"
257 :> Capture "corpus_id" CorpusId
258 :> "add"
259 :> "form"
260 :> "async"
261 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
262
263 addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
264 => User
265 -> CorpusId
266 -> NewWithForm
267 -> JobHandle m
268 -> m ()
269 addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
270 -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
271 -- printDebug "[addToCorpusWithForm] fileType" ft
272 -- printDebug "[addToCorpusWithForm] fileFormat" ff
273 limit' <- view $ hasConfig . gc_max_docs_parsers
274 let limit = fromIntegral limit' :: Integer
275 let
276 parseC = case ft of
277 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
278 CSV -> Parser.parseFormatC Parser.CsvGargV3
279 WOS -> Parser.parseFormatC Parser.WOS
280 PresseRIS -> Parser.parseFormatC Parser.RisPresse
281 Iramuteq -> Parser.parseFormatC Parser.Iramuteq
282 JSON -> Parser.parseFormatC Parser.JSON
283
284 -- TODO granularity of the logStatus
285 let data' = case ff of
286 Plain -> cs d
287 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
288 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
289 Right decoded -> decoded
290 eDocsC <- liftBase $ parseC ff data'
291 case eDocsC of
292 Right (mCount, docsC) -> do
293 -- TODO Add progress (jobStatus) update for docs - this is a
294 -- long action
295
296 let docsC' = zipSources (yieldMany [1..]) docsC
297 .| mapMC (\(idx, doc) ->
298 if idx > limit then do
299 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
300 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
301 , "exceeds the MAX_DOCS_PARSERS limit ("
302 , show limit
303 , ")" ]
304 let panicMsg = T.concat $ T.pack <$> panicMsg'
305 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
306 panic panicMsg
307 else
308 pure doc)
309 .| mapC toHyperdataDocument
310
311 --printDebug "Parsing corpus finished : " cid
312 --logStatus jobLog2
313
314 --printDebug "Starting extraction : " cid
315 -- TODO granularity of the logStatus
316 -- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
317
318 _cid' <- flowCorpus user
319 (Right [cid])
320 (Multi $ fromMaybe EN l)
321 (Just sel)
322 --(Just $ fromIntegral $ length docs, docsC')
323 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
324 --(map (map toHyperdataDocument) docs)
325 jobHandle
326
327 -- printDebug "Extraction finished : " cid
328 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
329 -- TODO uncomment this
330 --sendMail user
331
332 markComplete jobHandle
333 Left e -> do
334 printDebug "[addToCorpusWithForm] parse error" e
335 markFailed (Just $ T.pack e) jobHandle
336
337 {-
338 addToCorpusWithFile :: FlowCmdM env err m
339 => CorpusId
340 -> MultipartData Mem
341 -> Maybe FileType
342 -> (JobLog -> m ())
343 -> m JobLog
344 addToCorpusWithFile cid input filetype logStatus = do
345 logStatus JobLog { _scst_succeeded = Just 10
346 , _scst_failed = Just 2
347 , _scst_remaining = Just 138
348 , _scst_events = Just []
349 }
350 printDebug "addToCorpusWithFile" cid
351 _h <- postUpload cid filetype input
352
353 pure JobLog { _scst_succeeded = Just 137
354 , _scst_failed = Just 13
355 , _scst_remaining = Just 0
356 , _scst_events = Just []
357 }
358 -}
359
360
361
362 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
363 :> "corpus"
364 :> Capture "corpus_id" CorpusId
365 :> "add"
366 :> "file"
367 :> "async"
368 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
369
370 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
371 => User
372 -> CorpusId
373 -> NewWithFile
374 -> JobHandle m
375 -> m ()
376 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do
377
378 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
379 markStarted 1 jobHandle
380
381 fPath <- GargDB.writeFile nwf
382 printDebug "[addToCorpusWithFile] File saved as: " fPath
383
384 uId <- getUserId user
385 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
386
387 _ <- case nIds of
388 [nId] -> do
389 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
390 let hl = node ^. node_hyperdata
391 _ <- updateHyperdata nId $ hl { _hff_name = fName
392 , _hff_path = T.pack fPath }
393
394 printDebug "[addToCorpusWithFile] Created node with id: " nId
395 _ -> pure ()
396
397 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
398
399 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
400 sendMail user
401
402 markComplete jobHandle