]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
Merge branch 'dev' into 193-dev-api-query-dev-fix
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Conduit
22 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson
24 import Data.Aeson.TH (deriveJSON)
25 import qualified Data.ByteString.Base64 as BSB64
26 import Data.Conduit.Internal (zipSources)
27 import Data.Either
28 import Data.Maybe (fromMaybe)
29 import Data.Swagger
30 import Data.Text (Text)
31 import qualified Data.Text as T
32 import GHC.Generics (Generic)
33 import Servant
34 import Servant.Job.Utils (jsonOptions)
35 -- import Servant.Multipart
36 import qualified Data.Text.Encoding as TE
37 -- import Test.QuickCheck (elements)
38 import Test.QuickCheck.Arbitrary
39
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Node.Corpus.New.Types
43 import Gargantext.API.Node.Corpus.Searx
44 import Gargantext.API.Node.Corpus.Types
45 import Gargantext.API.Node.Types
46 import Gargantext.Core (Lang(..){-, allLangs-})
47 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
48 import Gargantext.Core.Types.Individu (User(..))
49 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
50 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
51 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
52 import Gargantext.Database.Action.Mail (sendMail)
53 import Gargantext.Database.Action.Node (mkNodeWithParent)
54 import Gargantext.Database.Action.User (getUserId)
55 import Gargantext.Database.Admin.Types.Hyperdata
56 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
57 import Gargantext.Database.Prelude (hasConfig)
58 import Gargantext.Database.Query.Table.Node (getNodeWith)
59 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
60 import Gargantext.Database.Schema.Node (node_hyperdata)
61 import Gargantext.Prelude
62 import Gargantext.Prelude.Config (gc_max_docs_parsers)
63 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
64 import qualified Gargantext.Core.Text.Corpus.API as API
65 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
66 import qualified Gargantext.Database.GargDB as GargDB
67 ------------------------------------------------------------------------
68 {-
69 data Query = Query { query_query :: Text
70 , query_node_id :: Int
71 , query_lang :: Lang
72 , query_databases :: [DataOrigin]
73 }
74 deriving (Eq, Generic)
75
76 deriveJSON (unPrefix "query_") 'Query
77
78 instance Arbitrary Query where
79 arbitrary = elements [ Query q n la fs
80 | q <- ["honeybee* AND collapse"
81 ,"covid 19"
82 ]
83 , n <- [0..10]
84 , la <- allLangs
85 , fs <- take 3 $ repeat allDataOrigins
86 ]
87
88 instance ToSchema Query where
89 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
90 -}
91
92 ------------------------------------------------------------------------
93
94 {-
95 type Api = PostApi
96 :<|> GetApi
97
98 type PostApi = Summary "New Corpus endpoint"
99 :> ReqBody '[JSON] Query
100 :> Post '[JSON] CorpusId
101 type GetApi = Get '[JSON] ApiInfo
102 -}
103
104 -- | TODO manage several apis
105 -- TODO-ACCESS
106 -- TODO this is only the POST
107 {-
108 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
109 api uid (Query q _ as) = do
110 cId <- case head as of
111 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
112 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
113 Just a -> do
114 docs <- liftBase $ API.get a q (Just 1000)
115 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
116 pure cId'
117
118 pure cId
119 -}
120
121 ------------------------------------------------
122 -- TODO use this route for Client implementation
123 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
124 deriving (Generic)
125 instance Arbitrary ApiInfo where
126 arbitrary = ApiInfo <$> arbitrary
127
128 deriveJSON (unPrefix "") 'ApiInfo
129
130 instance ToSchema ApiInfo
131
132 info :: FlowCmdM env err m => UserId -> m ApiInfo
133 info _u = do
134 ext <- API.externalAPIs
135
136 pure $ ApiInfo ext
137
138 ------------------------------------------------------------------------
139 ------------------------------------------------------------------------
140 data WithQuery = WithQuery
141 { _wq_query :: !Text
142 , _wq_databases :: !Database
143 , _wq_datafield :: !(Maybe Datafield)
144 , _wq_lang :: !Lang
145 , _wq_node_id :: !Int
146 , _wq_flowListWith :: !FlowSocialListWith
147 }
148 deriving Generic
149
150 makeLenses ''WithQuery
151 instance FromJSON WithQuery where
152 parseJSON = genericParseJSON $ jsonOptions "_wq_"
153 instance ToJSON WithQuery where
154 toJSON = genericToJSON $ jsonOptions "_wq_"
155 instance ToSchema WithQuery where
156 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
157
158 ------------------------------------------------------------------------
159
160 type AddWithQuery = Summary "Add with Query to corpus endpoint"
161 :> "corpus"
162 :> Capture "corpus_id" CorpusId
163 :> "query"
164 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
165
166 {-
167 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
168 :> "corpus"
169 :> Capture "corpus_id" CorpusId
170 :> "add"
171 :> "file"
172 :> MultipartForm Mem (MultipartData Mem)
173 :> QueryParam "fileType" FileType
174 :> "async"
175 :> AsyncJobs JobLog '[JSON] () JobLog
176 -}
177
178
179 ------------------------------------------------------------------------
180 -- TODO WithQuery also has a corpus id
181
182
183 addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
184 => User
185 -> CorpusId
186 -> WithQuery
187 -> Maybe Integer
188 -> JobHandle m
189 -> m ()
190 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
191 , _wq_databases = dbs
192 , _wq_datafield = datafield
193 , _wq_lang = l
194 , _wq_flowListWith = flw }) maybeLimit jobHandle = do
195 -- TODO ...
196 markStarted 3 jobHandle
197 -- printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
198 -- printDebug "[addToCorpusWithQuery] datafield" datafield
199 -- printDebug "[addToCorpusWithQuery] flowListWith" flw
200
201 case datafield of
202 Just Web -> do
203 -- printDebug "[addToCorpusWithQuery] processing web request" datafield
204
205 _ <- triggerSearxSearch user cid q l jobHandle
206
207 markComplete jobHandle
208
209 _ -> do
210 -- TODO add cid
211 -- TODO if cid is folder -> create Corpus
212 -- if cid is corpus -> add to corpus
213 -- if cid is root -> create corpus in Private
214 -- printDebug "[G.A.N.C.New] getDataText with query" q
215 databaseOrigin <- database2origin dbs
216 eTxts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [databaseOrigin]
217
218 let lTxts = lefts eTxts
219 -- printDebug "[G.A.N.C.New] lTxts" lTxts
220 case lTxts of
221 [] -> do
222 let txts = rights eTxts
223 -- TODO Sum lenghts of each txt elements
224
225 -- NOTE(adinapoli) Some other weird arithmetic to have the
226 -- following 'JobLog' as output:
227 -- JobLog
228 -- { _scst_succeeded = Just 2
229 -- , _scst_failed = Just 0
230 -- , _scst_remaining = Just $ 1 + length txts
231 -- , _scst_events = Just []
232 -- }
233
234 markStarted (3 + length txts) jobHandle
235 markProgress 2 jobHandle
236
237 _cids <- mapM (\txt -> do
238 flowDataText user txt (Multi l) cid (Just flw) jobHandle) txts
239 -- printDebug "corpus id" cids
240 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
241 sendMail user
242 -- TODO ...
243 markComplete jobHandle
244
245 (err:_) -> do
246 -- printDebug "Error: " err
247 markFailure 1 (Just $ T.pack (show err)) jobHandle
248
249 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
250 :> "corpus"
251 :> Capture "corpus_id" CorpusId
252 :> "add"
253 :> "form"
254 :> "async"
255 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
256
257 addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
258 => User
259 -> CorpusId
260 -> NewWithForm
261 -> JobHandle m
262 -> m ()
263 addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
264 -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
265 -- printDebug "[addToCorpusWithForm] fileType" ft
266 -- printDebug "[addToCorpusWithForm] fileFormat" ff
267 limit' <- view $ hasConfig . gc_max_docs_parsers
268 let limit = fromIntegral limit' :: Integer
269 let
270 parseC = case ft of
271 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
272 CSV -> Parser.parseFormatC Parser.CsvGargV3
273 WOS -> Parser.parseFormatC Parser.WOS
274 PresseRIS -> Parser.parseFormatC Parser.RisPresse
275
276 -- TODO granularity of the logStatus
277 let data' = case ff of
278 Plain -> cs d
279 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
280 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
281 Right decoded -> decoded
282 eDocsC <- liftBase $ parseC ff data'
283 case eDocsC of
284 Right (mCount, docsC) -> do
285 -- TODO Add progress (jobStatus) update for docs - this is a
286 -- long action
287
288 let docsC' = zipSources (yieldMany [1..]) docsC
289 .| mapMC (\(idx, doc) ->
290 if idx > limit then do
291 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
292 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
293 , "exceeds the MAX_DOCS_PARSERS limit ("
294 , show limit
295 , ")" ]
296 let panicMsg = T.concat $ T.pack <$> panicMsg'
297 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
298 panic panicMsg
299 else
300 pure doc)
301 .| mapC toHyperdataDocument
302
303 --printDebug "Parsing corpus finished : " cid
304 --logStatus jobLog2
305
306 --printDebug "Starting extraction : " cid
307 -- TODO granularity of the logStatus
308 -- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
309
310 _cid' <- flowCorpus user
311 (Right [cid])
312 (Multi $ fromMaybe EN l)
313 (Just sel)
314 --(Just $ fromIntegral $ length docs, docsC')
315 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
316 --(map (map toHyperdataDocument) docs)
317 jobHandle
318
319 -- printDebug "Extraction finished : " cid
320 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
321 -- TODO uncomment this
322 --sendMail user
323
324 markComplete jobHandle
325 Left e -> do
326 printDebug "[addToCorpusWithForm] parse error" e
327 markFailed (Just $ T.pack e) jobHandle
328
329 {-
330 addToCorpusWithFile :: FlowCmdM env err m
331 => CorpusId
332 -> MultipartData Mem
333 -> Maybe FileType
334 -> (JobLog -> m ())
335 -> m JobLog
336 addToCorpusWithFile cid input filetype logStatus = do
337 logStatus JobLog { _scst_succeeded = Just 10
338 , _scst_failed = Just 2
339 , _scst_remaining = Just 138
340 , _scst_events = Just []
341 }
342 printDebug "addToCorpusWithFile" cid
343 _h <- postUpload cid filetype input
344
345 pure JobLog { _scst_succeeded = Just 137
346 , _scst_failed = Just 13
347 , _scst_remaining = Just 0
348 , _scst_events = Just []
349 }
350 -}
351
352
353
354 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
355 :> "corpus"
356 :> Capture "corpus_id" CorpusId
357 :> "add"
358 :> "file"
359 :> "async"
360 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
361
362 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
363 => User
364 -> CorpusId
365 -> NewWithFile
366 -> JobHandle m
367 -> m ()
368 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do
369
370 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
371 markStarted 1 jobHandle
372
373 fPath <- GargDB.writeFile nwf
374 printDebug "[addToCorpusWithFile] File saved as: " fPath
375
376 uId <- getUserId user
377 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
378
379 _ <- case nIds of
380 [nId] -> do
381 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
382 let hl = node ^. node_hyperdata
383 _ <- updateHyperdata nId $ hl { _hff_name = fName
384 , _hff_path = T.pack fPath }
385
386 printDebug "[addToCorpusWithFile] Created node with id: " nId
387 _ -> pure ()
388
389 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
390
391 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
392 sendMail user
393
394 markComplete jobHandle