2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
22 import Control.Lens hiding (elements, Empty)
24 import Data.Aeson.TH (deriveJSON)
25 import qualified Data.ByteString.Base64 as BSB64
26 import Data.Conduit.Internal (zipSources)
28 import Data.Maybe (fromMaybe)
30 import Data.Text (Text)
31 import qualified Data.Text as T
32 import GHC.Generics (Generic)
34 import Servant.Job.Utils (jsonOptions)
35 -- import Servant.Multipart
36 import qualified Data.Text.Encoding as TE
37 -- import Test.QuickCheck (elements)
38 import Test.QuickCheck.Arbitrary
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal)
43 import Gargantext.API.Node.Corpus.New.Types
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import Gargantext.Core.Types.Individu (User(..))
50 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
51 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Mail (sendMail)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Action.User (getUserId)
56 import Gargantext.Database.Admin.Types.Hyperdata
57 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
58 import Gargantext.Database.Prelude (hasConfig)
59 import Gargantext.Database.Query.Table.Node (getNodeWith)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import Gargantext.Prelude
63 import Gargantext.Prelude.Config (gc_max_docs_parsers)
64 import qualified Gargantext.Core.Text.Corpus.API as API
65 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
66 import qualified Gargantext.Database.GargDB as GargDB
67 ------------------------------------------------------------------------
69 data Query = Query { query_query :: Text
70 , query_node_id :: Int
72 , query_databases :: [DataOrigin]
74 deriving (Eq, Generic)
76 deriveJSON (unPrefix "query_") 'Query
78 instance Arbitrary Query where
79 arbitrary = elements [ Query q n la fs
80 | q <- ["honeybee* AND collapse"
85 , fs <- take 3 $ repeat allDataOrigins
88 instance ToSchema Query where
89 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
92 ------------------------------------------------------------------------
98 type PostApi = Summary "New Corpus endpoint"
99 :> ReqBody '[JSON] Query
100 :> Post '[JSON] CorpusId
101 type GetApi = Get '[JSON] ApiInfo
104 -- | TODO manage several apis
106 -- TODO this is only the POST
108 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
109 api uid (Query q _ as) = do
110 cId <- case head as of
111 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
112 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
114 docs <- liftBase $ API.get a q (Just 1000)
115 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
121 ------------------------------------------------
122 -- TODO use this route for Client implementation
123 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
125 instance Arbitrary ApiInfo where
126 arbitrary = ApiInfo <$> arbitrary
128 deriveJSON (unPrefix "") 'ApiInfo
130 instance ToSchema ApiInfo
132 info :: FlowCmdM env err m => UserId -> m ApiInfo
133 info _u = pure $ ApiInfo API.externalAPIs
135 ------------------------------------------------------------------------
136 ------------------------------------------------------------------------
137 data WithQuery = WithQuery
139 , _wq_databases :: !Database
140 , _wq_datafield :: !(Maybe Datafield)
142 , _wq_node_id :: !Int
143 , _wq_flowListWith :: !FlowSocialListWith
147 makeLenses ''WithQuery
148 instance FromJSON WithQuery where
149 parseJSON = genericParseJSON $ jsonOptions "_wq_"
150 instance ToJSON WithQuery where
151 toJSON = genericToJSON $ jsonOptions "_wq_"
152 instance ToSchema WithQuery where
153 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
155 ------------------------------------------------------------------------
157 type AddWithQuery = Summary "Add with Query to corpus endpoint"
159 :> Capture "corpus_id" CorpusId
161 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
164 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
166 :> Capture "corpus_id" CorpusId
169 :> MultipartForm Mem (MultipartData Mem)
170 :> QueryParam "fileType" FileType
172 :> AsyncJobs JobLog '[JSON] () JobLog
176 ------------------------------------------------------------------------
177 -- TODO WithQuery also has a corpus id
180 addToCorpusWithQuery :: FlowCmdM env err m
187 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
188 , _wq_databases = dbs
189 , _wq_datafield = datafield
191 , _wq_flowListWith = flw }) maybeLimit logStatus = do
193 logStatus JobLog { _scst_succeeded = Just 0
194 , _scst_failed = Just 0
195 , _scst_remaining = Just 3
196 , _scst_events = Just []
198 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
199 printDebug "[addToCorpusWithQuery] datafield" datafield
200 printDebug "[addToCorpusWithQuery] flowListWith" flw
204 printDebug "[addToCorpusWithQuery] processing web request" datafield
206 _ <- triggerSearxSearch user cid q l logStatus
208 pure JobLog { _scst_succeeded = Just 3
209 , _scst_failed = Just 0
210 , _scst_remaining = Just 0
211 , _scst_events = Just []
216 -- TODO if cid is folder -> create Corpus
217 -- if cid is corpus -> add to corpus
218 -- if cid is root -> create corpus in Private
219 printDebug "[G.A.N.C.New] getDataText with query" q
220 eTxts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
222 let lTxts = lefts eTxts
223 printDebug "[G.A.N.C.New] lTxts" lTxts
226 let txts = rights eTxts
227 -- TODO Sum lenghts of each txt elements
228 logStatus $ JobLog { _scst_succeeded = Just 2
229 , _scst_failed = Just 0
230 , _scst_remaining = Just $ 1 + length txts
231 , _scst_events = Just []
234 cids <- mapM (\txt -> do
235 flowDataText user txt (Multi l) cid Nothing logStatus) txts
236 printDebug "corpus id" cids
237 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
240 pure JobLog { _scst_succeeded = Just 3
241 , _scst_failed = Just 0
242 , _scst_remaining = Just 0
243 , _scst_events = Just []
247 printDebug "Error: " err
248 let jl = addEvent "ERROR" (T.pack $ show err) $
249 JobLog { _scst_succeeded = Just 2
250 , _scst_failed = Just 1
251 , _scst_remaining = Just 0
252 , _scst_events = Just []
258 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
260 :> Capture "corpus_id" CorpusId
264 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
266 addToCorpusWithForm :: (FlowCmdM env err m)
273 addToCorpusWithForm user cid (NewWithForm ft ff d l _n) logStatus jobLog = do
274 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
275 printDebug "[addToCorpusWithForm] fileType" ft
276 printDebug "[addToCorpusWithForm] fileFormat" ff
278 limit' <- view $ hasConfig . gc_max_docs_parsers
279 let limit = fromIntegral limit' :: Integer
282 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
283 CSV -> Parser.parseFormatC Parser.CsvGargV3
284 WOS -> Parser.parseFormatC Parser.WOS
285 PresseRIS -> Parser.parseFormatC Parser.RisPresse
287 -- TODO granularity of the logStatus
288 let data' = case ff of
290 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
291 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
292 Right decoded -> decoded
293 eDocsC <- liftBase $ parseC ff data'
295 Right (mCount, docsC) -> do
296 -- TODO Add progress (jobStatus) update for docs - this is a
299 let docsC' = zipSources (yieldMany [1..]) docsC
300 .| mapMC (\(idx, doc) ->
301 if idx > limit then do
302 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
303 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
304 , "exceeds the MAX_DOCS_PARSERS limit ("
307 let panicMsg = T.concat $ T.pack <$> panicMsg'
308 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
312 .| mapC toHyperdataDocument
314 --printDebug "Parsing corpus finished : " cid
317 --printDebug "Starting extraction : " cid
318 -- TODO granularity of the logStatus
319 printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
321 _cid' <- flowCorpus user
323 (Multi $ fromMaybe EN l)
325 --(Just $ fromIntegral $ length docs, docsC')
326 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
327 --(map (map toHyperdataDocument) docs)
330 printDebug "Extraction finished : " cid
331 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
332 -- TODO uncomment this
338 printDebug "[addToCorpusWithForm] parse error" e
340 let evt = ScraperEvent { _scev_message = Just $ T.pack e
341 , _scev_level = Just "ERROR"
342 , _scev_date = Nothing }
344 logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
347 jobLog2 = jobLogSuccess jobLog
348 jobLog3 = jobLogSuccess jobLog2
349 jobLogE = jobLogFailTotal jobLog
352 addToCorpusWithFile :: FlowCmdM env err m
358 addToCorpusWithFile cid input filetype logStatus = do
359 logStatus JobLog { _scst_succeeded = Just 10
360 , _scst_failed = Just 2
361 , _scst_remaining = Just 138
362 , _scst_events = Just []
364 printDebug "addToCorpusWithFile" cid
365 _h <- postUpload cid filetype input
367 pure JobLog { _scst_succeeded = Just 137
368 , _scst_failed = Just 13
369 , _scst_remaining = Just 0
370 , _scst_events = Just []
376 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
378 :> Capture "corpus_id" CorpusId
382 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
384 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
390 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
392 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
393 logStatus JobLog { _scst_succeeded = Just 0
394 , _scst_failed = Just 0
395 , _scst_remaining = Just 1
396 , _scst_events = Just []
399 fPath <- GargDB.writeFile nwf
400 printDebug "[addToCorpusWithFile] File saved as: " fPath
402 uId <- getUserId user
403 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
407 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
408 let hl = node ^. node_hyperdata
409 _ <- updateHyperdata nId $ hl { _hff_name = fName
410 , _hff_path = T.pack fPath }
412 printDebug "[addToCorpusWithFile] Created node with id: " nId
415 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
417 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
420 pure $ JobLog { _scst_succeeded = Just 1
421 , _scst_failed = Just 0
422 , _scst_remaining = Just 0
423 , _scst_events = Just []