2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
22 import Control.Lens hiding (elements, Empty)
24 import Data.Aeson.TH (deriveJSON)
25 import qualified Data.ByteString.Base64 as BSB64
27 import Data.Maybe (fromMaybe)
29 import Data.Text (Text)
30 import qualified Data.Text as T
31 import GHC.Generics (Generic)
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 import qualified Data.Text.Encoding as TE
36 -- import Test.QuickCheck (elements)
37 import Test.QuickCheck.Arbitrary
39 import Gargantext.Prelude
41 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
42 import Gargantext.API.Admin.Types (HasSettings)
43 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
44 import Gargantext.API.Node.Corpus.New.File
45 import Gargantext.API.Node.Corpus.Searx
46 import Gargantext.API.Node.Corpus.Types
47 import Gargantext.API.Node.Types
48 import Gargantext.Core (Lang(..){-, allLangs-})
49 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
50 import qualified Gargantext.Core.Text.Corpus.API as API
51 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
52 import Gargantext.Core.Types.Individu (User(..))
53 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
54 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
55 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
56 import Gargantext.Database.Action.Mail (sendMail)
57 import Gargantext.Database.Action.Node (mkNodeWithParent)
58 import Gargantext.Database.Action.User (getUserId)
59 import Gargantext.Database.Admin.Types.Hyperdata
60 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
61 import Gargantext.Database.Prelude (hasConfig)
62 import Gargantext.Database.Query.Table.Node (getNodeWith)
63 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
64 import Gargantext.Database.Schema.Node (node_hyperdata)
65 import qualified Gargantext.Database.GargDB as GargDB
66 import Gargantext.Prelude.Config (gc_max_docs_parsers)
67 ------------------------------------------------------------------------
69 data Query = Query { query_query :: Text
70 , query_node_id :: Int
72 , query_databases :: [DataOrigin]
74 deriving (Eq, Generic)
76 deriveJSON (unPrefix "query_") 'Query
78 instance Arbitrary Query where
79 arbitrary = elements [ Query q n la fs
80 | q <- ["honeybee* AND collapse"
85 , fs <- take 3 $ repeat allDataOrigins
88 instance ToSchema Query where
89 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
92 ------------------------------------------------------------------------
98 type PostApi = Summary "New Corpus endpoint"
99 :> ReqBody '[JSON] Query
100 :> Post '[JSON] CorpusId
101 type GetApi = Get '[JSON] ApiInfo
104 -- | TODO manage several apis
106 -- TODO this is only the POST
108 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
109 api uid (Query q _ as) = do
110 cId <- case head as of
111 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
112 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
114 docs <- liftBase $ API.get a q (Just 1000)
115 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
121 ------------------------------------------------
122 -- TODO use this route for Client implementation
123 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
125 instance Arbitrary ApiInfo where
126 arbitrary = ApiInfo <$> arbitrary
128 deriveJSON (unPrefix "") 'ApiInfo
130 instance ToSchema ApiInfo
132 info :: FlowCmdM env err m => UserId -> m ApiInfo
133 info _u = pure $ ApiInfo API.externalAPIs
135 ------------------------------------------------------------------------
136 ------------------------------------------------------------------------
137 data WithQuery = WithQuery
139 , _wq_databases :: !Database
140 , _wq_datafield :: !(Maybe Datafield)
142 , _wq_node_id :: !Int
143 , _wq_flowListWith :: !FlowSocialListWith
147 makeLenses ''WithQuery
148 instance FromJSON WithQuery where
149 parseJSON = genericParseJSON $ jsonOptions "_wq_"
150 instance ToJSON WithQuery where
151 toJSON = genericToJSON $ jsonOptions "_wq_"
152 instance ToSchema WithQuery where
153 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
155 ------------------------------------------------------------------------
157 type AddWithQuery = Summary "Add with Query to corpus endpoint"
159 :> Capture "corpus_id" CorpusId
161 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
164 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
166 :> Capture "corpus_id" CorpusId
169 :> MultipartForm Mem (MultipartData Mem)
170 :> QueryParam "fileType" FileType
172 :> AsyncJobs JobLog '[JSON] () JobLog
176 ------------------------------------------------------------------------
177 -- TODO WithQuery also has a corpus id
178 addToCorpusWithQuery :: FlowCmdM env err m
185 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
186 , _wq_databases = dbs
187 , _wq_datafield = datafield
189 , _wq_flowListWith = flw }) maybeLimit logStatus = do
191 logStatus JobLog { _scst_succeeded = Just 0
192 , _scst_failed = Just 0
193 , _scst_remaining = Just 3
194 , _scst_events = Just []
196 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
197 printDebug "[addToCorpusWithQuery] datafield" datafield
198 printDebug "[addToCorpusWithQuery] flowListWith" flw
202 printDebug "[addToCorpusWithQuery] processing web request" datafield
204 _ <- triggerSearxSearch user cid q l logStatus
206 pure JobLog { _scst_succeeded = Just 3
207 , _scst_failed = Just 0
208 , _scst_remaining = Just 0
209 , _scst_events = Just []
214 -- TODO if cid is folder -> create Corpus
215 -- if cid is corpus -> add to corpus
216 -- if cid is root -> create corpus in Private
217 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
219 -- TODO Sum lenghts of each txt elements
220 logStatus JobLog { _scst_succeeded = Just 2
221 , _scst_failed = Just 0
222 , _scst_remaining = Just $ 1 + length txts
223 , _scst_events = Just []
226 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing logStatus) txts
227 printDebug "corpus id" cids
228 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
231 pure JobLog { _scst_succeeded = Just 3
232 , _scst_failed = Just 0
233 , _scst_remaining = Just 0
234 , _scst_events = Just []
238 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
240 :> Capture "corpus_id" CorpusId
244 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
246 addToCorpusWithForm :: (FlowCmdM env err m)
253 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
254 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
255 printDebug "[addToCorpusWithForm] fileType" ft
259 CSV_HAL -> Parser.parseFormat Parser.CsvHal
260 CSV -> Parser.parseFormat Parser.CsvGargV3
261 WOS -> Parser.parseFormat Parser.WOS
262 PresseRIS -> Parser.parseFormat Parser.RisPresse
263 ZIP -> Parser.parseFormat Parser.ZIP
265 -- TODO granularity of the logStatus
266 let data' = case ft of
267 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
268 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
269 Right decoded -> decoded
271 eDocs <- liftBase $ parse data'
274 -- TODO Add progress (jobStatus) update for docs - this is a
276 limit' <- view $ hasConfig . gc_max_docs_parsers
277 let limit = fromIntegral limit'
278 if length docs' > limit then do
279 printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs')
280 let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
281 , show $ length docs'
282 , ") exceeds the MAX_DOCS_PARSERS limit ("
285 let panicMsg = T.concat $ T.pack <$> panicMsg'
286 logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
290 let docs = splitEvery 500 $ take limit docs'
292 printDebug "Parsing corpus finished : " cid
295 printDebug "Starting extraction : " cid
296 -- TODO granularity of the logStatus
297 _cid' <- flowCorpus user
299 (Multi $ fromMaybe EN l)
301 (map (map toHyperdataDocument) docs)
304 printDebug "Extraction finished : " cid
305 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
311 printDebug "[addToCorpusWithForm] parse error" e
313 let evt = ScraperEvent { _scev_message = Just $ T.pack e
314 , _scev_level = Just "ERROR"
315 , _scev_date = Nothing }
317 logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
320 jobLog2 = jobLogSuccess jobLog
321 jobLog3 = jobLogSuccess jobLog2
322 jobLogE = jobLogFailTotal jobLog
325 addToCorpusWithFile :: FlowCmdM env err m
331 addToCorpusWithFile cid input filetype logStatus = do
332 logStatus JobLog { _scst_succeeded = Just 10
333 , _scst_failed = Just 2
334 , _scst_remaining = Just 138
335 , _scst_events = Just []
337 printDebug "addToCorpusWithFile" cid
338 _h <- postUpload cid filetype input
340 pure JobLog { _scst_succeeded = Just 137
341 , _scst_failed = Just 13
342 , _scst_remaining = Just 0
343 , _scst_events = Just []
349 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
351 :> Capture "corpus_id" CorpusId
355 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
357 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
363 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
365 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
366 logStatus JobLog { _scst_succeeded = Just 0
367 , _scst_failed = Just 0
368 , _scst_remaining = Just 1
369 , _scst_events = Just []
372 fPath <- GargDB.writeFile nwf
373 printDebug "[addToCorpusWithFile] File saved as: " fPath
375 uId <- getUserId user
376 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
380 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
381 let hl = node ^. node_hyperdata
382 _ <- updateHyperdata nId $ hl { _hff_name = fName
383 , _hff_path = T.pack fPath }
385 printDebug "[addToCorpusWithFile] Created node with id: " nId
388 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
390 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
393 pure $ JobLog { _scst_succeeded = Just 1
394 , _scst_failed = Just 0
395 , _scst_remaining = Just 0
396 , _scst_events = Just []