2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
21 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson.TH (deriveJSON)
24 import qualified Data.ByteString.Base64 as BSB64
26 import Data.Maybe (fromMaybe)
28 import Data.Text (Text)
29 import qualified Data.Text as T
30 import GHC.Generics (Generic)
32 import Servant.Job.Utils (jsonOptions)
33 -- import Servant.Multipart
34 import qualified Data.Text.Encoding as TE
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
38 import Gargantext.Prelude
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
43 import Gargantext.API.Node.Corpus.New.File
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import qualified Gargantext.Core.Text.Corpus.API as API
50 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
51 import Gargantext.Core.Types.Individu (User(..))
52 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
53 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
54 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
55 import Gargantext.Database.Action.Mail (sendMail)
56 import Gargantext.Database.Action.Node (mkNodeWithParent)
57 import Gargantext.Database.Action.User (getUserId)
58 import Gargantext.Database.Admin.Types.Hyperdata
59 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
60 import Gargantext.Database.Prelude (hasConfig)
61 import Gargantext.Database.Query.Table.Node (getNodeWith)
62 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
63 import Gargantext.Database.Schema.Node (node_hyperdata)
64 import qualified Gargantext.Database.GargDB as GargDB
65 import Gargantext.Prelude.Config (gc_max_docs_parsers)
66 ------------------------------------------------------------------------
68 data Query = Query { query_query :: Text
69 , query_node_id :: Int
71 , query_databases :: [DataOrigin]
73 deriving (Eq, Generic)
75 deriveJSON (unPrefix "query_") 'Query
77 instance Arbitrary Query where
78 arbitrary = elements [ Query q n la fs
79 | q <- ["honeybee* AND collapse"
84 , fs <- take 3 $ repeat allDataOrigins
87 instance ToSchema Query where
88 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
91 ------------------------------------------------------------------------
97 type PostApi = Summary "New Corpus endpoint"
98 :> ReqBody '[JSON] Query
99 :> Post '[JSON] CorpusId
100 type GetApi = Get '[JSON] ApiInfo
103 -- | TODO manage several apis
105 -- TODO this is only the POST
107 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
108 api uid (Query q _ as) = do
109 cId <- case head as of
110 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
111 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
113 docs <- liftBase $ API.get a q (Just 1000)
114 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
120 ------------------------------------------------
121 -- TODO use this route for Client implementation
122 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
124 instance Arbitrary ApiInfo where
125 arbitrary = ApiInfo <$> arbitrary
127 deriveJSON (unPrefix "") 'ApiInfo
129 instance ToSchema ApiInfo
131 info :: FlowCmdM env err m => UserId -> m ApiInfo
132 info _u = pure $ ApiInfo API.externalAPIs
134 ------------------------------------------------------------------------
135 ------------------------------------------------------------------------
136 data WithQuery = WithQuery
138 , _wq_databases :: !Database
139 , _wq_datafield :: !(Maybe Datafield)
141 , _wq_node_id :: !Int
142 , _wq_flowListWith :: !FlowSocialListWith
146 makeLenses ''WithQuery
147 instance FromJSON WithQuery where
148 parseJSON = genericParseJSON $ jsonOptions "_wq_"
149 instance ToJSON WithQuery where
150 toJSON = genericToJSON $ jsonOptions "_wq_"
151 instance ToSchema WithQuery where
152 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
154 ------------------------------------------------------------------------
156 type AddWithQuery = Summary "Add with Query to corpus endpoint"
158 :> Capture "corpus_id" CorpusId
160 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
163 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
165 :> Capture "corpus_id" CorpusId
168 :> MultipartForm Mem (MultipartData Mem)
169 :> QueryParam "fileType" FileType
171 :> AsyncJobs JobLog '[JSON] () JobLog
175 ------------------------------------------------------------------------
176 -- TODO WithQuery also has a corpus id
177 addToCorpusWithQuery :: FlowCmdM env err m
184 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
185 , _wq_databases = dbs
186 , _wq_datafield = datafield
188 , _wq_flowListWith = flw }) maybeLimit logStatus = do
190 logStatus JobLog { _scst_succeeded = Just 0
191 , _scst_failed = Just 0
192 , _scst_remaining = Just 3
193 , _scst_events = Just []
195 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
196 printDebug "[addToCorpusWithQuery] datafield" datafield
197 printDebug "[addToCorpusWithQuery] flowListWith" flw
201 printDebug "[addToCorpusWithQuery] processing web request" datafield
203 _ <- triggerSearxSearch user cid q l logStatus
205 pure JobLog { _scst_succeeded = Just 3
206 , _scst_failed = Just 0
207 , _scst_remaining = Just 0
208 , _scst_events = Just []
213 -- TODO if cid is folder -> create Corpus
214 -- if cid is corpus -> add to corpus
215 -- if cid is root -> create corpus in Private
216 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
218 logStatus JobLog { _scst_succeeded = Just 2
219 , _scst_failed = Just 0
220 , _scst_remaining = Just $ 1 + length txts
221 , _scst_events = Just []
224 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing logStatus) txts
225 printDebug "corpus id" cids
226 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
229 pure JobLog { _scst_succeeded = Just 3
230 , _scst_failed = Just 0
231 , _scst_remaining = Just 0
232 , _scst_events = Just []
236 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
238 :> Capture "corpus_id" CorpusId
242 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
244 addToCorpusWithForm :: (FlowCmdM env err m)
251 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
252 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
253 printDebug "[addToCorpusWithForm] fileType" ft
257 CSV_HAL -> Parser.parseFormat Parser.CsvHal
258 CSV -> Parser.parseFormat Parser.CsvGargV3
259 WOS -> Parser.parseFormat Parser.WOS
260 PresseRIS -> Parser.parseFormat Parser.RisPresse
261 ZIP -> Parser.parseFormat Parser.ZIP
263 -- TODO granularity of the logStatus
264 let data' = case ft of
265 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
266 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
267 Right decoded -> decoded
269 eDocs <- liftBase $ parse data'
272 -- TODO Add progress (jobStatus) update for docs - this is a
274 limit' <- view $ hasConfig . gc_max_docs_parsers
275 let limit = fromIntegral limit'
276 if length docs' > limit then do
277 printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs')
278 let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
279 , show $ length docs'
280 , ") exceeds the MAX_DOCS_PARSERS limit ("
283 let panicMsg = T.concat $ T.pack <$> panicMsg'
284 logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
288 let docs = splitEvery 500 $ take limit docs'
290 printDebug "Parsing corpus finished : " cid
293 printDebug "Starting extraction : " cid
294 -- TODO granularity of the logStatus
295 _cid' <- flowCorpus user
297 (Multi $ fromMaybe EN l)
299 (map (map toHyperdataDocument) docs)
302 printDebug "Extraction finished : " cid
303 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
309 printDebug "[addToCorpusWithForm] parse error" e
311 let evt = ScraperEvent { _scev_message = Just $ T.pack e
312 , _scev_level = Just "ERROR"
313 , _scev_date = Nothing }
315 logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
318 jobLog2 = jobLogSuccess jobLog
319 jobLog3 = jobLogSuccess jobLog2
320 jobLogE = jobLogFailTotal jobLog
323 addToCorpusWithFile :: FlowCmdM env err m
329 addToCorpusWithFile cid input filetype logStatus = do
330 logStatus JobLog { _scst_succeeded = Just 10
331 , _scst_failed = Just 2
332 , _scst_remaining = Just 138
333 , _scst_events = Just []
335 printDebug "addToCorpusWithFile" cid
336 _h <- postUpload cid filetype input
338 pure JobLog { _scst_succeeded = Just 137
339 , _scst_failed = Just 13
340 , _scst_remaining = Just 0
341 , _scst_events = Just []
347 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
349 :> Capture "corpus_id" CorpusId
353 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
355 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
361 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
363 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
364 logStatus JobLog { _scst_succeeded = Just 0
365 , _scst_failed = Just 0
366 , _scst_remaining = Just 1
367 , _scst_events = Just []
370 fPath <- GargDB.writeFile nwf
371 printDebug "[addToCorpusWithFile] File saved as: " fPath
373 uId <- getUserId user
374 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
378 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
379 let hl = node ^. node_hyperdata
380 _ <- updateHyperdata nId $ hl { _hff_name = fName
381 , _hff_path = T.pack fPath }
383 printDebug "[addToCorpusWithFile] Created node with id: " nId
386 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
388 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
391 pure $ JobLog { _scst_succeeded = Just 1
392 , _scst_failed = Just 0
393 , _scst_remaining = Just 0
394 , _scst_events = Just []