2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
21 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson.TH (deriveJSON)
24 import qualified Data.ByteString.Base64 as BSB64
26 import Data.Maybe (fromMaybe)
28 import Data.Text (Text)
29 import qualified Data.Text as T
30 import GHC.Generics (Generic)
32 import Servant.Job.Utils (jsonOptions)
33 -- import Servant.Multipart
34 import qualified Data.Text.Encoding as TE
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
38 import Gargantext.Prelude
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
43 import Gargantext.API.Node.Corpus.New.File
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import qualified Gargantext.Core.Text.Corpus.API as API
50 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
51 import Gargantext.Core.Types.Individu (User(..))
52 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
53 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
54 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
55 import Gargantext.Database.Action.Mail (sendMail)
56 import Gargantext.Database.Action.Node (mkNodeWithParent)
57 import Gargantext.Database.Action.User (getUserId)
58 import Gargantext.Database.Admin.Types.Hyperdata
59 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
60 import Gargantext.Database.Prelude (hasConfig)
61 import Gargantext.Database.Query.Table.Node (getNodeWith)
62 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
63 import Gargantext.Database.Schema.Node (node_hyperdata)
64 import qualified Gargantext.Database.GargDB as GargDB
65 import Gargantext.Prelude.Config (gc_max_docs_parsers)
66 ------------------------------------------------------------------------
68 data Query = Query { query_query :: Text
69 , query_node_id :: Int
71 , query_databases :: [DataOrigin]
73 deriving (Eq, Generic)
75 deriveJSON (unPrefix "query_") 'Query
77 instance Arbitrary Query where
78 arbitrary = elements [ Query q n la fs
79 | q <- ["honeybee* AND collapse"
84 , fs <- take 3 $ repeat allDataOrigins
87 instance ToSchema Query where
88 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
91 ------------------------------------------------------------------------
97 type PostApi = Summary "New Corpus endpoint"
98 :> ReqBody '[JSON] Query
99 :> Post '[JSON] CorpusId
100 type GetApi = Get '[JSON] ApiInfo
103 -- | TODO manage several apis
105 -- TODO this is only the POST
107 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
108 api uid (Query q _ as) = do
109 cId <- case head as of
110 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
111 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
113 docs <- liftBase $ API.get a q (Just 1000)
114 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
120 ------------------------------------------------
121 -- TODO use this route for Client implementation
122 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
124 instance Arbitrary ApiInfo where
125 arbitrary = ApiInfo <$> arbitrary
127 deriveJSON (unPrefix "") 'ApiInfo
129 instance ToSchema ApiInfo
131 info :: FlowCmdM env err m => UserId -> m ApiInfo
132 info _u = pure $ ApiInfo API.externalAPIs
134 ------------------------------------------------------------------------
135 ------------------------------------------------------------------------
136 data WithQuery = WithQuery
138 , _wq_databases :: !Database
139 , _wq_datafield :: !(Maybe Datafield)
141 , _wq_node_id :: !Int
142 , _wq_flowListWith :: !FlowSocialListWith
146 makeLenses ''WithQuery
147 instance FromJSON WithQuery where
148 parseJSON = genericParseJSON $ jsonOptions "_wq_"
149 instance ToJSON WithQuery where
150 toJSON = genericToJSON $ jsonOptions "_wq_"
151 instance ToSchema WithQuery where
152 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
154 ------------------------------------------------------------------------
156 type AddWithQuery = Summary "Add with Query to corpus endpoint"
158 :> Capture "corpus_id" CorpusId
160 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
163 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
165 :> Capture "corpus_id" CorpusId
168 :> MultipartForm Mem (MultipartData Mem)
169 :> QueryParam "fileType" FileType
171 :> AsyncJobs JobLog '[JSON] () JobLog
175 ------------------------------------------------------------------------
176 -- TODO WithQuery also has a corpus id
177 addToCorpusWithQuery :: FlowCmdM env err m
184 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
185 , _wq_databases = dbs
186 , _wq_datafield = datafield
188 , _wq_flowListWith = flw }) maybeLimit logStatus = do
190 logStatus JobLog { _scst_succeeded = Just 0
191 , _scst_failed = Just 0
192 , _scst_remaining = Just 3
193 , _scst_events = Just []
195 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
196 printDebug "[addToCorpusWithQuery] datafield" datafield
197 printDebug "[addToCorpusWithQuery] flowListWith" flw
201 printDebug "[addToCorpusWithQuery] processing web request" datafield
203 _ <- triggerSearxSearch cid q l
205 pure JobLog { _scst_succeeded = Just 3
206 , _scst_failed = Just 0
207 , _scst_remaining = Just 0
208 , _scst_events = Just []
213 -- TODO if cid is folder -> create Corpus
214 -- if cid is corpus -> add to corpus
215 -- if cid is root -> create corpus in Private
216 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
218 logStatus JobLog { _scst_succeeded = Just 2
219 , _scst_failed = Just 0
220 , _scst_remaining = Just 1
221 , _scst_events = Just []
224 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing) txts
225 printDebug "corpus id" cids
226 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
229 pure JobLog { _scst_succeeded = Just 3
230 , _scst_failed = Just 0
231 , _scst_remaining = Just 0
232 , _scst_events = Just []
236 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
238 :> Capture "corpus_id" CorpusId
242 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
244 addToCorpusWithForm :: (FlowCmdM env err m)
251 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
252 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
253 printDebug "[addToCorpusWithForm] fileType" ft
257 CSV_HAL -> Parser.parseFormat Parser.CsvHal
258 CSV -> Parser.parseFormat Parser.CsvGargV3
259 WOS -> Parser.parseFormat Parser.WOS
260 PresseRIS -> Parser.parseFormat Parser.RisPresse
261 ZIP -> Parser.parseFormat Parser.ZIP
263 -- TODO granularity of the logStatus
264 let data' = case ft of
265 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
266 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
267 Right decoded -> decoded
269 eDocs <- liftBase $ parse data'
272 -- TODO Add progress (jobStatus) update for docs - this is a
274 limit' <- view $ hasConfig . gc_max_docs_parsers
275 let limit = fromIntegral limit'
276 if length docs' > limit then do
277 printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs')
278 let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
279 , show $ length docs'
280 , ") exceeds the MAX_DOCS_PARSERS limit ("
283 let panicMsg = T.concat $ T.pack <$> panicMsg'
284 logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
288 let docs = splitEvery 500 $ take limit docs'
290 printDebug "Parsing corpus finished : " cid
293 printDebug "Starting extraction : " cid
294 -- TODO granularity of the logStatus
295 _cid' <- flowCorpus user
297 (Multi $ fromMaybe EN l)
299 (map (map toHyperdataDocument) docs)
301 printDebug "Extraction finished : " cid
302 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
308 printDebug "[addToCorpusWithForm] parse error" e
310 let evt = ScraperEvent { _scev_message = Just $ T.pack e
311 , _scev_level = Just "ERROR"
312 , _scev_date = Nothing }
314 logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
317 jobLog2 = jobLogSuccess jobLog
318 jobLog3 = jobLogSuccess jobLog2
319 jobLogE = jobLogFailTotal jobLog
322 addToCorpusWithFile :: FlowCmdM env err m
328 addToCorpusWithFile cid input filetype logStatus = do
329 logStatus JobLog { _scst_succeeded = Just 10
330 , _scst_failed = Just 2
331 , _scst_remaining = Just 138
332 , _scst_events = Just []
334 printDebug "addToCorpusWithFile" cid
335 _h <- postUpload cid filetype input
337 pure JobLog { _scst_succeeded = Just 137
338 , _scst_failed = Just 13
339 , _scst_remaining = Just 0
340 , _scst_events = Just []
346 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
348 :> Capture "corpus_id" CorpusId
352 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
354 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
360 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
362 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
363 logStatus JobLog { _scst_succeeded = Just 0
364 , _scst_failed = Just 0
365 , _scst_remaining = Just 1
366 , _scst_events = Just []
369 fPath <- GargDB.writeFile nwf
370 printDebug "[addToCorpusWithFile] File saved as: " fPath
372 uId <- getUserId user
373 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
377 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
378 let hl = node ^. node_hyperdata
379 _ <- updateHyperdata nId $ hl { _hff_name = fName
380 , _hff_path = T.pack fPath }
382 printDebug "[addToCorpusWithFile] Created node with id: " nId
385 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
387 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
390 pure $ JobLog { _scst_succeeded = Just 1
391 , _scst_failed = Just 0
392 , _scst_remaining = Just 0
393 , _scst_events = Just []