2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
22 import Control.Lens hiding (elements, Empty)
24 import Data.Aeson.TH (deriveJSON)
25 import qualified Data.ByteString.Base64 as BSB64
26 import Data.Conduit.Internal (zipSources)
28 import Data.Maybe (fromMaybe)
30 import Data.Text (Text)
31 import qualified Data.Text as T
32 import GHC.Generics (Generic)
34 import Servant.Job.Utils (jsonOptions)
35 -- import Servant.Multipart
36 import qualified Data.Text.Encoding as TE
37 -- import Test.QuickCheck (elements)
38 import Test.QuickCheck.Arbitrary
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Node.Corpus.New.Types
43 import Gargantext.API.Node.Corpus.Searx
44 import Gargantext.API.Node.Corpus.Types
45 import Gargantext.API.Node.Types
46 import Gargantext.Core (Lang(..){-, allLangs-})
47 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
48 import Gargantext.Core.Types.Individu (User(..))
49 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
50 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
51 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
52 import Gargantext.Database.Action.Mail (sendMail)
53 import Gargantext.Database.Action.Node (mkNodeWithParent)
54 import Gargantext.Database.Action.User (getUserId)
55 import Gargantext.Database.Admin.Types.Hyperdata
56 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
57 import Gargantext.Database.Prelude (hasConfig)
58 import Gargantext.Database.Query.Table.Node (getNodeWith)
59 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
60 import Gargantext.Database.Schema.Node (node_hyperdata)
61 import Gargantext.Prelude
62 import Gargantext.Prelude.Config (gc_max_docs_parsers)
63 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
64 import qualified Gargantext.Core.Text.Corpus.API as API
65 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
66 import qualified Gargantext.Database.GargDB as GargDB
67 ------------------------------------------------------------------------
69 data Query = Query { query_query :: Text
70 , query_node_id :: Int
72 , query_databases :: [DataOrigin]
74 deriving (Eq, Generic)
76 deriveJSON (unPrefix "query_") 'Query
78 instance Arbitrary Query where
79 arbitrary = elements [ Query q n la fs
80 | q <- ["honeybee* AND collapse"
85 , fs <- take 3 $ repeat allDataOrigins
88 instance ToSchema Query where
89 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
92 ------------------------------------------------------------------------
98 type PostApi = Summary "New Corpus endpoint"
99 :> ReqBody '[JSON] Query
100 :> Post '[JSON] CorpusId
101 type GetApi = Get '[JSON] ApiInfo
104 -- | TODO manage several apis
106 -- TODO this is only the POST
108 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
109 api uid (Query q _ as) = do
110 cId <- case head as of
111 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
112 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
114 docs <- liftBase $ API.get a q (Just 1000)
115 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
121 ------------------------------------------------
122 -- TODO use this route for Client implementation
123 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
125 instance Arbitrary ApiInfo where
126 arbitrary = ApiInfo <$> arbitrary
128 deriveJSON (unPrefix "") 'ApiInfo
130 instance ToSchema ApiInfo
132 info :: FlowCmdM env err m => UserId -> m ApiInfo
134 ext <- API.externalAPIs
138 ------------------------------------------------------------------------
139 ------------------------------------------------------------------------
140 data WithQuery = WithQuery
142 , _wq_databases :: !Database
143 , _wq_datafield :: !(Maybe Datafield)
145 , _wq_node_id :: !Int
146 , _wq_flowListWith :: !FlowSocialListWith
150 makeLenses ''WithQuery
151 instance FromJSON WithQuery where
152 parseJSON = genericParseJSON $ jsonOptions "_wq_"
153 instance ToJSON WithQuery where
154 toJSON = genericToJSON $ jsonOptions "_wq_"
155 instance ToSchema WithQuery where
156 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
158 ------------------------------------------------------------------------
160 type AddWithQuery = Summary "Add with Query to corpus endpoint"
162 :> Capture "corpus_id" CorpusId
164 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
167 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
169 :> Capture "corpus_id" CorpusId
172 :> MultipartForm Mem (MultipartData Mem)
173 :> QueryParam "fileType" FileType
175 :> AsyncJobs JobLog '[JSON] () JobLog
179 ------------------------------------------------------------------------
180 -- TODO WithQuery also has a corpus id
183 addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
190 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
191 , _wq_databases = dbs
192 , _wq_datafield = datafield
194 , _wq_flowListWith = flw }) maybeLimit jobHandle = do
196 markStarted 3 jobHandle
197 -- printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
198 -- printDebug "[addToCorpusWithQuery] datafield" datafield
199 -- printDebug "[addToCorpusWithQuery] flowListWith" flw
203 -- printDebug "[addToCorpusWithQuery] processing web request" datafield
205 _ <- triggerSearxSearch user cid q l jobHandle
207 markComplete jobHandle
211 -- TODO if cid is folder -> create Corpus
212 -- if cid is corpus -> add to corpus
213 -- if cid is root -> create corpus in Private
214 -- printDebug "[G.A.N.C.New] getDataText with query" q
215 databaseOrigin <- database2origin dbs
216 eTxts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [databaseOrigin]
218 let lTxts = lefts eTxts
219 -- printDebug "[G.A.N.C.New] lTxts" lTxts
222 let txts = rights eTxts
223 -- TODO Sum lenghts of each txt elements
225 -- NOTE(adinapoli) Some other weird arithmetic to have the
226 -- following 'JobLog' as output:
228 -- { _scst_succeeded = Just 2
229 -- , _scst_failed = Just 0
230 -- , _scst_remaining = Just $ 1 + length txts
231 -- , _scst_events = Just []
234 markStarted (3 + length txts) jobHandle
235 markProgress 2 jobHandle
237 _cids <- mapM (\txt -> do
238 flowDataText user txt (Multi l) cid (Just flw) jobHandle) txts
239 -- printDebug "corpus id" cids
240 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
243 markComplete jobHandle
246 -- printDebug "Error: " err
247 markFailure 1 (Just $ T.pack (show err)) jobHandle
249 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
251 :> Capture "corpus_id" CorpusId
255 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
257 addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
263 addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
264 -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
265 -- printDebug "[addToCorpusWithForm] fileType" ft
266 -- printDebug "[addToCorpusWithForm] fileFormat" ff
267 limit' <- view $ hasConfig . gc_max_docs_parsers
268 let limit = fromIntegral limit' :: Integer
271 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
272 CSV -> Parser.parseFormatC Parser.CsvGargV3
273 WOS -> Parser.parseFormatC Parser.WOS
274 PresseRIS -> Parser.parseFormatC Parser.RisPresse
276 -- TODO granularity of the logStatus
277 let data' = case ff of
279 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
280 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
281 Right decoded -> decoded
282 eDocsC <- liftBase $ parseC ff data'
284 Right (mCount, docsC) -> do
285 -- TODO Add progress (jobStatus) update for docs - this is a
288 let docsC' = zipSources (yieldMany [1..]) docsC
289 .| mapMC (\(idx, doc) ->
290 if idx > limit then do
291 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
292 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
293 , "exceeds the MAX_DOCS_PARSERS limit ("
296 let panicMsg = T.concat $ T.pack <$> panicMsg'
297 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
301 .| mapC toHyperdataDocument
303 --printDebug "Parsing corpus finished : " cid
306 --printDebug "Starting extraction : " cid
307 -- TODO granularity of the logStatus
308 -- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
310 _cid' <- flowCorpus user
312 (Multi $ fromMaybe EN l)
314 --(Just $ fromIntegral $ length docs, docsC')
315 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
316 --(map (map toHyperdataDocument) docs)
319 -- printDebug "Extraction finished : " cid
320 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
321 -- TODO uncomment this
324 markComplete jobHandle
326 printDebug "[addToCorpusWithForm] parse error" e
327 markFailed (Just $ T.pack e) jobHandle
330 addToCorpusWithFile :: FlowCmdM env err m
336 addToCorpusWithFile cid input filetype logStatus = do
337 logStatus JobLog { _scst_succeeded = Just 10
338 , _scst_failed = Just 2
339 , _scst_remaining = Just 138
340 , _scst_events = Just []
342 printDebug "addToCorpusWithFile" cid
343 _h <- postUpload cid filetype input
345 pure JobLog { _scst_succeeded = Just 137
346 , _scst_failed = Just 13
347 , _scst_remaining = Just 0
348 , _scst_events = Just []
354 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
356 :> Capture "corpus_id" CorpusId
360 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
362 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
368 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do
370 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
371 markStarted 1 jobHandle
373 fPath <- GargDB.writeFile nwf
374 printDebug "[addToCorpusWithFile] File saved as: " fPath
376 uId <- getUserId user
377 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
381 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
382 let hl = node ^. node_hyperdata
383 _ <- updateHyperdata nId $ hl { _hff_name = fName
384 , _hff_path = T.pack fPath }
386 printDebug "[addToCorpusWithFile] Created node with id: " nId
389 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
391 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
394 markComplete jobHandle