2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
22 import Control.Lens hiding (elements, Empty)
25 import Data.Aeson.TH (deriveJSON)
26 import qualified Data.ByteString.Base64 as BSB64
27 import Data.Conduit.Internal (zipSources)
29 import Data.Maybe (fromMaybe)
31 import Data.Text (Text)
32 import qualified Data.Text as T
33 import GHC.Generics (Generic)
35 import Servant.Job.Utils (jsonOptions)
36 -- import Servant.Multipart
37 import qualified Data.Text.Encoding as TE
38 -- import Test.QuickCheck (elements)
39 import Test.QuickCheck.Arbitrary
41 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
42 import Gargantext.API.Admin.Types (HasSettings)
43 import Gargantext.API.Node.Corpus.New.Types
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import Gargantext.Core.Types.Individu (User(..))
50 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
51 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Mail (sendMail)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Action.User (getUserId)
56 import Gargantext.Database.Admin.Types.Hyperdata
57 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
58 import Gargantext.Database.Prelude (hasConfig)
59 import Gargantext.Database.Query.Table.Node (getNodeWith)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import Gargantext.Prelude
63 import Gargantext.Prelude.Config (gc_max_docs_parsers)
64 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
65 import qualified Gargantext.Core.Text.Corpus.API as API
66 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
67 import qualified Gargantext.Database.GargDB as GargDB
68 ------------------------------------------------------------------------
70 data Query = Query { query_query :: Text
71 , query_node_id :: Int
73 , query_databases :: [DataOrigin]
75 deriving (Eq, Generic)
77 deriveJSON (unPrefix "query_") 'Query
79 instance Arbitrary Query where
80 arbitrary = elements [ Query q n la fs
81 | q <- ["honeybee* AND collapse"
86 , fs <- take 3 $ repeat allDataOrigins
89 instance ToSchema Query where
90 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
93 ------------------------------------------------------------------------
99 type PostApi = Summary "New Corpus endpoint"
100 :> ReqBody '[JSON] Query
101 :> Post '[JSON] CorpusId
102 type GetApi = Get '[JSON] ApiInfo
105 -- | TODO manage several apis
107 -- TODO this is only the POST
109 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
110 api uid (Query q _ as) = do
111 cId <- case head as of
112 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
113 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
115 docs <- liftBase $ API.get a q (Just 1000)
116 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
122 ------------------------------------------------
123 -- TODO use this route for Client implementation
124 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
126 instance Arbitrary ApiInfo where
127 arbitrary = ApiInfo <$> arbitrary
129 deriveJSON (unPrefix "") 'ApiInfo
131 instance ToSchema ApiInfo
133 info :: FlowCmdM env err m => UserId -> m ApiInfo
135 ext <- API.externalAPIs
139 ------------------------------------------------------------------------
140 ------------------------------------------------------------------------
141 data WithQuery = WithQuery
143 , _wq_databases :: !Database
144 , _wq_datafield :: !(Maybe Datafield)
146 , _wq_node_id :: !Int
147 , _wq_flowListWith :: !FlowSocialListWith
151 makeLenses ''WithQuery
152 instance FromJSON WithQuery where
153 parseJSON = genericParseJSON $ jsonOptions "_wq_"
154 instance ToJSON WithQuery where
155 toJSON = genericToJSON $ jsonOptions "_wq_"
156 instance ToSchema WithQuery where
157 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
159 ------------------------------------------------------------------------
161 type AddWithQuery = Summary "Add with Query to corpus endpoint"
163 :> Capture "corpus_id" CorpusId
165 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
168 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
170 :> Capture "corpus_id" CorpusId
173 :> MultipartForm Mem (MultipartData Mem)
174 :> QueryParam "fileType" FileType
176 :> AsyncJobs JobLog '[JSON] () JobLog
180 ------------------------------------------------------------------------
181 -- TODO WithQuery also has a corpus id
184 addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
191 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
192 , _wq_databases = dbs
193 , _wq_datafield = datafield
195 , _wq_flowListWith = flw }) maybeLimit jobHandle = do
197 -- printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
198 -- printDebug "[addToCorpusWithQuery] datafield" datafield
199 -- printDebug "[addToCorpusWithQuery] flowListWith" flw
203 -- printDebug "[addToCorpusWithQuery] processing web request" datafield
205 markStarted 1 jobHandle
207 _ <- triggerSearxSearch user cid q l jobHandle
209 markComplete jobHandle
212 markStarted 3 jobHandle
215 -- TODO if cid is folder -> create Corpus
216 -- if cid is corpus -> add to corpus
217 -- if cid is root -> create corpus in Private
218 -- printDebug "[G.A.N.C.New] getDataText with query" q
219 db <- database2origin dbs
220 eTxt <- getDataText db (Multi l) q maybeLimit
222 -- printDebug "[G.A.N.C.New] lTxts" lTxts
225 -- TODO Sum lenghts of each txt elements
227 markProgress 1 jobHandle
229 void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle
230 -- printDebug "corpus id" cids
231 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
234 markComplete jobHandle
237 -- printDebug "Error: " err
238 markFailed (Just $ T.pack (show err)) jobHandle
240 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
242 :> Capture "corpus_id" CorpusId
246 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
248 addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
254 addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
255 -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
256 -- printDebug "[addToCorpusWithForm] fileType" ft
257 -- printDebug "[addToCorpusWithForm] fileFormat" ff
258 limit' <- view $ hasConfig . gc_max_docs_parsers
259 let limit = fromIntegral limit' :: Integer
262 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
263 CSV -> Parser.parseFormatC Parser.CsvGargV3
264 WOS -> Parser.parseFormatC Parser.WOS
265 PresseRIS -> Parser.parseFormatC Parser.RisPresse
267 -- TODO granularity of the logStatus
268 let data' = case ff of
270 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
271 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
272 Right decoded -> decoded
273 eDocsC <- liftBase $ parseC ff data'
275 Right (mCount, docsC) -> do
276 -- TODO Add progress (jobStatus) update for docs - this is a
279 let docsC' = zipSources (yieldMany [1..]) docsC
280 .| mapMC (\(idx, doc) ->
281 if idx > limit then do
282 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
283 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
284 , "exceeds the MAX_DOCS_PARSERS limit ("
287 let panicMsg = T.concat $ T.pack <$> panicMsg'
288 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
292 .| mapC toHyperdataDocument
294 --printDebug "Parsing corpus finished : " cid
297 --printDebug "Starting extraction : " cid
298 -- TODO granularity of the logStatus
299 -- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
301 _cid' <- flowCorpus user
303 (Multi $ fromMaybe EN l)
305 --(Just $ fromIntegral $ length docs, docsC')
306 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
307 --(map (map toHyperdataDocument) docs)
310 -- printDebug "Extraction finished : " cid
311 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
312 -- TODO uncomment this
315 markComplete jobHandle
317 printDebug "[addToCorpusWithForm] parse error" e
318 markFailed (Just $ T.pack e) jobHandle
321 addToCorpusWithFile :: FlowCmdM env err m
327 addToCorpusWithFile cid input filetype logStatus = do
328 logStatus JobLog { _scst_succeeded = Just 10
329 , _scst_failed = Just 2
330 , _scst_remaining = Just 138
331 , _scst_events = Just []
333 printDebug "addToCorpusWithFile" cid
334 _h <- postUpload cid filetype input
336 pure JobLog { _scst_succeeded = Just 137
337 , _scst_failed = Just 13
338 , _scst_remaining = Just 0
339 , _scst_events = Just []
345 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
347 :> Capture "corpus_id" CorpusId
351 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
353 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
359 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do
361 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
362 markStarted 1 jobHandle
364 fPath <- GargDB.writeFile nwf
365 printDebug "[addToCorpusWithFile] File saved as: " fPath
367 uId <- getUserId user
368 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
372 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
373 let hl = node ^. node_hyperdata
374 _ <- updateHyperdata nId $ hl { _hff_name = fName
375 , _hff_path = T.pack fPath }
377 printDebug "[addToCorpusWithFile] Created node with id: " nId
380 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
382 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
385 markComplete jobHandle