{-|
Module      : Gargantext.API.Node.Corpus.New
Description : New corpus API
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

New corpus means either:
- new data in existing corpus
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators   #-}

module Gargantext.API.Node.Corpus.New
import Control.Lens hiding (elements, Empty)
import Data.Aeson.TH (deriveJSON)
import qualified Data.ByteString.Base64 as BSB64
import Data.Conduit.Internal (zipSources)
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics (Generic)
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
import qualified Data.Text.Encoding as TE
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary

import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Node.Corpus.New.Types
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Mail (sendMail)
import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..))
import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Query.Table.User (getUserPubmedAPIKey)
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude
import Gargantext.Prelude.Config (gc_max_docs_parsers)
import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
import qualified Gargantext.Database.GargDB as GargDB
------------------------------------------------------------------------
data Query = Query { query_query     :: Text
                   , query_node_id   :: Int
                   , query_databases :: [DataOrigin]
                   }
  deriving (Eq, Generic)

deriveJSON (unPrefix "query_") 'Query
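
-- Illustrative sketch only (not taken from the code above): assuming
-- 'unPrefix "query_"' drops the field prefix in the derived JSON, the same
-- way 'unPrefixSwagger "query_"' does for the Swagger schema below, a 'Query'
-- payload would look roughly like this (the node id is a made-up value):
--
-- > { "query"     : "honeybee* AND collapse"
-- > , "node_id"   : 42
-- > , "databases" : [ ... ]
-- > }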
instance Arbitrary Query where
  arbitrary = elements [ Query q n la fs
                       | q  <- ["honeybee* AND collapse"
                       , fs <- take 3 $ repeat allDataOrigins
instance ToSchema Query where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")

------------------------------------------------------------------------
type PostApi = Summary "New Corpus endpoint"
             :> ReqBody '[JSON] Query
             :> Post '[JSON] CorpusId
type GetApi = Get '[JSON] ApiInfo

-- | TODO manage several apis
-- TODO this is only the POST
api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
api uid (Query q _ as) = do
  cId <- case head as of
    Nothing      -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
      docs <- liftBase $ API.get a q (Just 1000)
      cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
------------------------------------------------
-- TODO use this route for Client implementation
data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs] }

instance Arbitrary ApiInfo where
  arbitrary = ApiInfo <$> arbitrary

deriveJSON (unPrefix "") 'ApiInfo

instance ToSchema ApiInfo

info = ApiInfo API.externalAPIs
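
-- Illustrative sketch only: since 'deriveJSON (unPrefix "")' leaves the field
-- name untouched, the GetApi route presumably answers with something like
--
-- > { "api_info": [ ...the external APIs listed by 'API.externalAPIs'... ] }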
------------------------------------------------------------------------
------------------------------------------------------------------------
data WithQuery = WithQuery
  { _wq_query        :: !API.RawQuery
  , _wq_databases    :: !Database
  , _wq_datafield    :: !(Maybe Datafield)
  , _wq_node_id      :: !Int
  , _wq_flowListWith :: !FlowSocialListWith
  }
  deriving (Show, Eq, Generic)
makeLenses ''WithQuery
instance FromJSON WithQuery where
  parseJSON = genericParseJSON $ jsonOptions "_wq_"
instance ToJSON WithQuery where
  toJSON = genericToJSON $ jsonOptions "_wq_"
instance ToSchema WithQuery where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")

instance Arbitrary WithQuery where
  arbitrary = WithQuery <$> arbitrary
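
-- Illustrative sketch only: assuming 'jsonOptions "_wq_"' strips the "_wq_"
-- prefix from field names, a 'WithQuery' payload is keyed by "query",
-- "databases", "datafield", "node_id" and "flowListWith" (plus any fields not
-- shown above).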
------------------------------------------------------------------------
type AddWithQuery = Summary "Add with Query to corpus endpoint"
   :> Capture "corpus_id" CorpusId
   :> AsyncJobs JobLog '[JSON] WithQuery JobLog
type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
   :> Capture "corpus_id" CorpusId
   :> MultipartForm Mem (MultipartData Mem)
   :> QueryParam "fileType" FileType
   :> AsyncJobs JobLog '[JSON] () JobLog
------------------------------------------------------------------------
-- TODO WithQuery also has a corpus id

addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
addToCorpusWithQuery user cid (WithQuery { _wq_query = q
                                         , _wq_databases = dbs
                                         , _wq_datafield = datafield
                                         , _wq_flowListWith = flw }) maybeLimit jobHandle = do
  -- printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
  -- printDebug "[addToCorpusWithQuery] datafield" datafield
  -- printDebug "[addToCorpusWithQuery] flowListWith" flw

  -- printDebug "[addToCorpusWithQuery] processing web request" datafield

  markStarted 1 jobHandle
  _ <- triggerSearxSearch user cid q l jobHandle
  markComplete jobHandle

  markStarted 3 jobHandle
  -- TODO if cid is folder -> create Corpus
  --      if cid is corpus -> add to corpus
  --      if cid is root   -> create corpus in Private
  -- printDebug "[G.A.N.C.New] getDataText with query" q
  let db = database2origin dbs
  mPubmedAPIKey <- getUserPubmedAPIKey user
  -- printDebug "[addToCorpusWithQuery] mPubmedAPIKey" mPubmedAPIKey
  eTxt <- getDataText db (Multi l) q mPubmedAPIKey maybeLimit

  -- printDebug "[G.A.N.C.New] lTxts" lTxts
  -- TODO Sum lengths of each txt element
  markProgress 1 jobHandle

  void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle
  -- printDebug "corpus id" cids
  -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)

  markComplete jobHandle

  -- printDebug "Error: " err
  markFailed (Just $ T.pack (show err)) jobHandle
type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
   :> Capture "corpus_id" CorpusId
   :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
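
-- Assumption, read off the pattern match below: 'NewWithForm' bundles the
-- declared file type ('ft'), file format ('ff'), the raw payload ('d'), an
-- optional language ('l'), a name ('_n', unused here) and a list selection
-- ('sel').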
addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
  -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
  -- printDebug "[addToCorpusWithForm] fileType" ft
  -- printDebug "[addToCorpusWithForm] fileFormat" ff
  limit' <- view $ hasConfig . gc_max_docs_parsers
  let limit = fromIntegral limit' :: Integer
        CSV_HAL   -> Parser.parseFormatC Parser.CsvHal
        CSV       -> Parser.parseFormatC Parser.CsvGargV3
        WOS       -> Parser.parseFormatC Parser.WOS
        PresseRIS -> Parser.parseFormatC Parser.RisPresse
        Iramuteq  -> Parser.parseFormatC Parser.Iramuteq
        JSON      -> Parser.parseFormatC Parser.JSON
  -- TODO granularity of the logStatus
  let data' = case ff of
        ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
          Left err      -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
          Right decoded -> decoded
  eDocsC <- liftBase $ parseC ff data'
    Right (mCount, docsC) -> do
      -- TODO Add progress (jobStatus) update for docs - this is a
      let docsC' = zipSources (yieldMany [1..]) docsC
                   .| mapMC (\(idx, doc) ->
                        if idx > limit then do
                          --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
                          let panicMsg' = [ "[addToCorpusWithForm] number of docs "
                                          , "exceeds the MAX_DOCS_PARSERS limit ("
                          let panicMsg = T.concat $ T.pack <$> panicMsg'
                          --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
                   .| mapC toHyperdataDocument
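      -- Note on the pipeline above: 'zipSources (yieldMany [1..])' pairs every
      -- parsed document with its 1-based index, so parsing can be failed with
      -- a clear message as soon as that index exceeds the configured
      -- gc_max_docs_parsers limit; documents within the limit are then turned
      -- into hyperdata via 'mapC toHyperdataDocument'.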
      --printDebug "Parsing corpus finished : " cid

      --printDebug "Starting extraction     : " cid
      -- TODO granularity of the logStatus
      -- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
      _cid' <- flowCorpus user
                          (Multi $ fromMaybe EN l)
                          --(Just $ fromIntegral $ length docs, docsC')
                          (mCount, transPipe liftBase docsC') -- TODO fix number of docs
                          --(map (map toHyperdataDocument) docs)

      -- printDebug "Extraction finished   : " cid
      -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
      -- TODO uncomment this

      markComplete jobHandle

      printDebug "[addToCorpusWithForm] parse error" e
      markFailed (Just $ T.pack e) jobHandle
addToCorpusWithFile :: FlowCmdM env err m
addToCorpusWithFile cid input filetype logStatus = do
  logStatus JobLog { _scst_succeeded = Just 10
                   , _scst_failed    = Just 2
                   , _scst_remaining = Just 138
                   , _scst_events    = Just []
                   }
  printDebug "addToCorpusWithFile" cid
  _h <- postUpload cid filetype input

  pure JobLog { _scst_succeeded = Just 137
              , _scst_failed    = Just 13
              , _scst_remaining = Just 0
              , _scst_events    = Just []
              }
type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
   :> Capture "corpus_id" CorpusId
   :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do

  printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
  markStarted 1 jobHandle

  fPath <- GargDB.writeFile nwf
  printDebug "[addToCorpusWithFile] File saved as: " fPath

  uId  <- getUserId user
  nIds <- mkNodeWithParent NodeFile (Just cid) uId fName

      node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
      let hl = node ^. node_hyperdata
      _ <- updateHyperdata nId $ hl { _hff_name = fName
                                    , _hff_path = T.pack fPath }

      printDebug "[addToCorpusWithFile] Created node with id: " nId

  printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid

  printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)

  markComplete jobHandle