2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
22 import Control.Lens hiding (elements, Empty)
25 import Data.Aeson.TH (deriveJSON)
26 import qualified Data.ByteString.Base64 as BSB64
27 import Data.Conduit.Internal (zipSources)
29 import Data.Maybe (fromMaybe)
31 import Data.Text (Text)
32 import qualified Data.Text as T
33 import GHC.Generics (Generic)
35 import Servant.Job.Utils (jsonOptions)
36 -- import Servant.Multipart
37 import qualified Data.Text.Encoding as TE
38 -- import Test.QuickCheck (elements)
39 import Test.QuickCheck.Arbitrary
41 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
42 import Gargantext.API.Admin.Types (HasSettings)
43 import Gargantext.API.Node.Corpus.New.Types
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import Gargantext.Core.Types.Individu (User(..))
50 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
51 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Mail (sendMail)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Action.User (getUserId)
56 import Gargantext.Database.Admin.Types.Hyperdata
57 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..))
58 import Gargantext.Database.Prelude (hasConfig)
59 import Gargantext.Database.Query.Table.Node (getNodeWith, updateCorpusPubmedAPIKey)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import Gargantext.Prelude
63 import Gargantext.Prelude.Config (gc_max_docs_parsers, gc_pubmed_api_key)
64 import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
65 import qualified Gargantext.Core.Text.Corpus.API as API
66 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
67 import qualified Gargantext.Database.GargDB as GargDB
68 ------------------------------------------------------------------------
70 data Query = Query { query_query :: Text
71 , query_node_id :: Int
73 , query_databases :: [DataOrigin]
75 deriving (Eq, Generic)
77 deriveJSON (unPrefix "query_") 'Query
79 instance Arbitrary Query where
80 arbitrary = elements [ Query q n la fs
81 | q <- ["honeybee* AND collapse"
86 , fs <- take 3 $ repeat allDataOrigins
89 instance ToSchema Query where
90 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
93 ------------------------------------------------------------------------
99 type PostApi = Summary "New Corpus endpoint"
100 :> ReqBody '[JSON] Query
101 :> Post '[JSON] CorpusId
102 type GetApi = Get '[JSON] ApiInfo
105 -- | TODO manage several apis
107 -- TODO this is only the POST
109 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
110 api uid (Query q _ as) = do
111 cId <- case head as of
112 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
113 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
115 docs <- liftBase $ API.get a q (Just 1000)
116 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
122 ------------------------------------------------
123 -- TODO use this route for Client implementation
124 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
126 instance Arbitrary ApiInfo where
127 arbitrary = ApiInfo <$> arbitrary
129 deriveJSON (unPrefix "") 'ApiInfo
131 instance ToSchema ApiInfo
134 info = ApiInfo API.externalAPIs
136 ------------------------------------------------------------------------
137 ------------------------------------------------------------------------
138 data WithQuery = WithQuery
140 , _wq_databases :: !Database
141 , _wq_datafield :: !(Maybe Datafield)
143 , _wq_node_id :: !Int
144 , _wq_flowListWith :: !FlowSocialListWith
148 makeLenses ''WithQuery
149 instance FromJSON WithQuery where
150 parseJSON = genericParseJSON $ jsonOptions "_wq_"
151 instance ToJSON WithQuery where
152 toJSON = genericToJSON $ jsonOptions "_wq_"
153 instance ToSchema WithQuery where
154 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
156 ------------------------------------------------------------------------
158 type AddWithQuery = Summary "Add with Query to corpus endpoint"
160 :> Capture "corpus_id" CorpusId
162 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
165 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
167 :> Capture "corpus_id" CorpusId
170 :> MultipartForm Mem (MultipartData Mem)
171 :> QueryParam "fileType" FileType
173 :> AsyncJobs JobLog '[JSON] () JobLog
177 ------------------------------------------------------------------------
178 -- TODO WithQuery also has a corpus id
181 addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
188 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
189 , _wq_databases = dbs
190 , _wq_datafield = datafield
192 , _wq_flowListWith = flw }) maybeLimit jobHandle = do
194 -- printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
195 -- printDebug "[addToCorpusWithQuery] datafield" datafield
196 -- printDebug "[addToCorpusWithQuery] flowListWith" flw
200 -- printDebug "[addToCorpusWithQuery] processing web request" datafield
202 markStarted 1 jobHandle
204 _ <- triggerSearxSearch user cid q l jobHandle
206 markComplete jobHandle
210 Just (External PubMed) -> do
211 _api_key <- view $ hasConfig . gc_pubmed_api_key
212 printDebug "[addToCorpusWithQuery] pubmed api key" _api_key
213 _ <- updateCorpusPubmedAPIKey cid _api_key
216 markStarted 3 jobHandle
219 -- TODO if cid is folder -> create Corpus
220 -- if cid is corpus -> add to corpus
221 -- if cid is root -> create corpus in Private
222 -- printDebug "[G.A.N.C.New] getDataText with query" q
223 let db = database2origin dbs
224 eTxt <- getDataText db (Multi l) q maybeLimit
226 -- printDebug "[G.A.N.C.New] lTxts" lTxts
229 -- TODO Sum lenghts of each txt elements
231 markProgress 1 jobHandle
233 void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle
234 -- printDebug "corpus id" cids
235 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
238 markComplete jobHandle
241 -- printDebug "Error: " err
242 markFailed (Just $ T.pack (show err)) jobHandle
244 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
246 :> Capture "corpus_id" CorpusId
250 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
252 addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
258 addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
259 -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
260 -- printDebug "[addToCorpusWithForm] fileType" ft
261 -- printDebug "[addToCorpusWithForm] fileFormat" ff
262 limit' <- view $ hasConfig . gc_max_docs_parsers
263 let limit = fromIntegral limit' :: Integer
266 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
267 CSV -> Parser.parseFormatC Parser.CsvGargV3
268 WOS -> Parser.parseFormatC Parser.WOS
269 PresseRIS -> Parser.parseFormatC Parser.RisPresse
270 Iramuteq -> Parser.parseFormatC Parser.Iramuteq
271 JSON -> Parser.parseFormatC Parser.JSON
273 -- TODO granularity of the logStatus
274 let data' = case ff of
276 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
277 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
278 Right decoded -> decoded
279 eDocsC <- liftBase $ parseC ff data'
281 Right (mCount, docsC) -> do
282 -- TODO Add progress (jobStatus) update for docs - this is a
285 let docsC' = zipSources (yieldMany [1..]) docsC
286 .| mapMC (\(idx, doc) ->
287 if idx > limit then do
288 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
289 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
290 , "exceeds the MAX_DOCS_PARSERS limit ("
293 let panicMsg = T.concat $ T.pack <$> panicMsg'
294 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
298 .| mapC toHyperdataDocument
300 --printDebug "Parsing corpus finished : " cid
303 --printDebug "Starting extraction : " cid
304 -- TODO granularity of the logStatus
305 -- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
307 _cid' <- flowCorpus user
309 (Multi $ fromMaybe EN l)
311 --(Just $ fromIntegral $ length docs, docsC')
312 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
313 --(map (map toHyperdataDocument) docs)
316 -- printDebug "Extraction finished : " cid
317 -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
318 -- TODO uncomment this
321 markComplete jobHandle
323 printDebug "[addToCorpusWithForm] parse error" e
324 markFailed (Just $ T.pack e) jobHandle
327 addToCorpusWithFile :: FlowCmdM env err m
333 addToCorpusWithFile cid input filetype logStatus = do
334 logStatus JobLog { _scst_succeeded = Just 10
335 , _scst_failed = Just 2
336 , _scst_remaining = Just 138
337 , _scst_events = Just []
339 printDebug "addToCorpusWithFile" cid
340 _h <- postUpload cid filetype input
342 pure JobLog { _scst_succeeded = Just 137
343 , _scst_failed = Just 13
344 , _scst_remaining = Just 0
345 , _scst_events = Just []
351 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
353 :> Capture "corpus_id" CorpusId
357 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
359 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
365 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do
367 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
368 markStarted 1 jobHandle
370 fPath <- GargDB.writeFile nwf
371 printDebug "[addToCorpusWithFile] File saved as: " fPath
373 uId <- getUserId user
374 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
378 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
379 let hl = node ^. node_hyperdata
380 _ <- updateHyperdata nId $ hl { _hff_name = fName
381 , _hff_path = T.pack fPath }
383 printDebug "[addToCorpusWithFile] Created node with id: " nId
386 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
388 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
391 markComplete jobHandle