2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
21 -- import Servant.Multipart
22 -- import Test.QuickCheck (elements)
23 import Control.Lens hiding (elements, Empty)
25 import Data.Aeson.TH (deriveJSON)
27 import Data.Maybe (fromMaybe)
29 import Data.Text (Text)
30 import GHC.Generics (Generic)
31 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
32 import Gargantext.API.Admin.Types (HasSettings)
33 import Gargantext.API.Node.Corpus.New.File
34 import Gargantext.API.Node.Types
35 import Gargantext.Core (Lang(..){-, allLangs-})
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
38 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
39 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
40 import Gargantext.Database.Action.Mail (sendMail)
41 import Gargantext.Database.Action.Node (mkNodeWithParent)
42 import Gargantext.Database.Action.User (getUserId)
43 import Gargantext.Database.Admin.Types.Hyperdata
44 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
45 import Gargantext.Database.Query.Table.Node (getNodeWith)
46 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
47 import Gargantext.Database.Schema.Node (node_hyperdata)
48 import Gargantext.Prelude
50 import Servant.Job.Utils (jsonOptions)
51 import Test.QuickCheck.Arbitrary
52 import qualified Data.Text as T
53 import qualified Gargantext.API.Admin.Orchestrator.Types as T
54 import qualified Gargantext.Core.Text.Corpus.API as API
55 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
56 import qualified Gargantext.Database.GargDB as GargDB
58 ------------------------------------------------------------------------
60 data Query = Query { query_query :: Text
61 , query_node_id :: Int
63 , query_databases :: [DataOrigin]
65 deriving (Eq, Generic)
67 deriveJSON (unPrefix "query_") 'Query
69 instance Arbitrary Query where
70 arbitrary = elements [ Query q n la fs
71 | q <- ["honeybee* AND collapse"
76 , fs <- take 3 $ repeat allDataOrigins
79 instance ToSchema Query where
80 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
83 ------------------------------------------------------------------------
89 type PostApi = Summary "New Corpus endpoint"
90 :> ReqBody '[JSON] Query
91 :> Post '[JSON] CorpusId
92 type GetApi = Get '[JSON] ApiInfo
95 -- | TODO manage several apis
97 -- TODO this is only the POST
99 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
100 api uid (Query q _ as) = do
101 cId <- case head as of
102 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
103 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
105 docs <- liftBase $ API.get a q (Just 1000)
106 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
112 ------------------------------------------------
113 -- TODO use this route for Client implementation
114 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
116 instance Arbitrary ApiInfo where
117 arbitrary = ApiInfo <$> arbitrary
119 deriveJSON (unPrefix "") 'ApiInfo
121 instance ToSchema ApiInfo
123 info :: FlowCmdM env err m => UserId -> m ApiInfo
124 info _u = pure $ ApiInfo API.externalAPIs
126 ------------------------------------------------------------------------
128 data Database = Empty
133 deriving (Eq, Show, Generic)
135 deriveJSON (unPrefix "") ''Database
136 instance ToSchema Database
138 database2origin :: Database -> DataOrigin
139 database2origin Empty = InternalOrigin T.IsTex
140 database2origin PubMed = ExternalOrigin T.PubMed
141 database2origin HAL = ExternalOrigin T.HAL
142 database2origin IsTex = ExternalOrigin T.IsTex
143 database2origin Isidore = ExternalOrigin T.Isidore
145 ------------------------------------------------------------------------
146 data WithQuery = WithQuery
148 , _wq_databases :: !Database
150 , _wq_node_id :: !Int
154 makeLenses ''WithQuery
155 instance FromJSON WithQuery where
156 parseJSON = genericParseJSON $ jsonOptions "_wq_"
157 instance ToJSON WithQuery where
158 toJSON = genericToJSON $ jsonOptions "_wq_"
159 instance ToSchema WithQuery where
160 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
162 ------------------------------------------------------------------------
164 type AddWithQuery = Summary "Add with Query to corpus endpoint"
166 :> Capture "corpus_id" CorpusId
168 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
171 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
173 :> Capture "corpus_id" CorpusId
176 :> MultipartForm Mem (MultipartData Mem)
177 :> QueryParam "fileType" FileType
179 :> AsyncJobs JobLog '[JSON] () JobLog
183 ------------------------------------------------------------------------
184 -- TODO WithQuery also has a corpus id
185 addToCorpusWithQuery :: FlowCmdM env err m
192 addToCorpusWithQuery user cid (WithQuery q dbs l _nid) maybeLimit logStatus = do
194 logStatus JobLog { _scst_succeeded = Just 0
195 , _scst_failed = Just 0
196 , _scst_remaining = Just 5
197 , _scst_events = Just []
199 printDebug "addToCorpusWithQuery" (cid, dbs)
201 -- TODO if cid is folder -> create Corpus
202 -- if cid is corpus -> add to corpus
203 -- if cid is root -> create corpus in Private
204 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
206 logStatus JobLog { _scst_succeeded = Just 2
207 , _scst_failed = Just 0
208 , _scst_remaining = Just 1
209 , _scst_events = Just []
212 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
213 printDebug "corpus id" cids
214 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
217 pure JobLog { _scst_succeeded = Just 3
218 , _scst_failed = Just 0
219 , _scst_remaining = Just 0
220 , _scst_events = Just []
224 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
226 :> Capture "corpus_id" CorpusId
230 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
232 addToCorpusWithForm :: FlowCmdM env err m
238 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
240 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
241 printDebug "[addToCorpusWithForm] fileType" ft
242 logStatus JobLog { _scst_succeeded = Just 0
243 , _scst_failed = Just 0
244 , _scst_remaining = Just 2
245 , _scst_events = Just []
249 CSV_HAL -> Parser.parseFormat Parser.CsvHal
250 CSV -> Parser.parseFormat Parser.CsvGargV3
251 WOS -> Parser.parseFormat Parser.WOS
252 PresseRIS -> Parser.parseFormat Parser.RisPresse
254 -- TODO granularity of the logStatus
255 docs <- liftBase $ splitEvery 500
259 printDebug "Parsing corpus finished : " cid
260 logStatus JobLog { _scst_succeeded = Just 1
261 , _scst_failed = Just 0
262 , _scst_remaining = Just 1
263 , _scst_events = Just []
267 printDebug "Starting extraction : " cid
268 -- TODO granularity of the logStatus
269 _cid' <- flowCorpus user
271 (Multi $ fromMaybe EN l)
272 (map (map toHyperdataDocument) docs)
274 printDebug "Extraction finished : " cid
275 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
278 pure JobLog { _scst_succeeded = Just 2
279 , _scst_failed = Just 0
280 , _scst_remaining = Just 0
281 , _scst_events = Just []
285 addToCorpusWithFile :: FlowCmdM env err m
291 addToCorpusWithFile cid input filetype logStatus = do
292 logStatus JobLog { _scst_succeeded = Just 10
293 , _scst_failed = Just 2
294 , _scst_remaining = Just 138
295 , _scst_events = Just []
297 printDebug "addToCorpusWithFile" cid
298 _h <- postUpload cid filetype input
300 pure JobLog { _scst_succeeded = Just 137
301 , _scst_failed = Just 13
302 , _scst_remaining = Just 0
303 , _scst_events = Just []
309 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
311 :> Capture "corpus_id" CorpusId
315 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
317 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
323 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
325 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
326 logStatus JobLog { _scst_succeeded = Just 0
327 , _scst_failed = Just 0
328 , _scst_remaining = Just 1
329 , _scst_events = Just []
332 fPath <- GargDB.writeFile nwf
333 printDebug "[addToCorpusWithFile] File saved as: " fPath
335 uId <- getUserId user
336 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
340 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
341 let hl = node ^. node_hyperdata
342 _ <- updateHyperdata nId $ hl { _hff_name = fName
343 , _hff_path = T.pack fPath }
345 printDebug "[addToCorpusWithFile] Created node with id: " nId
348 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
350 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
353 pure $ JobLog { _scst_succeeded = Just 1
354 , _scst_failed = Just 0
355 , _scst_remaining = Just 0
356 , _scst_events = Just []