2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17 {-# OPTIONS_GHC -fno-warn-orphans #-}
19 module Gargantext.API.Node.Corpus.New
22 import Control.Lens hiding (elements, Empty)
24 import Data.Aeson.TH (deriveJSON)
26 import Data.Maybe (fromMaybe)
28 import Data.Text (Text)
29 import qualified Data.Text as T
30 import qualified Data.Text.IO as TIO
31 import GHC.Generics (Generic)
33 import Servant.Job.Core
34 import Servant.Job.Types
35 import Servant.Job.Utils (jsonOptions)
36 -- import Servant.Multipart
37 -- import Test.QuickCheck (elements)
38 import Test.QuickCheck.Arbitrary
39 import Web.FormUrlEncoded (FromForm)
41 import Gargantext.Prelude
43 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
44 import qualified Gargantext.API.Admin.Orchestrator.Types as T
45 import Gargantext.API.Admin.Settings (HasSettings)
46 import Gargantext.API.Node.Corpus.New.File
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import Gargantext.Core.Types.Individu (User(..))
49 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
50 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
51 import Gargantext.Database.Action.Flow.Utils (getUserId)
52 import Gargantext.Database.Action.Node (mkNodeWithParent)
53 import Gargantext.Database.Admin.Types.Hyperdata
54 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
55 import Gargantext.Database.Query.Table.Node (getNodeWith)
56 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
57 import Gargantext.Database.Schema.Node (node_hyperdata)
58 import qualified Gargantext.Prelude.Utils as GPU
59 import qualified Gargantext.Text.Corpus.API as API
60 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
62 ------------------------------------------------------------------------
64 data Query = Query { query_query :: Text
65 , query_node_id :: Int
67 , query_databases :: [DataOrigin]
69 deriving (Eq, Generic)
71 deriveJSON (unPrefix "query_") 'Query
73 instance Arbitrary Query where
74 arbitrary = elements [ Query q n la fs
75 | q <- ["honeybee* AND collapse"
80 , fs <- take 3 $ repeat allDataOrigins
83 instance ToSchema Query where
84 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
87 ------------------------------------------------------------------------
93 type PostApi = Summary "New Corpus endpoint"
94 :> ReqBody '[JSON] Query
95 :> Post '[JSON] CorpusId
96 type GetApi = Get '[JSON] ApiInfo
99 -- | TODO manage several apis
101 -- TODO this is only the POST
103 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
104 api uid (Query q _ as) = do
105 cId <- case head as of
106 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
107 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
109 docs <- liftBase $ API.get a q (Just 1000)
110 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
116 ------------------------------------------------
117 -- TODO use this route for Client implementation
118 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
120 instance Arbitrary ApiInfo where
121 arbitrary = ApiInfo <$> arbitrary
123 deriveJSON (unPrefix "") 'ApiInfo
125 instance ToSchema ApiInfo
127 info :: FlowCmdM env err m => UserId -> m ApiInfo
128 info _u = pure $ ApiInfo API.externalAPIs
130 ------------------------------------------------------------------------
132 data Database = Empty
137 deriving (Eq, Show, Generic)
139 deriveJSON (unPrefix "") ''Database
140 instance ToSchema Database
142 database2origin :: Database -> DataOrigin
143 database2origin Empty = InternalOrigin T.IsTex
144 database2origin PubMed = ExternalOrigin T.PubMed
145 database2origin HAL = ExternalOrigin T.HAL
146 database2origin IsTex = ExternalOrigin T.IsTex
147 database2origin Isidore = ExternalOrigin T.Isidore
149 ------------------------------------------------------------------------
150 data WithQuery = WithQuery
152 , _wq_databases :: !Database
154 , _wq_node_id :: !Int
158 makeLenses ''WithQuery
159 instance FromJSON WithQuery where
160 parseJSON = genericParseJSON $ jsonOptions "_wq_"
161 instance ToSchema WithQuery where
162 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
164 -------------------------------------------------------
165 data NewWithForm = NewWithForm
166 { _wf_filetype :: !FileType
168 , _wf_lang :: !(Maybe Lang)
170 } deriving (Eq, Show, Generic)
172 makeLenses ''NewWithForm
173 instance FromForm NewWithForm
174 instance FromJSON NewWithForm where
175 parseJSON = genericParseJSON $ jsonOptions "_wf_"
176 instance ToSchema NewWithForm where
177 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
179 -------------------------------------------------------
180 data NewWithFile = NewWithFile
182 , _wfi_lang :: !(Maybe Lang)
184 } deriving (Eq, Show, Generic)
186 makeLenses ''NewWithFile
187 instance FromForm NewWithFile
188 instance FromJSON NewWithFile where
189 parseJSON = genericParseJSON $ jsonOptions "_wfi_"
190 instance ToSchema NewWithFile where
191 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wfi_")
193 instance GPU.SaveFile NewWithFile where
194 saveFile' fp (NewWithFile d _ _) = TIO.writeFile fp d
196 --instance GPU.ReadFile NewWithFile where
197 -- readFile' = TIO.readFile
199 ------------------------------------------------------------------------
200 type AsyncJobs event ctI input output =
201 AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
202 ------------------------------------------------------------------------
204 type AddWithQuery = Summary "Add with Query to corpus endpoint"
206 :> Capture "corpus_id" CorpusId
208 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
211 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
213 :> Capture "corpus_id" CorpusId
216 :> MultipartForm Mem (MultipartData Mem)
217 :> QueryParam "fileType" FileType
219 :> AsyncJobs JobLog '[JSON] () JobLog
223 ------------------------------------------------------------------------
224 -- TODO WithQuery also has a corpus id
225 addToCorpusWithQuery :: FlowCmdM env err m
231 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
233 logStatus JobLog { _scst_succeeded = Just 0
234 , _scst_failed = Just 0
235 , _scst_remaining = Just 5
236 , _scst_events = Just []
238 printDebug "addToCorpusWithQuery" (cid, dbs)
240 -- TODO if cid is folder -> create Corpus
241 -- if cid is corpus -> add to corpus
242 -- if cid is root -> create corpus in Private
243 txts <- mapM (\db -> getDataText db (Multi l) q Nothing) [database2origin dbs]
245 logStatus JobLog { _scst_succeeded = Just 2
246 , _scst_failed = Just 0
247 , _scst_remaining = Just 1
248 , _scst_events = Just []
251 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
252 printDebug "corpus id" cids
254 pure JobLog { _scst_succeeded = Just 3
255 , _scst_failed = Just 0
256 , _scst_remaining = Just 0
257 , _scst_events = Just []
261 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
263 :> Capture "corpus_id" CorpusId
267 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
269 addToCorpusWithForm :: FlowCmdM env err m
275 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
277 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
278 printDebug "[addToCorpusWithForm] fileType" ft
279 logStatus JobLog { _scst_succeeded = Just 0
280 , _scst_failed = Just 0
281 , _scst_remaining = Just 2
282 , _scst_events = Just []
286 CSV_HAL -> Parser.parseFormat Parser.CsvHal
287 CSV -> Parser.parseFormat Parser.CsvGargV3
288 WOS -> Parser.parseFormat Parser.WOS
289 PresseRIS -> Parser.parseFormat Parser.RisPresse
291 -- TODO granularity of the logStatus
292 docs <- liftBase $ splitEvery 500
296 printDebug "Parsing corpus finished : " cid
297 logStatus JobLog { _scst_succeeded = Just 1
298 , _scst_failed = Just 0
299 , _scst_remaining = Just 1
300 , _scst_events = Just []
304 printDebug "Starting extraction : " cid
305 -- TODO granularity of the logStatus
306 _cid' <- flowCorpus user
308 (Multi $ fromMaybe EN l)
309 (map (map toHyperdataDocument) docs)
311 printDebug "Extraction finished : " cid
312 pure JobLog { _scst_succeeded = Just 2
313 , _scst_failed = Just 0
314 , _scst_remaining = Just 0
315 , _scst_events = Just []
319 addToCorpusWithFile :: FlowCmdM env err m
325 addToCorpusWithFile cid input filetype logStatus = do
326 logStatus JobLog { _scst_succeeded = Just 10
327 , _scst_failed = Just 2
328 , _scst_remaining = Just 138
329 , _scst_events = Just []
331 printDebug "addToCorpusWithFile" cid
332 _h <- postUpload cid filetype input
334 pure JobLog { _scst_succeeded = Just 137
335 , _scst_failed = Just 13
336 , _scst_remaining = Just 0
337 , _scst_events = Just []
343 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
345 :> Capture "corpus_id" CorpusId
349 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
351 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
357 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
359 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
360 logStatus JobLog { _scst_succeeded = Just 0
361 , _scst_failed = Just 0
362 , _scst_remaining = Just 1
363 , _scst_events = Just []
366 fPath <- GPU.writeFile nwf
367 printDebug "[addToCorpusWithFile] File saved as: " fPath
369 uId <- getUserId user
370 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
374 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
375 let hl = node ^. node_hyperdata
376 _ <- updateHyperdata nId $ hl { _hff_name = fName
377 , _hff_path = T.pack fPath }
379 printDebug "[addToCorpusWithFile] Created node with id: " nId
382 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
383 pure $ JobLog { _scst_succeeded = Just 1
384 , _scst_failed = Just 0
385 , _scst_remaining = Just 0
386 , _scst_events = Just []