2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
21 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson.TH (deriveJSON)
25 import Data.Maybe (fromMaybe)
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
31 import Servant.Job.Utils (jsonOptions)
32 -- import Servant.Multipart
33 -- import Test.QuickCheck (elements)
34 import Test.QuickCheck.Arbitrary
36 import Gargantext.Prelude
38 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
39 import qualified Gargantext.API.Admin.Orchestrator.Types as T
40 import Gargantext.API.Admin.Settings (HasSettings)
41 import Gargantext.API.Node.Corpus.New.File
42 import Gargantext.API.Node.Types
43 import Gargantext.Core (Lang(..){-, allLangs-})
44 import Gargantext.Core.Types.Individu (User(..))
45 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
46 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
47 import Gargantext.Database.Action.Flow.Utils (getUserId)
48 import Gargantext.Database.Action.Node (mkNodeWithParent)
49 import Gargantext.Database.Admin.Types.Hyperdata
50 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
51 import Gargantext.Database.Query.Table.Node (getNodeWith)
52 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
53 import Gargantext.Database.Schema.Node (node_hyperdata)
54 import qualified Gargantext.Prelude.Utils as GPU
55 import qualified Gargantext.Core.Text.Corpus.API as API
56 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
58 ------------------------------------------------------------------------
60 data Query = Query { query_query :: Text
61 , query_node_id :: Int
63 , query_databases :: [DataOrigin]
65 deriving (Eq, Generic)
67 deriveJSON (unPrefix "query_") 'Query
69 instance Arbitrary Query where
70 arbitrary = elements [ Query q n la fs
71 | q <- ["honeybee* AND collapse"
76 , fs <- take 3 $ repeat allDataOrigins
79 instance ToSchema Query where
80 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
83 ------------------------------------------------------------------------
89 type PostApi = Summary "New Corpus endpoint"
90 :> ReqBody '[JSON] Query
91 :> Post '[JSON] CorpusId
92 type GetApi = Get '[JSON] ApiInfo
95 -- | TODO manage several apis
97 -- TODO this is only the POST
99 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
100 api uid (Query q _ as) = do
101 cId <- case head as of
102 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
103 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
105 docs <- liftBase $ API.get a q (Just 1000)
106 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
112 ------------------------------------------------
113 -- TODO use this route for Client implementation
114 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
116 instance Arbitrary ApiInfo where
117 arbitrary = ApiInfo <$> arbitrary
119 deriveJSON (unPrefix "") 'ApiInfo
121 instance ToSchema ApiInfo
123 info :: FlowCmdM env err m => UserId -> m ApiInfo
124 info _u = pure $ ApiInfo API.externalAPIs
126 ------------------------------------------------------------------------
128 data Database = Empty
133 deriving (Eq, Show, Generic)
135 deriveJSON (unPrefix "") ''Database
136 instance ToSchema Database
138 database2origin :: Database -> DataOrigin
139 database2origin Empty = InternalOrigin T.IsTex
140 database2origin PubMed = ExternalOrigin T.PubMed
141 database2origin HAL = ExternalOrigin T.HAL
142 database2origin IsTex = ExternalOrigin T.IsTex
143 database2origin Isidore = ExternalOrigin T.Isidore
145 ------------------------------------------------------------------------
146 data WithQuery = WithQuery
148 , _wq_databases :: !Database
150 , _wq_node_id :: !Int
154 makeLenses ''WithQuery
155 instance FromJSON WithQuery where
156 parseJSON = genericParseJSON $ jsonOptions "_wq_"
157 instance ToSchema WithQuery where
158 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
160 ------------------------------------------------------------------------
162 type AddWithQuery = Summary "Add with Query to corpus endpoint"
164 :> Capture "corpus_id" CorpusId
166 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
169 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
171 :> Capture "corpus_id" CorpusId
174 :> MultipartForm Mem (MultipartData Mem)
175 :> QueryParam "fileType" FileType
177 :> AsyncJobs JobLog '[JSON] () JobLog
181 ------------------------------------------------------------------------
182 -- TODO WithQuery also has a corpus id
183 addToCorpusWithQuery :: FlowCmdM env err m
189 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
191 logStatus JobLog { _scst_succeeded = Just 0
192 , _scst_failed = Just 0
193 , _scst_remaining = Just 5
194 , _scst_events = Just []
196 printDebug "addToCorpusWithQuery" (cid, dbs)
198 -- TODO if cid is folder -> create Corpus
199 -- if cid is corpus -> add to corpus
200 -- if cid is root -> create corpus in Private
201 txts <- mapM (\db -> getDataText db (Multi l) q Nothing) [database2origin dbs]
203 logStatus JobLog { _scst_succeeded = Just 2
204 , _scst_failed = Just 0
205 , _scst_remaining = Just 1
206 , _scst_events = Just []
209 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
210 printDebug "corpus id" cids
212 pure JobLog { _scst_succeeded = Just 3
213 , _scst_failed = Just 0
214 , _scst_remaining = Just 0
215 , _scst_events = Just []
219 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
221 :> Capture "corpus_id" CorpusId
225 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
227 addToCorpusWithForm :: FlowCmdM env err m
233 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
235 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
236 printDebug "[addToCorpusWithForm] fileType" ft
237 logStatus JobLog { _scst_succeeded = Just 0
238 , _scst_failed = Just 0
239 , _scst_remaining = Just 2
240 , _scst_events = Just []
244 CSV_HAL -> Parser.parseFormat Parser.CsvHal
245 CSV -> Parser.parseFormat Parser.CsvGargV3
246 WOS -> Parser.parseFormat Parser.WOS
247 PresseRIS -> Parser.parseFormat Parser.RisPresse
249 -- TODO granularity of the logStatus
250 docs <- liftBase $ splitEvery 500
254 printDebug "Parsing corpus finished : " cid
255 logStatus JobLog { _scst_succeeded = Just 1
256 , _scst_failed = Just 0
257 , _scst_remaining = Just 1
258 , _scst_events = Just []
262 printDebug "Starting extraction : " cid
263 -- TODO granularity of the logStatus
264 _cid' <- flowCorpus user
266 (Multi $ fromMaybe EN l)
267 (map (map toHyperdataDocument) docs)
269 printDebug "Extraction finished : " cid
270 pure JobLog { _scst_succeeded = Just 2
271 , _scst_failed = Just 0
272 , _scst_remaining = Just 0
273 , _scst_events = Just []
277 addToCorpusWithFile :: FlowCmdM env err m
283 addToCorpusWithFile cid input filetype logStatus = do
284 logStatus JobLog { _scst_succeeded = Just 10
285 , _scst_failed = Just 2
286 , _scst_remaining = Just 138
287 , _scst_events = Just []
289 printDebug "addToCorpusWithFile" cid
290 _h <- postUpload cid filetype input
292 pure JobLog { _scst_succeeded = Just 137
293 , _scst_failed = Just 13
294 , _scst_remaining = Just 0
295 , _scst_events = Just []
301 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
303 :> Capture "corpus_id" CorpusId
307 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
309 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
315 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
317 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
318 logStatus JobLog { _scst_succeeded = Just 0
319 , _scst_failed = Just 0
320 , _scst_remaining = Just 1
321 , _scst_events = Just []
324 fPath <- GPU.writeFile nwf
325 printDebug "[addToCorpusWithFile] File saved as: " fPath
327 uId <- getUserId user
328 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
332 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
333 let hl = node ^. node_hyperdata
334 _ <- updateHyperdata nId $ hl { _hff_name = fName
335 , _hff_path = T.pack fPath }
337 printDebug "[addToCorpusWithFile] Created node with id: " nId
340 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
341 pure $ JobLog { _scst_succeeded = Just 1
342 , _scst_failed = Just 0
343 , _scst_remaining = Just 0
344 , _scst_events = Just []