2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
21 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson.TH (deriveJSON)
25 import Data.Maybe (fromMaybe)
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
31 import Servant.Job.Utils (jsonOptions)
32 -- import Servant.Multipart
33 -- import Test.QuickCheck (elements)
34 import Test.QuickCheck.Arbitrary
36 import Gargantext.Prelude
38 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
39 import qualified Gargantext.API.Admin.Orchestrator.Types as T
40 import Gargantext.API.Admin.Types (HasSettings)
41 import Gargantext.API.Node.Corpus.New.File
42 import Gargantext.API.Node.Types
43 import Gargantext.Core (Lang(..){-, allLangs-})
44 import Gargantext.Core.Types.Individu (User(..))
45 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
46 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
47 import Gargantext.Database.Action.User (getUserId)
48 import Gargantext.Database.Action.Node (mkNodeWithParent)
49 import Gargantext.Database.Admin.Types.Hyperdata
50 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
51 import Gargantext.Database.Query.Table.Node (getNodeWith)
52 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
53 import Gargantext.Database.Schema.Node (node_hyperdata)
54 import qualified Gargantext.Prelude.Utils as GPU
55 import qualified Gargantext.Core.Text.Corpus.API as API
56 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
58 ------------------------------------------------------------------------
60 data Query = Query { query_query :: Text
61 , query_node_id :: Int
63 , query_databases :: [DataOrigin]
65 deriving (Eq, Generic)
67 deriveJSON (unPrefix "query_") 'Query
69 instance Arbitrary Query where
70 arbitrary = elements [ Query q n la fs
71 | q <- ["honeybee* AND collapse"
76 , fs <- take 3 $ repeat allDataOrigins
79 instance ToSchema Query where
80 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
83 ------------------------------------------------------------------------
89 type PostApi = Summary "New Corpus endpoint"
90 :> ReqBody '[JSON] Query
91 :> Post '[JSON] CorpusId
92 type GetApi = Get '[JSON] ApiInfo
95 -- | TODO manage several apis
97 -- TODO this is only the POST
99 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
100 api uid (Query q _ as) = do
101 cId <- case head as of
102 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
103 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
105 docs <- liftBase $ API.get a q (Just 1000)
106 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
112 ------------------------------------------------
113 -- TODO use this route for Client implementation
114 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
116 instance Arbitrary ApiInfo where
117 arbitrary = ApiInfo <$> arbitrary
119 deriveJSON (unPrefix "") 'ApiInfo
121 instance ToSchema ApiInfo
123 info :: FlowCmdM env err m => UserId -> m ApiInfo
124 info _u = pure $ ApiInfo API.externalAPIs
126 ------------------------------------------------------------------------
128 data Database = Empty
133 deriving (Eq, Show, Generic)
135 deriveJSON (unPrefix "") ''Database
136 instance ToSchema Database
138 database2origin :: Database -> DataOrigin
139 database2origin Empty = InternalOrigin T.IsTex
140 database2origin PubMed = ExternalOrigin T.PubMed
141 database2origin HAL = ExternalOrigin T.HAL
142 database2origin IsTex = ExternalOrigin T.IsTex
143 database2origin Isidore = ExternalOrigin T.Isidore
145 ------------------------------------------------------------------------
146 data WithQuery = WithQuery
148 , _wq_databases :: !Database
150 , _wq_node_id :: !Int
154 makeLenses ''WithQuery
155 instance FromJSON WithQuery where
156 parseJSON = genericParseJSON $ jsonOptions "_wq_"
157 instance ToSchema WithQuery where
158 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
160 ------------------------------------------------------------------------
162 type AddWithQuery = Summary "Add with Query to corpus endpoint"
164 :> Capture "corpus_id" CorpusId
166 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
169 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
171 :> Capture "corpus_id" CorpusId
174 :> MultipartForm Mem (MultipartData Mem)
175 :> QueryParam "fileType" FileType
177 :> AsyncJobs JobLog '[JSON] () JobLog
181 ------------------------------------------------------------------------
182 -- TODO WithQuery also has a corpus id
183 addToCorpusWithQuery :: FlowCmdM env err m
190 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) maybeLimit logStatus = do
192 logStatus JobLog { _scst_succeeded = Just 0
193 , _scst_failed = Just 0
194 , _scst_remaining = Just 5
195 , _scst_events = Just []
197 printDebug "addToCorpusWithQuery" (cid, dbs)
199 -- TODO if cid is folder -> create Corpus
200 -- if cid is corpus -> add to corpus
201 -- if cid is root -> create corpus in Private
202 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
204 logStatus JobLog { _scst_succeeded = Just 2
205 , _scst_failed = Just 0
206 , _scst_remaining = Just 1
207 , _scst_events = Just []
210 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
211 printDebug "corpus id" cids
213 pure JobLog { _scst_succeeded = Just 3
214 , _scst_failed = Just 0
215 , _scst_remaining = Just 0
216 , _scst_events = Just []
220 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
222 :> Capture "corpus_id" CorpusId
226 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
228 addToCorpusWithForm :: FlowCmdM env err m
234 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
236 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
237 printDebug "[addToCorpusWithForm] fileType" ft
238 logStatus JobLog { _scst_succeeded = Just 0
239 , _scst_failed = Just 0
240 , _scst_remaining = Just 2
241 , _scst_events = Just []
245 CSV_HAL -> Parser.parseFormat Parser.CsvHal
246 CSV -> Parser.parseFormat Parser.CsvGargV3
247 WOS -> Parser.parseFormat Parser.WOS
248 PresseRIS -> Parser.parseFormat Parser.RisPresse
250 -- TODO granularity of the logStatus
251 docs <- liftBase $ splitEvery 500
255 printDebug "Parsing corpus finished : " cid
256 logStatus JobLog { _scst_succeeded = Just 1
257 , _scst_failed = Just 0
258 , _scst_remaining = Just 1
259 , _scst_events = Just []
263 printDebug "Starting extraction : " cid
264 -- TODO granularity of the logStatus
265 _cid' <- flowCorpus user
267 (Multi $ fromMaybe EN l)
268 (map (map toHyperdataDocument) docs)
270 printDebug "Extraction finished : " cid
271 pure JobLog { _scst_succeeded = Just 2
272 , _scst_failed = Just 0
273 , _scst_remaining = Just 0
274 , _scst_events = Just []
278 addToCorpusWithFile :: FlowCmdM env err m
284 addToCorpusWithFile cid input filetype logStatus = do
285 logStatus JobLog { _scst_succeeded = Just 10
286 , _scst_failed = Just 2
287 , _scst_remaining = Just 138
288 , _scst_events = Just []
290 printDebug "addToCorpusWithFile" cid
291 _h <- postUpload cid filetype input
293 pure JobLog { _scst_succeeded = Just 137
294 , _scst_failed = Just 13
295 , _scst_remaining = Just 0
296 , _scst_events = Just []
302 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
304 :> Capture "corpus_id" CorpusId
308 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
310 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
316 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
318 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
319 logStatus JobLog { _scst_succeeded = Just 0
320 , _scst_failed = Just 0
321 , _scst_remaining = Just 1
322 , _scst_events = Just []
325 fPath <- GPU.writeFile nwf
326 printDebug "[addToCorpusWithFile] File saved as: " fPath
328 uId <- getUserId user
329 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
333 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
334 let hl = node ^. node_hyperdata
335 _ <- updateHyperdata nId $ hl { _hff_name = fName
336 , _hff_path = T.pack fPath }
338 printDebug "[addToCorpusWithFile] Created node with id: " nId
341 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
342 pure $ JobLog { _scst_succeeded = Just 1
343 , _scst_failed = Just 0
344 , _scst_remaining = Just 0
345 , _scst_events = Just []