2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17 {-# OPTIONS_GHC -fno-warn-orphans #-}
19 module Gargantext.API.Node.Corpus.New
22 import Control.Lens hiding (elements, Empty)
24 import Data.Aeson.TH (deriveJSON)
26 import Data.Maybe (fromMaybe)
28 import Data.Text (Text)
29 import GHC.Generics (Generic)
31 import Servant.Job.Core
32 import Servant.Job.Types
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
37 import Web.FormUrlEncoded (FromForm)
39 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
40 import qualified Gargantext.API.Admin.Orchestrator.Types as T
41 import Gargantext.API.Node.Corpus.New.File
42 import Gargantext.Core (Lang(..){-, allLangs-})
43 import Gargantext.Core.Types.Individu (User(..))
44 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
45 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
46 import Gargantext.Database.Admin.Types.Hyperdata
47 import Gargantext.Database.Admin.Types.Node (CorpusId, UserId)
48 import Gargantext.Prelude
49 import qualified Gargantext.Text.Corpus.API as API
50 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
52 ------------------------------------------------------------------------
54 data Query = Query { query_query :: Text
55 , query_node_id :: Int
57 , query_databases :: [DataOrigin]
59 deriving (Eq, Generic)
61 deriveJSON (unPrefix "query_") 'Query
63 instance Arbitrary Query where
64 arbitrary = elements [ Query q n la fs
65 | q <- ["honeybee* AND collapse"
70 , fs <- take 3 $ repeat allDataOrigins
73 instance ToSchema Query where
74 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
77 ------------------------------------------------------------------------
83 type PostApi = Summary "New Corpus endpoint"
84 :> ReqBody '[JSON] Query
85 :> Post '[JSON] CorpusId
86 type GetApi = Get '[JSON] ApiInfo
89 -- | TODO manage several apis
91 -- TODO this is only the POST
93 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
94 api uid (Query q _ as) = do
95 cId <- case head as of
96 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
97 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
99 docs <- liftBase $ API.get a q (Just 1000)
100 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
106 ------------------------------------------------
107 -- TODO use this route for Client implementation
108 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
110 instance Arbitrary ApiInfo where
111 arbitrary = ApiInfo <$> arbitrary
113 deriveJSON (unPrefix "") 'ApiInfo
115 instance ToSchema ApiInfo
117 info :: FlowCmdM env err m => UserId -> m ApiInfo
118 info _u = pure $ ApiInfo API.externalAPIs
120 ------------------------------------------------------------------------
122 data Database = Empty
127 deriving (Eq, Show, Generic)
129 deriveJSON (unPrefix "") ''Database
130 instance ToSchema Database
132 database2origin :: Database -> DataOrigin
133 database2origin Empty = InternalOrigin T.IsTex
134 database2origin PubMed = ExternalOrigin T.PubMed
135 database2origin HAL = ExternalOrigin T.HAL
136 database2origin IsTex = ExternalOrigin T.IsTex
137 database2origin Isidore = ExternalOrigin T.Isidore
139 ------------------------------------------------------------------------
140 data WithQuery = WithQuery
142 , _wq_databases :: !Database
144 , _wq_node_id :: !Int
148 makeLenses ''WithQuery
149 instance FromJSON WithQuery where
150 parseJSON = genericParseJSON $ jsonOptions "_wq_"
151 instance ToSchema WithQuery where
152 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
154 -------------------------------------------------------
155 data NewWithForm = NewWithForm
156 { _wf_filetype :: !FileType
158 , _wf_lang :: !(Maybe Lang)
160 } deriving (Eq, Show, Generic)
162 makeLenses ''NewWithForm
163 instance FromForm NewWithForm
164 instance FromJSON NewWithForm where
165 parseJSON = genericParseJSON $ jsonOptions "_wf_"
166 instance ToSchema NewWithForm where
167 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
169 ------------------------------------------------------------------------
170 type AsyncJobs event ctI input output =
171 AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
172 ------------------------------------------------------------------------
174 type AddWithQuery = Summary "Add with Query to corpus endpoint"
176 :> Capture "corpus_id" CorpusId
178 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
181 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
183 :> Capture "corpus_id" CorpusId
186 :> MultipartForm Mem (MultipartData Mem)
187 :> QueryParam "fileType" FileType
189 :> AsyncJobs JobLog '[JSON] () JobLog
192 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
194 :> Capture "corpus_id" CorpusId
198 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
201 ------------------------------------------------------------------------
202 -- TODO WithQuery also has a corpus id
203 addToCorpusWithQuery :: FlowCmdM env err m
209 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
211 logStatus JobLog { _scst_succeeded = Just 0
212 , _scst_failed = Just 0
213 , _scst_remaining = Just 5
214 , _scst_events = Just []
216 printDebug "addToCorpusWithQuery" cid
218 -- TODO if cid is folder -> create Corpus
219 -- if cid is corpus -> add to corpus
220 -- if cid is root -> create corpus in Private
221 txts <- mapM (\db -> getDataText db (Multi l) q (Just 10000)) [database2origin dbs]
223 logStatus JobLog { _scst_succeeded = Just 2
224 , _scst_failed = Just 0
225 , _scst_remaining = Just 1
226 , _scst_events = Just []
229 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
230 printDebug "corpus id" cids
232 pure JobLog { _scst_succeeded = Just 3
233 , _scst_failed = Just 0
234 , _scst_remaining = Just 0
235 , _scst_events = Just []
238 addToCorpusWithForm :: FlowCmdM env err m
244 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
246 printDebug "Parsing corpus: " cid
247 logStatus JobLog { _scst_succeeded = Just 0
248 , _scst_failed = Just 0
249 , _scst_remaining = Just 2
250 , _scst_events = Just []
254 CSV_HAL -> Parser.parseFormat Parser.CsvHal
255 CSV -> Parser.parseFormat Parser.CsvGargV3
256 WOS -> Parser.parseFormat Parser.WOS
257 PresseRIS -> Parser.parseFormat Parser.RisPresse
259 -- TODO granularity of the logStatus
260 docs <- liftBase $ splitEvery 500
264 printDebug "Parsing corpus finished : " cid
265 logStatus JobLog { _scst_succeeded = Just 1
266 , _scst_failed = Just 0
267 , _scst_remaining = Just 1
268 , _scst_events = Just []
272 printDebug "Starting extraction : " cid
273 -- TODO granularity of the logStatus
274 _cid' <- flowCorpus user
276 (Multi $ fromMaybe EN l)
277 (map (map toHyperdataDocument) docs)
279 printDebug "Extraction finished : " cid
280 pure JobLog { _scst_succeeded = Just 2
281 , _scst_failed = Just 0
282 , _scst_remaining = Just 0
283 , _scst_events = Just []
287 addToCorpusWithFile :: FlowCmdM env err m
293 addToCorpusWithFile cid input filetype logStatus = do
294 logStatus JobLog { _scst_succeeded = Just 10
295 , _scst_failed = Just 2
296 , _scst_remaining = Just 138
297 , _scst_events = Just []
299 printDebug "addToCorpusWithFile" cid
300 _h <- postUpload cid filetype input
302 pure JobLog { _scst_succeeded = Just 137
303 , _scst_failed = Just 13
304 , _scst_remaining = Just 0
305 , _scst_events = Just []