2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
25 module Gargantext.API.Corpus.New
28 import Control.Lens hiding (elements)
30 import Data.Aeson.TH (deriveJSON)
31 import Data.Maybe (fromMaybe)
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Corpus.New.File
37 import Gargantext.API.Orchestrator.Types
38 import Gargantext.Core (Lang(..))
39 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
40 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, flowCorpusSearchInDatabase)
41 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..))
42 import Gargantext.Core.Types.Individu (UserId, User(..))
43 import Gargantext.Prelude
44 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
45 import Gargantext.Text.Terms (TermType(..))
47 import Servant.API.Flatten (Flat)
48 import Servant.Job.Core
49 import Servant.Job.Types
50 import Servant.Job.Utils (jsonOptions)
51 import Servant.Multipart
52 import Test.QuickCheck (elements)
53 import Test.QuickCheck.Arbitrary
54 import Web.FormUrlEncoded (FromForm)
55 import qualified Gargantext.Text.Corpus.API as API
57 ------------------------------------------------------------------------
58 data Query = Query { query_query :: Text
59 , query_corpus_id :: Int
60 , query_databases :: [API.ExternalAPIs]
62 deriving (Eq, Show, Generic)
64 deriveJSON (unPrefix "query_") 'Query
66 instance Arbitrary Query where
67 arbitrary = elements [ Query q n fs
68 | q <- ["honeybee* AND collopase"
71 , fs <- take 3 $ repeat API.externalAPIs
74 instance ToSchema Query where
75 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
77 ------------------------------------------------------------------------
82 type PostApi = Summary "New Corpus endpoint"
83 :> ReqBody '[JSON] Query
84 :> Post '[JSON] CorpusId
85 type GetApi = Get '[JSON] ApiInfo
87 -- | TODO manage several apis
89 -- TODO this is only the POST
91 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
92 api uid (Query q _ as) = do
93 cId <- case head as of
94 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
95 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
97 docs <- liftBase $ API.get a q (Just 1000)
98 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
104 ------------------------------------------------
105 -- TODO use this route for Client implementation
106 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
108 instance Arbitrary ApiInfo where
109 arbitrary = ApiInfo <$> arbitrary
111 deriveJSON (unPrefix "") 'ApiInfo
113 instance ToSchema ApiInfo
115 info :: FlowCmdM env err m => UserId -> m ApiInfo
116 info _u = pure $ ApiInfo API.externalAPIs
118 ------------------------------------------------------------------------
119 ------------------------------------------------------------------------
120 data WithQuery = WithQuery
122 , _wq_databases :: ![ExternalAPIs]
123 , _wq_lang :: !(Maybe Lang)
127 makeLenses ''WithQuery
128 instance FromJSON WithQuery where
129 parseJSON = genericParseJSON $ jsonOptions "_wq_"
130 instance ToSchema WithQuery where
131 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
133 -------------------------------------------------------
134 data WithForm = WithForm
135 { _wf_filetype :: !FileType
137 , _wf_lang :: !(Maybe Lang)
139 } deriving (Eq, Show, Generic)
141 makeLenses ''WithForm
142 instance FromForm WithForm
143 instance FromJSON WithForm where
144 parseJSON = genericParseJSON $ jsonOptions "_wf_"
145 instance ToSchema WithForm where
146 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
148 ------------------------------------------------------------------------
149 type AsyncJobs event ctI input output =
150 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
151 ------------------------------------------------------------------------
153 type Upload = Summary "Corpus Upload endpoint"
155 :> Capture "corpus_id" CorpusId
156 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
157 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
159 type AddWithQuery = Summary "Add with Query to corpus endpoint"
161 :> Capture "corpus_id" CorpusId
165 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
167 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
169 :> Capture "corpus_id" CorpusId
172 :> MultipartForm Mem (MultipartData Mem)
173 :> QueryParam "fileType" FileType
175 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
177 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
179 :> Capture "corpus_id" CorpusId
183 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
185 ------------------------------------------------------------------------
186 -- TODO WithQuery also has a corpus id
187 addToCorpusWithQuery :: FlowCmdM env err m
191 -> (ScraperStatus -> m ())
193 addToCorpusWithQuery u cid (WithQuery q _dbs l) logStatus = do
195 logStatus ScraperStatus { _scst_succeeded = Just 10
196 , _scst_failed = Just 2
197 , _scst_remaining = Just 138
198 , _scst_events = Just []
200 printDebug "addToCorpusWithQuery" cid
202 -- TODO if cid is folder -> create Corpus
203 -- if cid is corpus -> add to corpus
204 -- if cid is root -> create corpus in Private
205 cids <- flowCorpusSearchInDatabase u (maybe EN identity l) q
206 printDebug "corpus id" cids
208 pure ScraperStatus { _scst_succeeded = Just 137
209 , _scst_failed = Just 13
210 , _scst_remaining = Just 0
211 , _scst_events = Just []
214 addToCorpusWithFile :: FlowCmdM env err m
218 -> (ScraperStatus -> m ())
220 addToCorpusWithFile cid input filetype logStatus = do
221 logStatus ScraperStatus { _scst_succeeded = Just 10
222 , _scst_failed = Just 2
223 , _scst_remaining = Just 138
224 , _scst_events = Just []
226 printDebug "addToCorpusWithFile" cid
227 _h <- postUpload cid filetype input
229 pure ScraperStatus { _scst_succeeded = Just 137
230 , _scst_failed = Just 13
231 , _scst_remaining = Just 0
232 , _scst_events = Just []
235 addToCorpusWithForm :: FlowCmdM env err m
239 -> (ScraperStatus -> m ())
241 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
245 CSV_HAL -> Parser.parseFormat Parser.CsvHal
246 CSV -> Parser.parseFormat Parser.CsvGargV3
247 WOS -> Parser.parseFormat Parser.WOS
248 PresseRIS -> Parser.parseFormat Parser.RisPresse
250 logStatus ScraperStatus { _scst_succeeded = Just 1
251 , _scst_failed = Just 0
252 , _scst_remaining = Just 1
253 , _scst_events = Just []
256 printDebug "Parsing corpus: " cid
258 -- TODO granularity of the logStatus
259 docs <- liftBase $ splitEvery 500
263 printDebug "Parsing corpus finished : " cid
264 printDebug "Starting extraction : " cid
266 -- TODO granularity of the logStatus
267 _cid' <- flowCorpus user
269 (Multi $ fromMaybe EN l)
270 (map (map toHyperdataDocument) docs)
272 printDebug "Extraction finished : " cid
274 pure ScraperStatus { _scst_succeeded = Just 2
275 , _scst_failed = Just 0
276 , _scst_remaining = Just 0
277 , _scst_events = Just []