2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
25 module Gargantext.API.Corpus.New
28 import Control.Lens hiding (elements)
30 import Data.Aeson.TH (deriveJSON)
32 import Data.Maybe (fromMaybe)
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Admin.Orchestrator.Types
37 import Gargantext.API.Corpus.New.File
38 import Gargantext.Core (Lang(..))
39 import Gargantext.Core.Types.Individu (UserId, User(..))
40 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
41 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, flowCorpusSearchInDatabase)
42 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..))
43 import Gargantext.Prelude
44 import Gargantext.Text.Terms (TermType(..))
46 import Servant.API.Flatten (Flat)
47 import Servant.Job.Core
48 import Servant.Job.Types
49 import Servant.Job.Utils (jsonOptions)
50 import Servant.Multipart
51 import Test.QuickCheck (elements)
52 import Test.QuickCheck.Arbitrary
53 import Web.FormUrlEncoded (FromForm)
54 import qualified Gargantext.Text.Corpus.API as API
55 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
57 ------------------------------------------------------------------------
58 data Query = Query { query_query :: Text
59 , query_corpus_id :: Int
60 , query_databases :: [API.ExternalAPIs]
62 deriving (Eq, Show, Generic)
64 deriveJSON (unPrefix "query_") 'Query
66 instance Arbitrary Query where
67 arbitrary = elements [ Query q n fs
68 | q <- ["honeybee* AND collopase"
71 , fs <- take 3 $ repeat API.externalAPIs
74 instance ToSchema Query where
75 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
77 ------------------------------------------------------------------------
82 type PostApi = Summary "New Corpus endpoint"
83 :> ReqBody '[JSON] Query
84 :> Post '[JSON] CorpusId
85 type GetApi = Get '[JSON] ApiInfo
87 -- | TODO manage several apis
89 -- TODO this is only the POST
91 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
92 api uid (Query q _ as) = do
93 cId <- case head as of
94 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
95 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
97 docs <- liftBase $ API.get a q (Just 1000)
98 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
104 ------------------------------------------------
105 -- TODO use this route for Client implementation
106 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
108 instance Arbitrary ApiInfo where
109 arbitrary = ApiInfo <$> arbitrary
111 deriveJSON (unPrefix "") 'ApiInfo
113 instance ToSchema ApiInfo
115 info :: FlowCmdM env err m => UserId -> m ApiInfo
116 info _u = pure $ ApiInfo API.externalAPIs
118 ------------------------------------------------------------------------
119 ------------------------------------------------------------------------
120 data WithQuery = WithQuery
122 , _wq_databases :: ![ExternalAPIs]
123 , _wq_lang :: !(Maybe Lang)
127 makeLenses ''WithQuery
128 instance FromJSON WithQuery where
129 parseJSON = genericParseJSON $ jsonOptions "_wq_"
130 instance ToSchema WithQuery where
131 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
133 -------------------------------------------------------
134 data WithForm = WithForm
135 { _wf_filetype :: !FileType
137 , _wf_lang :: !(Maybe Lang)
139 } deriving (Eq, Show, Generic)
141 makeLenses ''WithForm
142 instance FromForm WithForm
143 instance FromJSON WithForm where
144 parseJSON = genericParseJSON $ jsonOptions "_wf_"
145 instance ToSchema WithForm where
146 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
148 ------------------------------------------------------------------------
149 type AsyncJobs event ctI input output =
150 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
151 ------------------------------------------------------------------------
153 type Upload = Summary "Corpus Upload endpoint"
155 :> Capture "corpus_id" CorpusId
157 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
159 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
161 type AddWithQuery = Summary "Add with Query to corpus endpoint"
163 :> Capture "corpus_id" CorpusId
167 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
169 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
171 :> Capture "corpus_id" CorpusId
174 :> MultipartForm Mem (MultipartData Mem)
175 :> QueryParam "fileType" FileType
177 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
179 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
181 :> Capture "corpus_id" CorpusId
185 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
187 ------------------------------------------------------------------------
188 -- TODO WithQuery also has a corpus id
189 addToCorpusWithQuery :: FlowCmdM env err m
193 -> (ScraperStatus -> m ())
195 addToCorpusWithQuery u cid (WithQuery q _dbs l) logStatus = do
197 logStatus ScraperStatus { _scst_succeeded = Just 10
198 , _scst_failed = Just 2
199 , _scst_remaining = Just 138
200 , _scst_events = Just []
202 printDebug "addToCorpusWithQuery" cid
204 -- TODO if cid is folder -> create Corpus
205 -- if cid is corpus -> add to corpus
206 -- if cid is root -> create corpus in Private
207 cids <- flowCorpusSearchInDatabase u (maybe EN identity l) q
208 printDebug "corpus id" cids
210 pure ScraperStatus { _scst_succeeded = Just 137
211 , _scst_failed = Just 13
212 , _scst_remaining = Just 0
213 , _scst_events = Just []
216 addToCorpusWithFile :: FlowCmdM env err m
220 -> (ScraperStatus -> m ())
222 addToCorpusWithFile cid input filetype logStatus = do
223 logStatus ScraperStatus { _scst_succeeded = Just 10
224 , _scst_failed = Just 2
225 , _scst_remaining = Just 138
226 , _scst_events = Just []
228 printDebug "addToCorpusWithFile" cid
229 _h <- postUpload cid filetype input
231 pure ScraperStatus { _scst_succeeded = Just 137
232 , _scst_failed = Just 13
233 , _scst_remaining = Just 0
234 , _scst_events = Just []
237 addToCorpusWithForm :: FlowCmdM env err m
241 -> (ScraperStatus -> m ())
243 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
247 CSV_HAL -> Parser.parseFormat Parser.CsvHal
248 CSV -> Parser.parseFormat Parser.CsvGargV3
249 WOS -> Parser.parseFormat Parser.WOS
250 PresseRIS -> Parser.parseFormat Parser.RisPresse
252 logStatus ScraperStatus { _scst_succeeded = Just 1
253 , _scst_failed = Just 0
254 , _scst_remaining = Just 1
255 , _scst_events = Just []
258 printDebug "Parsing corpus: " cid
260 -- TODO granularity of the logStatus
261 docs <- liftBase $ splitEvery 500
265 printDebug "Parsing corpus finished : " cid
266 printDebug "Starting extraction : " cid
268 -- TODO granularity of the logStatus
269 _cid' <- flowCorpus user
271 (Multi $ fromMaybe EN l)
272 (map (map toHyperdataDocument) docs)
274 printDebug "Extraction finished : " cid
276 pure ScraperStatus { _scst_succeeded = Just 2
277 , _scst_failed = Just 0
278 , _scst_remaining = Just 0
279 , _scst_events = Just []