2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
25 module Gargantext.API.Corpus.New
28 import Control.Lens hiding (elements, Empty)
30 import Data.Aeson.TH (deriveJSON)
32 import Data.Maybe (fromMaybe)
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Admin.Orchestrator.Types (ScraperStatus(..))
37 import qualified Gargantext.API.Admin.Orchestrator.Types as T
38 import Gargantext.API.Corpus.New.File
39 import Gargantext.Core (Lang(..){-, allLangs-})
40 import Gargantext.Core.Types.Individu (UserId, User(..))
41 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
42 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
43 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..))
44 import Gargantext.Prelude
46 import Servant.API.Flatten (Flat)
47 import Servant.Job.Core
48 import Servant.Job.Types
49 import Servant.Job.Utils (jsonOptions)
50 -- import Servant.Multipart
51 -- import Test.QuickCheck (elements)
52 import Test.QuickCheck.Arbitrary
53 import Web.FormUrlEncoded (FromForm)
54 import qualified Gargantext.Text.Corpus.API as API
55 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
57 ------------------------------------------------------------------------
59 data Query = Query { query_query :: Text
60 , query_node_id :: Int
62 , query_databases :: [DataOrigin]
64 deriving (Eq, Generic)
66 deriveJSON (unPrefix "query_") 'Query
68 instance Arbitrary Query where
69 arbitrary = elements [ Query q n la fs
70 | q <- ["honeybee* AND collapse"
75 , fs <- take 3 $ repeat allDataOrigins
78 instance ToSchema Query where
79 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
82 ------------------------------------------------------------------------
88 type PostApi = Summary "New Corpus endpoint"
89 :> ReqBody '[JSON] Query
90 :> Post '[JSON] CorpusId
91 type GetApi = Get '[JSON] ApiInfo
94 -- | TODO manage several apis
96 -- TODO this is only the POST
98 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
99 api uid (Query q _ as) = do
100 cId <- case head as of
101 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
102 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
104 docs <- liftBase $ API.get a q (Just 1000)
105 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
111 ------------------------------------------------
112 -- TODO use this route for Client implementation
113 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
115 instance Arbitrary ApiInfo where
116 arbitrary = ApiInfo <$> arbitrary
118 deriveJSON (unPrefix "") 'ApiInfo
120 instance ToSchema ApiInfo
122 info :: FlowCmdM env err m => UserId -> m ApiInfo
123 info _u = pure $ ApiInfo API.externalAPIs
125 ------------------------------------------------------------------------
127 data Database = Empty
132 deriving (Eq, Show, Generic)
134 deriveJSON (unPrefix "") ''Database
135 instance ToSchema Database
137 database2origin :: Database -> DataOrigin
138 database2origin Empty = InternalOrigin T.IsTex
139 database2origin PubMed = ExternalOrigin T.PubMed
140 database2origin HAL = ExternalOrigin T.HAL
141 database2origin IsTex = ExternalOrigin T.IsTex
142 database2origin Isidore = ExternalOrigin T.Isidore
144 ------------------------------------------------------------------------
145 data WithQuery = WithQuery
147 , _wq_databases :: !Database
149 , _wq_node_id :: !Int
153 makeLenses ''WithQuery
154 instance FromJSON WithQuery where
155 parseJSON = genericParseJSON $ jsonOptions "_wq_"
156 instance ToSchema WithQuery where
157 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
159 -------------------------------------------------------
160 data WithForm = WithForm
161 { _wf_filetype :: !FileType
163 , _wf_lang :: !(Maybe Lang)
165 } deriving (Eq, Show, Generic)
167 makeLenses ''WithForm
168 instance FromForm WithForm
169 instance FromJSON WithForm where
170 parseJSON = genericParseJSON $ jsonOptions "_wf_"
171 instance ToSchema WithForm where
172 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
174 ------------------------------------------------------------------------
175 type AsyncJobs event ctI input output =
176 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
177 ------------------------------------------------------------------------
179 type AddWithQuery = Summary "Add with Query to corpus endpoint"
181 :> Capture "corpus_id" CorpusId
183 :> AsyncJobs ScraperStatus '[JSON] WithQuery ScraperStatus
186 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
188 :> Capture "corpus_id" CorpusId
191 :> MultipartForm Mem (MultipartData Mem)
192 :> QueryParam "fileType" FileType
194 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
197 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
199 :> Capture "corpus_id" CorpusId
203 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
205 ------------------------------------------------------------------------
206 -- TODO WithQuery also has a corpus id
207 addToCorpusWithQuery :: FlowCmdM env err m
211 -> (ScraperStatus -> m ())
213 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
215 logStatus ScraperStatus { _scst_succeeded = Just 10
216 , _scst_failed = Just 2
217 , _scst_remaining = Just 138
218 , _scst_events = Just []
220 printDebug "addToCorpusWithQuery" cid
222 -- TODO if cid is folder -> create Corpus
223 -- if cid is corpus -> add to corpus
224 -- if cid is root -> create corpus in Private
225 txts <- mapM (\db -> getDataText db (Multi l) q (Just 10000)) [database2origin dbs]
226 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
227 printDebug "corpus id" cids
229 pure ScraperStatus { _scst_succeeded = Just 137
230 , _scst_failed = Just 13
231 , _scst_remaining = Just 0
232 , _scst_events = Just []
235 addToCorpusWithForm :: FlowCmdM env err m
239 -> (ScraperStatus -> m ())
241 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
245 CSV_HAL -> Parser.parseFormat Parser.CsvHal
246 CSV -> Parser.parseFormat Parser.CsvGargV3
247 WOS -> Parser.parseFormat Parser.WOS
248 PresseRIS -> Parser.parseFormat Parser.RisPresse
250 logStatus ScraperStatus { _scst_succeeded = Just 1
251 , _scst_failed = Just 0
252 , _scst_remaining = Just 1
253 , _scst_events = Just []
256 printDebug "Parsing corpus: " cid
258 -- TODO granularity of the logStatus
259 docs <- liftBase $ splitEvery 500
263 printDebug "Parsing corpus finished : " cid
264 printDebug "Starting extraction : " cid
266 -- TODO granularity of the logStatus
267 _cid' <- flowCorpus user
269 (Multi $ fromMaybe EN l)
270 (map (map toHyperdataDocument) docs)
272 printDebug "Extraction finished : " cid
274 pure ScraperStatus { _scst_succeeded = Just 2
275 , _scst_failed = Just 0
276 , _scst_remaining = Just 0
277 , _scst_events = Just []
281 addToCorpusWithFile :: FlowCmdM env err m
285 -> (ScraperStatus -> m ())
287 addToCorpusWithFile cid input filetype logStatus = do
288 logStatus ScraperStatus { _scst_succeeded = Just 10
289 , _scst_failed = Just 2
290 , _scst_remaining = Just 138
291 , _scst_events = Just []
293 printDebug "addToCorpusWithFile" cid
294 _h <- postUpload cid filetype input
296 pure ScraperStatus { _scst_succeeded = Just 137
297 , _scst_failed = Just 13
298 , _scst_remaining = Just 0
299 , _scst_events = Just []