2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
25 module Gargantext.API.Node.Corpus.New
28 import Control.Lens hiding (elements, Empty)
30 import Data.Aeson.TH (deriveJSON)
32 import Data.Maybe (fromMaybe)
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Admin.Orchestrator.Types (ScraperStatus(..))
37 import qualified Gargantext.API.Admin.Orchestrator.Types as T
38 import Gargantext.API.Node.Corpus.New.File
39 import Gargantext.Core (Lang(..){-, allLangs-})
40 import Gargantext.Core.Types.Individu (User(..))
41 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
42 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
43 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..), UserId)
44 import Gargantext.Prelude
46 import Servant.API.Flatten (Flat)
47 import Servant.Job.Core
48 import Servant.Job.Types
49 import Servant.Job.Utils (jsonOptions)
50 -- import Servant.Multipart
51 -- import Test.QuickCheck (elements)
52 import Test.QuickCheck.Arbitrary
53 import Web.FormUrlEncoded (FromForm)
54 import qualified Gargantext.Text.Corpus.API as API
55 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
57 ------------------------------------------------------------------------
59 data Query = Query { query_query :: Text
60 , query_node_id :: Int
62 , query_databases :: [DataOrigin]
64 deriving (Eq, Generic)
66 deriveJSON (unPrefix "query_") 'Query
68 instance Arbitrary Query where
69 arbitrary = elements [ Query q n la fs
70 | q <- ["honeybee* AND collapse"
75 , fs <- take 3 $ repeat allDataOrigins
78 instance ToSchema Query where
79 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
82 ------------------------------------------------------------------------
88 type PostApi = Summary "New Corpus endpoint"
89 :> ReqBody '[JSON] Query
90 :> Post '[JSON] CorpusId
91 type GetApi = Get '[JSON] ApiInfo
94 -- | TODO manage several apis
96 -- TODO this is only the POST
98 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
99 api uid (Query q _ as) = do
100 cId <- case head as of
101 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
102 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
104 docs <- liftBase $ API.get a q (Just 1000)
105 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
111 ------------------------------------------------
112 -- TODO use this route for Client implementation
113 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
115 instance Arbitrary ApiInfo where
116 arbitrary = ApiInfo <$> arbitrary
118 deriveJSON (unPrefix "") 'ApiInfo
120 instance ToSchema ApiInfo
122 info :: FlowCmdM env err m => UserId -> m ApiInfo
123 info _u = pure $ ApiInfo API.externalAPIs
125 ------------------------------------------------------------------------
127 data Database = Empty
132 deriving (Eq, Show, Generic)
134 deriveJSON (unPrefix "") ''Database
135 instance ToSchema Database
137 database2origin :: Database -> DataOrigin
138 database2origin Empty = InternalOrigin T.IsTex
139 database2origin PubMed = ExternalOrigin T.PubMed
140 database2origin HAL = ExternalOrigin T.HAL
141 database2origin IsTex = ExternalOrigin T.IsTex
142 database2origin Isidore = ExternalOrigin T.Isidore
144 ------------------------------------------------------------------------
145 data WithQuery = WithQuery
147 , _wq_databases :: !Database
149 , _wq_node_id :: !Int
153 makeLenses ''WithQuery
154 instance FromJSON WithQuery where
155 parseJSON = genericParseJSON $ jsonOptions "_wq_"
156 instance ToSchema WithQuery where
157 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
159 -------------------------------------------------------
160 data WithForm = WithForm
161 { _wf_filetype :: !FileType
163 , _wf_lang :: !(Maybe Lang)
165 } deriving (Eq, Show, Generic)
167 makeLenses ''WithForm
168 instance FromForm WithForm
169 instance FromJSON WithForm where
170 parseJSON = genericParseJSON $ jsonOptions "_wf_"
171 instance ToSchema WithForm where
172 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
174 ------------------------------------------------------------------------
175 type AsyncJobs event ctI input output =
176 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
177 ------------------------------------------------------------------------
179 type AddWithQuery = Summary "Add with Query to corpus endpoint"
181 :> Capture "corpus_id" CorpusId
183 :> AsyncJobs ScraperStatus '[JSON] WithQuery ScraperStatus
186 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
188 :> Capture "corpus_id" CorpusId
191 :> MultipartForm Mem (MultipartData Mem)
192 :> QueryParam "fileType" FileType
194 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
197 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
199 :> Capture "corpus_id" CorpusId
203 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
206 ------------------------------------------------------------------------
207 -- TODO WithQuery also has a corpus id
208 addToCorpusWithQuery :: FlowCmdM env err m
212 -> (ScraperStatus -> m ())
214 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
216 logStatus ScraperStatus { _scst_succeeded = Just 10
217 , _scst_failed = Just 2
218 , _scst_remaining = Just 138
219 , _scst_events = Just []
221 printDebug "addToCorpusWithQuery" cid
223 -- TODO if cid is folder -> create Corpus
224 -- if cid is corpus -> add to corpus
225 -- if cid is root -> create corpus in Private
226 txts <- mapM (\db -> getDataText db (Multi l) q (Just 10000)) [database2origin dbs]
227 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
228 printDebug "corpus id" cids
230 pure ScraperStatus { _scst_succeeded = Just 137
231 , _scst_failed = Just 13
232 , _scst_remaining = Just 0
233 , _scst_events = Just []
236 addToCorpusWithForm :: FlowCmdM env err m
240 -> (ScraperStatus -> m ())
242 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
246 CSV_HAL -> Parser.parseFormat Parser.CsvHal
247 CSV -> Parser.parseFormat Parser.CsvGargV3
248 WOS -> Parser.parseFormat Parser.WOS
249 PresseRIS -> Parser.parseFormat Parser.RisPresse
251 logStatus ScraperStatus { _scst_succeeded = Just 1
252 , _scst_failed = Just 0
253 , _scst_remaining = Just 1
254 , _scst_events = Just []
257 printDebug "Parsing corpus: " cid
259 -- TODO granularity of the logStatus
260 docs <- liftBase $ splitEvery 500
264 printDebug "Parsing corpus finished : " cid
265 printDebug "Starting extraction : " cid
267 -- TODO granularity of the logStatus
268 _cid' <- flowCorpus user
270 (Multi $ fromMaybe EN l)
271 (map (map toHyperdataDocument) docs)
273 printDebug "Extraction finished : " cid
275 pure ScraperStatus { _scst_succeeded = Just 2
276 , _scst_failed = Just 0
277 , _scst_remaining = Just 0
278 , _scst_events = Just []
282 addToCorpusWithFile :: FlowCmdM env err m
286 -> (ScraperStatus -> m ())
288 addToCorpusWithFile cid input filetype logStatus = do
289 logStatus ScraperStatus { _scst_succeeded = Just 10
290 , _scst_failed = Just 2
291 , _scst_remaining = Just 138
292 , _scst_events = Just []
294 printDebug "addToCorpusWithFile" cid
295 _h <- postUpload cid filetype input
297 pure ScraperStatus { _scst_succeeded = Just 137
298 , _scst_failed = Just 13
299 , _scst_remaining = Just 0
300 , _scst_events = Just []