2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
25 module Gargantext.API.Node.Corpus.New
28 import Control.Lens hiding (elements, Empty)
30 import Data.Aeson.TH (deriveJSON)
32 import Data.Maybe (fromMaybe)
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
37 import Servant.Job.Core
38 import Servant.Job.Types
39 import Servant.Job.Utils (jsonOptions)
40 -- import Servant.Multipart
41 -- import Test.QuickCheck (elements)
42 import Test.QuickCheck.Arbitrary
43 import Web.FormUrlEncoded (FromForm)
45 import Gargantext.API.Admin.Orchestrator.Types (ScraperStatus(..))
46 import qualified Gargantext.API.Admin.Orchestrator.Types as T
47 import Gargantext.API.Node.Corpus.New.File
48 import Gargantext.Core (Lang(..){-, allLangs-})
49 import Gargantext.Core.Types.Individu (User(..))
50 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
51 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
52 import Gargantext.Database.Admin.Types.Hyperdata
53 import Gargantext.Database.Admin.Types.Node (CorpusId, UserId)
54 import Gargantext.Prelude
55 import qualified Gargantext.Text.Corpus.API as API
56 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
58 ------------------------------------------------------------------------
60 data Query = Query { query_query :: Text
61 , query_node_id :: Int
63 , query_databases :: [DataOrigin]
65 deriving (Eq, Generic)
67 deriveJSON (unPrefix "query_") 'Query
69 instance Arbitrary Query where
70 arbitrary = elements [ Query q n la fs
71 | q <- ["honeybee* AND collapse"
76 , fs <- take 3 $ repeat allDataOrigins
79 instance ToSchema Query where
80 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
83 ------------------------------------------------------------------------
89 type PostApi = Summary "New Corpus endpoint"
90 :> ReqBody '[JSON] Query
91 :> Post '[JSON] CorpusId
92 type GetApi = Get '[JSON] ApiInfo
95 -- | TODO manage several apis
97 -- TODO this is only the POST
99 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
100 api uid (Query q _ as) = do
101 cId <- case head as of
102 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
103 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
105 docs <- liftBase $ API.get a q (Just 1000)
106 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
112 ------------------------------------------------
113 -- TODO use this route for Client implementation
114 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
116 instance Arbitrary ApiInfo where
117 arbitrary = ApiInfo <$> arbitrary
119 deriveJSON (unPrefix "") 'ApiInfo
121 instance ToSchema ApiInfo
123 info :: FlowCmdM env err m => UserId -> m ApiInfo
124 info _u = pure $ ApiInfo API.externalAPIs
126 ------------------------------------------------------------------------
128 data Database = Empty
133 deriving (Eq, Show, Generic)
135 deriveJSON (unPrefix "") ''Database
136 instance ToSchema Database
138 database2origin :: Database -> DataOrigin
139 database2origin Empty = InternalOrigin T.IsTex
140 database2origin PubMed = ExternalOrigin T.PubMed
141 database2origin HAL = ExternalOrigin T.HAL
142 database2origin IsTex = ExternalOrigin T.IsTex
143 database2origin Isidore = ExternalOrigin T.Isidore
145 ------------------------------------------------------------------------
146 data WithQuery = WithQuery
148 , _wq_databases :: !Database
150 , _wq_node_id :: !Int
154 makeLenses ''WithQuery
155 instance FromJSON WithQuery where
156 parseJSON = genericParseJSON $ jsonOptions "_wq_"
157 instance ToSchema WithQuery where
158 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
160 -------------------------------------------------------
161 data WithForm = WithForm
162 { _wf_filetype :: !FileType
164 , _wf_lang :: !(Maybe Lang)
166 } deriving (Eq, Show, Generic)
168 makeLenses ''WithForm
169 instance FromForm WithForm
170 instance FromJSON WithForm where
171 parseJSON = genericParseJSON $ jsonOptions "_wf_"
172 instance ToSchema WithForm where
173 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
175 ------------------------------------------------------------------------
176 type AsyncJobs event ctI input output =
177 AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
178 ------------------------------------------------------------------------
180 type AddWithQuery = Summary "Add with Query to corpus endpoint"
182 :> Capture "corpus_id" CorpusId
184 :> AsyncJobs ScraperStatus '[JSON] WithQuery ScraperStatus
187 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
189 :> Capture "corpus_id" CorpusId
192 :> MultipartForm Mem (MultipartData Mem)
193 :> QueryParam "fileType" FileType
195 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
198 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
200 :> Capture "corpus_id" CorpusId
204 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
207 ------------------------------------------------------------------------
208 -- TODO WithQuery also has a corpus id
209 addToCorpusWithQuery :: FlowCmdM env err m
213 -> (ScraperStatus -> m ())
215 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
217 logStatus ScraperStatus { _scst_succeeded = Just 10
218 , _scst_failed = Just 2
219 , _scst_remaining = Just 138
220 , _scst_events = Just []
222 printDebug "addToCorpusWithQuery" cid
224 -- TODO if cid is folder -> create Corpus
225 -- if cid is corpus -> add to corpus
226 -- if cid is root -> create corpus in Private
227 txts <- mapM (\db -> getDataText db (Multi l) q (Just 10000)) [database2origin dbs]
228 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
229 printDebug "corpus id" cids
231 pure ScraperStatus { _scst_succeeded = Just 137
232 , _scst_failed = Just 13
233 , _scst_remaining = Just 0
234 , _scst_events = Just []
237 addToCorpusWithForm :: FlowCmdM env err m
241 -> (ScraperStatus -> m ())
243 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
247 CSV_HAL -> Parser.parseFormat Parser.CsvHal
248 CSV -> Parser.parseFormat Parser.CsvGargV3
249 WOS -> Parser.parseFormat Parser.WOS
250 PresseRIS -> Parser.parseFormat Parser.RisPresse
252 logStatus ScraperStatus { _scst_succeeded = Just 1
253 , _scst_failed = Just 0
254 , _scst_remaining = Just 1
255 , _scst_events = Just []
258 printDebug "Parsing corpus: " cid
260 -- TODO granularity of the logStatus
261 docs <- liftBase $ splitEvery 500
265 printDebug "Parsing corpus finished : " cid
266 printDebug "Starting extraction : " cid
268 -- TODO granularity of the logStatus
269 _cid' <- flowCorpus user
271 (Multi $ fromMaybe EN l)
272 (map (map toHyperdataDocument) docs)
274 printDebug "Extraction finished : " cid
276 pure ScraperStatus { _scst_succeeded = Just 2
277 , _scst_failed = Just 0
278 , _scst_remaining = Just 0
279 , _scst_events = Just []
283 addToCorpusWithFile :: FlowCmdM env err m
287 -> (ScraperStatus -> m ())
289 addToCorpusWithFile cid input filetype logStatus = do
290 logStatus ScraperStatus { _scst_succeeded = Just 10
291 , _scst_failed = Just 2
292 , _scst_remaining = Just 138
293 , _scst_events = Just []
295 printDebug "addToCorpusWithFile" cid
296 _h <- postUpload cid filetype input
298 pure ScraperStatus { _scst_succeeded = Just 137
299 , _scst_failed = Just 13
300 , _scst_remaining = Just 0
301 , _scst_events = Just []