2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
25 module Gargantext.API.Node.Corpus.New
28 import Control.Lens hiding (elements, Empty)
30 import Data.Aeson.TH (deriveJSON)
32 import Data.Maybe (fromMaybe)
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Admin.Orchestrator.Types (ScraperStatus(..))
37 import qualified Gargantext.API.Admin.Orchestrator.Types as T
38 import Gargantext.API.Node.Corpus.New.File
39 import Gargantext.Core (Lang(..){-, allLangs-})
40 import Gargantext.Core.Types.Individu (User(..))
41 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
42 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
43 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..), UserId)
44 import Gargantext.Prelude
46 import Servant.Job.Core
47 import Servant.Job.Types
48 import Servant.Job.Utils (jsonOptions)
49 -- import Servant.Multipart
50 -- import Test.QuickCheck (elements)
51 import Test.QuickCheck.Arbitrary
52 import Web.FormUrlEncoded (FromForm)
53 import qualified Gargantext.Text.Corpus.API as API
54 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
56 ------------------------------------------------------------------------
58 data Query = Query { query_query :: Text
59 , query_node_id :: Int
61 , query_databases :: [DataOrigin]
63 deriving (Eq, Generic)
65 deriveJSON (unPrefix "query_") 'Query
67 instance Arbitrary Query where
68 arbitrary = elements [ Query q n la fs
69 | q <- ["honeybee* AND collapse"
74 , fs <- take 3 $ repeat allDataOrigins
77 instance ToSchema Query where
78 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
81 ------------------------------------------------------------------------
87 type PostApi = Summary "New Corpus endpoint"
88 :> ReqBody '[JSON] Query
89 :> Post '[JSON] CorpusId
90 type GetApi = Get '[JSON] ApiInfo
93 -- | TODO manage several apis
95 -- TODO this is only the POST
97 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
98 api uid (Query q _ as) = do
99 cId <- case head as of
100 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
101 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
103 docs <- liftBase $ API.get a q (Just 1000)
104 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
110 ------------------------------------------------
111 -- TODO use this route for Client implementation
112 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
114 instance Arbitrary ApiInfo where
115 arbitrary = ApiInfo <$> arbitrary
117 deriveJSON (unPrefix "") 'ApiInfo
119 instance ToSchema ApiInfo
121 info :: FlowCmdM env err m => UserId -> m ApiInfo
122 info _u = pure $ ApiInfo API.externalAPIs
124 ------------------------------------------------------------------------
126 data Database = Empty
131 deriving (Eq, Show, Generic)
133 deriveJSON (unPrefix "") ''Database
134 instance ToSchema Database
136 database2origin :: Database -> DataOrigin
137 database2origin Empty = InternalOrigin T.IsTex
138 database2origin PubMed = ExternalOrigin T.PubMed
139 database2origin HAL = ExternalOrigin T.HAL
140 database2origin IsTex = ExternalOrigin T.IsTex
141 database2origin Isidore = ExternalOrigin T.Isidore
143 ------------------------------------------------------------------------
144 data WithQuery = WithQuery
146 , _wq_databases :: !Database
148 , _wq_node_id :: !Int
152 makeLenses ''WithQuery
153 instance FromJSON WithQuery where
154 parseJSON = genericParseJSON $ jsonOptions "_wq_"
155 instance ToSchema WithQuery where
156 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
158 -------------------------------------------------------
159 data WithForm = WithForm
160 { _wf_filetype :: !FileType
162 , _wf_lang :: !(Maybe Lang)
164 } deriving (Eq, Show, Generic)
166 makeLenses ''WithForm
167 instance FromForm WithForm
168 instance FromJSON WithForm where
169 parseJSON = genericParseJSON $ jsonOptions "_wf_"
170 instance ToSchema WithForm where
171 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
173 ------------------------------------------------------------------------
174 type AsyncJobs event ctI input output =
175 AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
176 ------------------------------------------------------------------------
178 type AddWithQuery = Summary "Add with Query to corpus endpoint"
180 :> Capture "corpus_id" CorpusId
182 :> AsyncJobs ScraperStatus '[JSON] WithQuery ScraperStatus
185 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
187 :> Capture "corpus_id" CorpusId
190 :> MultipartForm Mem (MultipartData Mem)
191 :> QueryParam "fileType" FileType
193 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
196 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
198 :> Capture "corpus_id" CorpusId
202 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
205 ------------------------------------------------------------------------
206 -- TODO WithQuery also has a corpus id
207 addToCorpusWithQuery :: FlowCmdM env err m
211 -> (ScraperStatus -> m ())
213 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
215 logStatus ScraperStatus { _scst_succeeded = Just 10
216 , _scst_failed = Just 2
217 , _scst_remaining = Just 138
218 , _scst_events = Just []
220 printDebug "addToCorpusWithQuery" cid
222 -- TODO if cid is folder -> create Corpus
223 -- if cid is corpus -> add to corpus
224 -- if cid is root -> create corpus in Private
225 txts <- mapM (\db -> getDataText db (Multi l) q (Just 10000)) [database2origin dbs]
226 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
227 printDebug "corpus id" cids
229 pure ScraperStatus { _scst_succeeded = Just 137
230 , _scst_failed = Just 13
231 , _scst_remaining = Just 0
232 , _scst_events = Just []
235 addToCorpusWithForm :: FlowCmdM env err m
239 -> (ScraperStatus -> m ())
241 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
245 CSV_HAL -> Parser.parseFormat Parser.CsvHal
246 CSV -> Parser.parseFormat Parser.CsvGargV3
247 WOS -> Parser.parseFormat Parser.WOS
248 PresseRIS -> Parser.parseFormat Parser.RisPresse
250 logStatus ScraperStatus { _scst_succeeded = Just 1
251 , _scst_failed = Just 0
252 , _scst_remaining = Just 1
253 , _scst_events = Just []
256 printDebug "Parsing corpus: " cid
258 -- TODO granularity of the logStatus
259 docs <- liftBase $ splitEvery 500
263 printDebug "Parsing corpus finished : " cid
264 printDebug "Starting extraction : " cid
266 -- TODO granularity of the logStatus
267 _cid' <- flowCorpus user
269 (Multi $ fromMaybe EN l)
270 (map (map toHyperdataDocument) docs)
272 printDebug "Extraction finished : " cid
274 pure ScraperStatus { _scst_succeeded = Just 2
275 , _scst_failed = Just 0
276 , _scst_remaining = Just 0
277 , _scst_events = Just []
281 addToCorpusWithFile :: FlowCmdM env err m
285 -> (ScraperStatus -> m ())
287 addToCorpusWithFile cid input filetype logStatus = do
288 logStatus ScraperStatus { _scst_succeeded = Just 10
289 , _scst_failed = Just 2
290 , _scst_remaining = Just 138
291 , _scst_events = Just []
293 printDebug "addToCorpusWithFile" cid
294 _h <- postUpload cid filetype input
296 pure ScraperStatus { _scst_succeeded = Just 137
297 , _scst_failed = Just 13
298 , _scst_remaining = Just 0
299 , _scst_events = Just []