2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
25 module Gargantext.API.Corpus.New
28 import Control.Lens hiding (elements)
30 import Data.Aeson.TH (deriveJSON)
31 import Data.Maybe (fromMaybe)
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Corpus.New.File
37 import Gargantext.API.Orchestrator.Types
38 import Gargantext.Core (Lang(..))
39 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
40 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, flowCorpusSearchInDatabase)
41 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..))
42 import Gargantext.Core.Types.Individu (UserId, User(..))
43 import Gargantext.Prelude
44 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
45 import Gargantext.Text.Terms (TermType(..))
47 import Servant.API.Flatten (Flat)
48 import Servant.Job.Core
49 import Servant.Job.Types
50 import Servant.Job.Utils (jsonOptions)
51 import Servant.Multipart
52 import Test.QuickCheck (elements)
53 import Test.QuickCheck.Arbitrary
54 import Web.FormUrlEncoded (FromForm)
55 import qualified Gargantext.Text.Corpus.API as API
57 data Query = Query { query_query :: Text
58 , query_corpus_id :: Int
59 , query_databases :: [API.ExternalAPIs]
61 deriving (Eq, Show, Generic)
63 deriveJSON (unPrefix "query_") 'Query
65 instance Arbitrary Query where
66 arbitrary = elements [ Query q n fs
69 , fs <- take 3 $ repeat API.externalAPIs
72 instance ToSchema Query where
73 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
75 ------------------------------------------------------------------------
80 type PostApi = Summary "New Corpus endpoint"
81 :> ReqBody '[JSON] Query
82 :> Post '[JSON] CorpusId
83 type GetApi = Get '[JSON] ApiInfo
85 -- | TODO manage several apis
87 -- TODO this is only the POST
88 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
89 api uid (Query q _ as) = do
90 cId <- case head as of
91 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
92 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
94 docs <- liftBase $ API.get a q (Just 1000)
95 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
100 ------------------------------------------------
101 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
103 instance Arbitrary ApiInfo where
104 arbitrary = ApiInfo <$> arbitrary
106 deriveJSON (unPrefix "") 'ApiInfo
108 instance ToSchema ApiInfo
110 info :: FlowCmdM env err m => UserId -> m ApiInfo
111 info _u = pure $ ApiInfo API.externalAPIs
113 ------------------------------------------------------------------------
114 ------------------------------------------------------------------------
115 data WithQuery = WithQuery
117 , _wq_databases :: ![ExternalAPIs]
118 , _wq_lang :: !(Maybe Lang)
122 makeLenses ''WithQuery
123 instance FromJSON WithQuery where
124 parseJSON = genericParseJSON $ jsonOptions "_wq_"
125 instance ToSchema WithQuery where
126 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
128 -------------------------------------------------------
129 data WithForm = WithForm
130 { _wf_filetype :: !FileType
132 , _wf_lang :: !(Maybe Lang)
134 } deriving (Eq, Show, Generic)
136 makeLenses ''WithForm
137 instance FromForm WithForm
138 instance FromJSON WithForm where
139 parseJSON = genericParseJSON $ jsonOptions "_wf_"
140 instance ToSchema WithForm where
141 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
143 ------------------------------------------------------------------------
144 type AsyncJobs event ctI input output =
145 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
146 ------------------------------------------------------------------------
148 type Upload = Summary "Corpus Upload endpoint"
150 :> Capture "corpus_id" CorpusId
151 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
152 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
154 type AddWithQuery = Summary "Add with Query to corpus endpoint"
156 :> Capture "corpus_id" CorpusId
160 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
162 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
164 :> Capture "corpus_id" CorpusId
167 :> MultipartForm Mem (MultipartData Mem)
168 :> QueryParam "fileType" FileType
170 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
172 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
174 :> Capture "corpus_id" CorpusId
178 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
180 ------------------------------------------------------------------------
181 -- TODO WithQuery also has a corpus id
182 addToCorpusWithQuery :: FlowCmdM env err m
186 -> (ScraperStatus -> m ())
188 addToCorpusWithQuery u cid (WithQuery q _dbs l) logStatus = do
190 logStatus ScraperStatus { _scst_succeeded = Just 10
191 , _scst_failed = Just 2
192 , _scst_remaining = Just 138
193 , _scst_events = Just []
195 printDebug "addToCorpusWithQuery" cid
197 -- TODO if cid is folder -> create Corpus
198 -- if cid is corpus -> add to corpus
199 -- if cid is root -> create corpus in Private
200 cids <- flowCorpusSearchInDatabase u (maybe EN identity l) q
201 printDebug "corpus id" cids
203 pure ScraperStatus { _scst_succeeded = Just 137
204 , _scst_failed = Just 13
205 , _scst_remaining = Just 0
206 , _scst_events = Just []
209 addToCorpusWithFile :: FlowCmdM env err m
213 -> (ScraperStatus -> m ())
215 addToCorpusWithFile cid input filetype logStatus = do
216 logStatus ScraperStatus { _scst_succeeded = Just 10
217 , _scst_failed = Just 2
218 , _scst_remaining = Just 138
219 , _scst_events = Just []
221 printDebug "addToCorpusWithFile" cid
222 _h <- postUpload cid filetype input
224 pure ScraperStatus { _scst_succeeded = Just 137
225 , _scst_failed = Just 13
226 , _scst_remaining = Just 0
227 , _scst_events = Just []
230 {- | Model to fork the flow
231 -- This is not really optimized since it increases the need RAM
232 -- and freezes the whole system
233 -- This is mainly for documentation (see a better solution in the function below)
234 -- Each process has to be tailored
235 addToCorpusWithForm' :: FlowCmdM env err m
238 -> (ScraperStatus -> m ())
240 addToCorpusWithForm' cid (WithForm ft d l) logStatus = do
241 newStatus <- liftBase newEmptyMVar
242 s <- addToCorpusWithForm cid (WithForm ft d l) logStatus
243 _ <- liftBase $ forkIO $ putMVar newStatus s
244 s' <- liftBase $ takeMVar newStatus
247 addToCorpusWithForm :: FlowCmdM env err m
251 -> (ScraperStatus -> m ())
253 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
257 CSV_HAL -> Parser.parseFormat Parser.CsvHal
258 CSV -> Parser.parseFormat Parser.CsvGargV3
259 WOS -> Parser.parseFormat Parser.WOS
260 PresseRIS -> Parser.parseFormat Parser.RisPresse
262 logStatus ScraperStatus { _scst_succeeded = Just 1
263 , _scst_failed = Just 0
264 , _scst_remaining = Just 1
265 , _scst_events = Just []
268 printDebug "Parsing corpus: " cid
270 -- TODO granularity of the logStatus
271 docs <- liftBase $ splitEvery 500
275 printDebug "Parsing corpus finished : " cid
276 printDebug "Starting extraction : " cid
278 -- TODO granularity of the logStatus
279 _cid' <- flowCorpus user
281 (Multi $ fromMaybe EN l)
282 (map (map toHyperdataDocument) docs)
284 printDebug "Extraction finished : " cid
286 pure ScraperStatus { _scst_succeeded = Just 2
287 , _scst_failed = Just 0
288 , _scst_remaining = Just 0
289 , _scst_events = Just []