2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
25 module Gargantext.API.Corpus.New
28 --import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat(..))
29 import Control.Lens hiding (elements)
31 import Data.Aeson.TH (deriveJSON)
32 import Data.Maybe (fromMaybe)
35 import Data.Text (Text)
36 import GHC.Generics (Generic)
37 import Gargantext.API.Corpus.New.File
38 import Gargantext.API.Orchestrator.Types
39 import Gargantext.Core (Lang(..))
40 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
41 import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
42 import Gargantext.Database.Flow (flowCorpusSearchInDatabase)
43 import Gargantext.Database.Types.Node (CorpusId)
44 import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
45 import Gargantext.Database.Types.Node (UserId)
46 import Gargantext.Prelude
47 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
48 import Gargantext.Text.Terms (TermType(..))
50 import Servant.API.Flatten (Flat)
51 import Servant.Job.Core
52 import Servant.Job.Types
53 import Servant.Job.Utils (jsonOptions)
54 import Servant.Multipart
55 import Test.QuickCheck (elements)
56 import Test.QuickCheck.Arbitrary
57 import Web.FormUrlEncoded (FromForm)
58 import qualified Gargantext.Text.Corpus.API as API
60 data Query = Query { query_query :: Text
61 , query_corpus_id :: Int
62 , query_databases :: [API.ExternalAPIs]
64 deriving (Eq, Show, Generic)
66 deriveJSON (unPrefix "query_") 'Query
68 instance Arbitrary Query where
69 arbitrary = elements [ Query q n fs
72 , fs <- take 3 $ repeat API.externalAPIs
75 instance ToSchema Query where
76 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
78 ------------------------------------------------------------------------
83 type PostApi = Summary "New Corpus endpoint"
84 :> ReqBody '[JSON] Query
85 :> Post '[JSON] CorpusId
86 type GetApi = Get '[JSON] ApiInfo
88 -- | TODO manage several apis
90 -- TODO this is only the POST
91 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
92 api _uId (Query q _ as) = do
93 cId <- case head as of
94 Nothing -> flowCorpusSearchInDatabase "user1" EN q
95 Just API.All -> flowCorpusSearchInDatabase "user1" EN q
97 docs <- liftBase $ API.get a q (Just 1000)
98 cId' <- flowCorpus "user1" (Left q) (Multi EN) [docs]
103 ------------------------------------------------
104 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
106 instance Arbitrary ApiInfo where
107 arbitrary = ApiInfo <$> arbitrary
109 deriveJSON (unPrefix "") 'ApiInfo
111 instance ToSchema ApiInfo
113 info :: FlowCmdM env err m => UserId -> m ApiInfo
114 info _u = pure $ ApiInfo API.externalAPIs
116 ------------------------------------------------------------------------
117 ------------------------------------------------------------------------
118 data WithQuery = WithQuery
120 , _wq_databases :: ![ExternalAPIs]
121 , _wq_lang :: !(Maybe Lang)
125 makeLenses ''WithQuery
126 instance FromJSON WithQuery where
127 parseJSON = genericParseJSON $ jsonOptions "_wq_"
128 instance ToSchema WithQuery where
129 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
131 -------------------------------------------------------
132 data WithForm = WithForm
133 { _wf_filetype :: !FileType
135 , _wf_lang :: !(Maybe Lang)
137 } deriving (Eq, Show, Generic)
139 makeLenses ''WithForm
140 instance FromForm WithForm
141 instance FromJSON WithForm where
142 parseJSON = genericParseJSON $ jsonOptions "_wf_"
143 instance ToSchema WithForm where
144 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
146 ------------------------------------------------------------------------
147 type AsyncJobs event ctI input output =
148 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
149 ------------------------------------------------------------------------
151 type Upload = Summary "Corpus Upload endpoint"
153 :> Capture "corpus_id" CorpusId
154 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
155 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
157 type AddWithQuery = Summary "Add with Query to corpus endpoint"
159 :> Capture "corpus_id" CorpusId
163 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
165 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
167 :> Capture "corpus_id" CorpusId
170 :> MultipartForm Mem (MultipartData Mem)
171 :> QueryParam "fileType" FileType
173 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
175 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
177 :> Capture "corpus_id" CorpusId
181 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
183 ------------------------------------------------------------------------
184 -- TODO WithQuery also has a corpus id
185 addToCorpusJobFunction :: FlowCmdM env err m
188 -> (ScraperStatus -> m ())
190 addToCorpusJobFunction _cid (WithQuery _q _dbs _l) logStatus = do
192 logStatus ScraperStatus { _scst_succeeded = Just 10
193 , _scst_failed = Just 2
194 , _scst_remaining = Just 138
195 , _scst_events = Just []
198 pure ScraperStatus { _scst_succeeded = Just 137
199 , _scst_failed = Just 13
200 , _scst_remaining = Just 0
201 , _scst_events = Just []
204 addToCorpusWithFile :: FlowCmdM env err m
208 -> (ScraperStatus -> m ())
210 addToCorpusWithFile cid input filetype logStatus = do
211 logStatus ScraperStatus { _scst_succeeded = Just 10
212 , _scst_failed = Just 2
213 , _scst_remaining = Just 138
214 , _scst_events = Just []
216 printDebug "addToCorpusWithFile" cid
217 _h <- postUpload cid filetype input
219 pure ScraperStatus { _scst_succeeded = Just 137
220 , _scst_failed = Just 13
221 , _scst_remaining = Just 0
222 , _scst_events = Just []
225 {- | Model to fork the flow
226 -- This is not really optimized since it increases the need RAM
227 -- and freezes the whole system
228 -- This is mainly for documentation (see a better solution in the function below)
229 -- Each process has to be tailored
230 addToCorpusWithForm' :: FlowCmdM env err m
233 -> (ScraperStatus -> m ())
235 addToCorpusWithForm' cid (WithForm ft d l) logStatus = do
236 newStatus <- liftBase newEmptyMVar
237 s <- addToCorpusWithForm cid (WithForm ft d l) logStatus
238 _ <- liftBase $ forkIO $ putMVar newStatus s
239 s' <- liftBase $ takeMVar newStatus
242 addToCorpusWithForm :: FlowCmdM env err m
246 -> (ScraperStatus -> m ())
248 addToCorpusWithForm username cid (WithForm ft d l _n) logStatus = do
252 CSV_HAL -> Parser.parseFormat Parser.CsvHal
253 CSV -> Parser.parseFormat Parser.CsvGargV3
254 WOS -> Parser.parseFormat Parser.WOS
255 PresseRIS -> Parser.parseFormat Parser.RisPresse
257 logStatus ScraperStatus { _scst_succeeded = Just 1
258 , _scst_failed = Just 0
259 , _scst_remaining = Just 1
260 , _scst_events = Just []
263 printDebug "Parsing corpus: " cid
265 -- TODO granularity of the logStatus
266 docs <- liftBase $ splitEvery 500
270 printDebug "Parsing corpus finished : " cid
271 printDebug "Starting extraction : " cid
273 -- TODO granularity of the logStatus
274 _cid' <- flowCorpus username
276 (Multi $ fromMaybe EN l)
277 (map (map toHyperdataDocument) docs)
279 printDebug "Extraction finished : " cid
281 pure ScraperStatus { _scst_succeeded = Just 2
282 , _scst_failed = Just 0
283 , _scst_remaining = Just 0
284 , _scst_events = Just []