2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
25 module Gargantext.API.Corpus.New
28 --import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat(..))
29 import Control.Lens hiding (elements)
30 import Control.Monad.IO.Class (liftIO)
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Maybe (fromMaybe)
36 import Data.Text (Text)
37 import GHC.Generics (Generic)
38 import Gargantext.API.Corpus.New.File
39 import Gargantext.API.Orchestrator.Types
40 import Gargantext.Core (Lang(..))
41 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
42 import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
43 import Gargantext.Database.Flow (flowCorpusSearchInDatabase)
44 import Gargantext.Database.Types.Node (CorpusId)
45 import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
46 import Gargantext.Database.Types.Node (UserId)
47 import Gargantext.Prelude
48 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
49 import Gargantext.Text.Terms (TermType(..))
51 import Servant.API.Flatten (Flat)
52 import Servant.Job.Core
53 import Servant.Job.Types
54 import Servant.Job.Utils (jsonOptions)
55 import Servant.Multipart
56 import Test.QuickCheck (elements)
57 import Test.QuickCheck.Arbitrary
58 import Web.FormUrlEncoded (FromForm)
59 import qualified Gargantext.Text.Corpus.API as API
61 data Query = Query { query_query :: Text
62 , query_corpus_id :: Int
63 , query_databases :: [API.ExternalAPIs]
65 deriving (Eq, Show, Generic)
67 deriveJSON (unPrefix "query_") 'Query
69 instance Arbitrary Query where
70 arbitrary = elements [ Query q n fs
73 , fs <- take 3 $ repeat API.externalAPIs
76 instance ToSchema Query where
77 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
79 ------------------------------------------------------------------------
84 type PostApi = Summary "New Corpus endpoint"
85 :> ReqBody '[JSON] Query
86 :> Post '[JSON] CorpusId
87 type GetApi = Get '[JSON] ApiInfo
89 -- | TODO manage several apis
91 -- TODO this is only the POST
92 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
93 api _uId (Query q _ as) = do
94 cId <- case head as of
95 Nothing -> flowCorpusSearchInDatabase "user1" EN q
96 Just API.All -> flowCorpusSearchInDatabase "user1" EN q
98 docs <- liftIO $ API.get a q (Just 1000)
99 cId' <- flowCorpus "user1" (Left q) (Multi EN) [docs]
104 ------------------------------------------------
105 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
107 instance Arbitrary ApiInfo where
108 arbitrary = ApiInfo <$> arbitrary
110 deriveJSON (unPrefix "") 'ApiInfo
112 instance ToSchema ApiInfo
114 info :: FlowCmdM env err m => UserId -> m ApiInfo
115 info _u = pure $ ApiInfo API.externalAPIs
117 ------------------------------------------------------------------------
118 ------------------------------------------------------------------------
119 data WithQuery = WithQuery
121 , _wq_databases :: ![ExternalAPIs]
122 , _wq_lang :: !(Maybe Lang)
126 makeLenses ''WithQuery
127 instance FromJSON WithQuery where
128 parseJSON = genericParseJSON $ jsonOptions "_wq_"
129 instance ToSchema WithQuery where
130 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
132 -------------------------------------------------------
133 data WithForm = WithForm
134 { _wf_filetype :: !FileType
136 , _wf_lang :: !(Maybe Lang)
138 } deriving (Eq, Show, Generic)
140 makeLenses ''WithForm
141 instance FromForm WithForm
142 instance FromJSON WithForm where
143 parseJSON = genericParseJSON $ jsonOptions "_wf_"
144 instance ToSchema WithForm where
145 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
147 ------------------------------------------------------------------------
148 type AsyncJobs event ctI input output =
149 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
150 ------------------------------------------------------------------------
152 type Upload = Summary "Corpus Upload endpoint"
154 :> Capture "corpus_id" CorpusId
155 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
156 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
158 type AddWithQuery = Summary "Add with Query to corpus endpoint"
160 :> Capture "corpus_id" CorpusId
164 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
166 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
168 :> Capture "corpus_id" CorpusId
171 :> MultipartForm Mem (MultipartData Mem)
172 :> QueryParam "fileType" FileType
174 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
176 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
178 :> Capture "corpus_id" CorpusId
182 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
184 ------------------------------------------------------------------------
185 -- TODO WithQuery also has a corpus id
186 addToCorpusJobFunction :: FlowCmdM env err m
189 -> (ScraperStatus -> m ())
191 addToCorpusJobFunction _cid (WithQuery _q _dbs _l) logStatus = do
193 logStatus ScraperStatus { _scst_succeeded = Just 10
194 , _scst_failed = Just 2
195 , _scst_remaining = Just 138
196 , _scst_events = Just []
199 pure ScraperStatus { _scst_succeeded = Just 137
200 , _scst_failed = Just 13
201 , _scst_remaining = Just 0
202 , _scst_events = Just []
205 addToCorpusWithFile :: FlowCmdM env err m
209 -> (ScraperStatus -> m ())
211 addToCorpusWithFile cid input filetype logStatus = do
212 logStatus ScraperStatus { _scst_succeeded = Just 10
213 , _scst_failed = Just 2
214 , _scst_remaining = Just 138
215 , _scst_events = Just []
217 _h <- postUpload cid filetype input
219 pure ScraperStatus { _scst_succeeded = Just 137
220 , _scst_failed = Just 13
221 , _scst_remaining = Just 0
222 , _scst_events = Just []
225 {- | Model to fork the flow
226 -- This is not really optimized since it increases the need RAM
227 -- and freezes the whole system
228 -- This is mainly for documentation (see a better solution in the function below)
229 -- Each process has to be tailored
230 addToCorpusWithForm' :: FlowCmdM env err m
233 -> (ScraperStatus -> m ())
235 addToCorpusWithForm' cid (WithForm ft d l) logStatus = do
236 newStatus <- liftIO newEmptyMVar
237 s <- addToCorpusWithForm cid (WithForm ft d l) logStatus
238 _ <- liftIO $ forkIO $ putMVar newStatus s
239 s' <- liftIO $ takeMVar newStatus
242 addToCorpusWithForm :: FlowCmdM env err m
245 -> (ScraperStatus -> m ())
247 addToCorpusWithForm cid (WithForm ft d l _n) logStatus = do
251 CSV_HAL -> Parser.parseFormat Parser.CsvHal
252 CSV -> Parser.parseFormat Parser.CsvGargV3
253 WOS -> Parser.parseFormat Parser.WOS
254 PresseRIS -> Parser.parseFormat Parser.RisPresse
256 logStatus ScraperStatus { _scst_succeeded = Just 1
257 , _scst_failed = Just 0
258 , _scst_remaining = Just 1
259 , _scst_events = Just []
262 printDebug "Parsing corpus: " cid
264 -- TODO granularity of the logStatus
265 docs <- liftIO $ splitEvery 500
269 printDebug "Parsing corpus finished : " cid
270 printDebug "Starting extraction : " cid
272 -- TODO granularity of the logStatus
273 _cid' <- flowCorpus "user1"
275 (Multi $ fromMaybe EN l)
276 (map (map toHyperdataDocument) docs)
278 printDebug "Extraction finished : " cid
280 pure ScraperStatus { _scst_succeeded = Just 2
281 , _scst_failed = Just 0
282 , _scst_remaining = Just 0
283 , _scst_events = Just []