2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
24 module Gargantext.API.Corpus.New
27 --import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat(..))
28 import Control.Lens hiding (elements)
29 import Control.Monad.IO.Class (liftIO)
30 import Control.Concurrent
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Maybe (fromMaybe)
36 import Data.Text (Text)
37 import GHC.Generics (Generic)
38 import Gargantext.API.Corpus.New.File
39 import Gargantext.API.Orchestrator.Types
40 import Gargantext.Core (Lang(..))
41 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
42 import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
43 import Gargantext.Database.Flow (flowCorpusSearchInDatabase)
44 import Gargantext.Database.Types.Node (CorpusId)
45 import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
46 import Gargantext.Database.Types.Node (UserId)
47 import Gargantext.Prelude
48 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
49 import Gargantext.Text.Terms (TermType(..))
51 import Servant.API.Flatten (Flat)
52 import Servant.Job.Core
53 import Servant.Job.Types
54 import Servant.Job.Utils (jsonOptions)
55 import Servant.Multipart
56 import Test.QuickCheck (elements)
57 import Test.QuickCheck.Arbitrary
58 import Web.FormUrlEncoded (FromForm)
59 import qualified Gargantext.Text.Corpus.API as API
61 data Query = Query { query_query :: Text
62 , query_corpus_id :: Int
63 , query_databases :: [API.ExternalAPIs]
65 deriving (Eq, Show, Generic)
67 deriveJSON (unPrefix "query_") 'Query
70 instance Arbitrary Query where
71 arbitrary = elements [ Query q n fs
74 , fs <- take 3 $ repeat API.externalAPIs
77 instance ToSchema Query where
78 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
80 type Api = Summary "New Corpus endpoint"
81 :> ReqBody '[JSON] Query
82 :> Post '[JSON] CorpusId
83 :<|> Get '[JSON] ApiInfo
85 -- | TODO manage several apis
87 -- TODO this is only the POST
88 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
89 api _uId (Query q _ as) = do
90 cId <- case head as of
91 Nothing -> flowCorpusSearchInDatabase "user1" EN q
92 Just API.All -> flowCorpusSearchInDatabase "user1" EN q
94 docs <- liftIO $ API.get a q (Just 1000)
95 cId' <- flowCorpus "user1" (Left q) (Multi EN) [docs]
100 ------------------------------------------------
101 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
103 instance Arbitrary ApiInfo where
104 arbitrary = ApiInfo <$> arbitrary
106 deriveJSON (unPrefix "") 'ApiInfo
108 instance ToSchema ApiInfo
110 info :: FlowCmdM env err m => UserId -> m ApiInfo
111 info _u = pure $ ApiInfo API.externalAPIs
113 ------------------------------------------------------------------------
114 ------------------------------------------------------------------------
115 data WithQuery = WithQuery
117 , _wq_databases :: ![ExternalAPIs]
118 , _wq_lang :: !(Maybe Lang)
122 makeLenses ''WithQuery
123 instance FromJSON WithQuery where
124 parseJSON = genericParseJSON $ jsonOptions "_wq_"
125 instance ToSchema WithQuery where
126 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
128 -------------------------------------------------------
129 data WithForm = WithForm
130 { _wf_filetype :: !FileType
132 , _wf_lang :: !(Maybe Lang)
133 } deriving (Eq, Show, Generic)
135 makeLenses ''WithForm
136 instance FromForm WithForm
137 instance FromJSON WithForm where
138 parseJSON = genericParseJSON $ jsonOptions "_wf_"
139 instance ToSchema WithForm where
140 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
142 ------------------------------------------------------------------------
143 type AsyncJobs event ctI input output =
144 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
145 ------------------------------------------------------------------------
147 type Upload = Summary "Corpus Upload endpoint"
149 :> Capture "corpus_id" CorpusId
150 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
151 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
154 type AddWithQuery = Summary "Add with Query to corpus endpoint"
156 :> Capture "corpus_id" CorpusId
160 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
162 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
164 :> Capture "corpus_id" CorpusId
167 :> MultipartForm Mem (MultipartData Mem)
168 :> QueryParam "fileType" FileType
170 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
172 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
174 :> Capture "corpus_id" CorpusId
178 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
180 ------------------------------------------------------------------------
181 -- TODO WithQuery also has a corpus id
182 addToCorpusJobFunction :: FlowCmdM env err m
185 -> (ScraperStatus -> m ())
187 addToCorpusJobFunction _cid (WithQuery _q _dbs _l) logStatus = do
189 logStatus ScraperStatus { _scst_succeeded = Just 10
190 , _scst_failed = Just 2
191 , _scst_remaining = Just 138
192 , _scst_events = Just []
195 pure ScraperStatus { _scst_succeeded = Just 137
196 , _scst_failed = Just 13
197 , _scst_remaining = Just 0
198 , _scst_events = Just []
202 addToCorpusWithFile :: FlowCmdM env err m
206 -> (ScraperStatus -> m ())
208 addToCorpusWithFile cid input filetype logStatus = do
209 logStatus ScraperStatus { _scst_succeeded = Just 10
210 , _scst_failed = Just 2
211 , _scst_remaining = Just 138
212 , _scst_events = Just []
214 _h <- postUpload cid filetype input
216 pure ScraperStatus { _scst_succeeded = Just 137
217 , _scst_failed = Just 13
218 , _scst_remaining = Just 0
219 , _scst_events = Just []
222 {- | Model to fork the flow
223 -- This is not really optimized since it increases the need RAM
224 -- and freezes the whole system
225 -- This is mainly for documentation (see a better solution in the function below)
226 -- Each process has to be tailored
227 addToCorpusWithForm' :: FlowCmdM env err m
230 -> (ScraperStatus -> m ())
232 addToCorpusWithForm' cid (WithForm ft d l) logStatus = do
233 newStatus <- liftIO newEmptyMVar
234 s <- addToCorpusWithForm cid (WithForm ft d l) logStatus
235 _ <- liftIO $ forkIO $ putMVar newStatus s
236 s' <- liftIO $ takeMVar newStatus
239 addToCorpusWithForm :: FlowCmdM env err m
242 -> (ScraperStatus -> m ())
244 addToCorpusWithForm cid (WithForm ft d l) logStatus = do
250 CSV_HAL -> Parser.parseFormat Parser.CsvHal
251 CSV -> Parser.parseFormat Parser.CsvGargV3
252 WOS -> Parser.parseFormat Parser.WOS
253 PresseRIS -> Parser.parseFormat Parser.RisPresse
255 newDocs <- liftIO newEmptyMVar
260 _ <- liftIO $ forkIO $ putMVar newDocs docs
262 logStatus ScraperStatus { _scst_succeeded = Just 1
263 , _scst_failed = Just 0
264 , _scst_remaining = Just 1
265 , _scst_events = Just []
268 docs' <- liftIO $ takeMVar newDocs
269 newCid <- liftIO newEmptyMVar
270 cid' <- flowCorpus "user1"
272 (Multi $ fromMaybe EN l)
273 (map (map toHyperdataDocument) docs')
274 _ <- liftIO $ forkIO $ putMVar newCid cid'
276 cid'' <- liftIO $ takeMVar newCid
277 printDebug "cid'" cid''
279 pure ScraperStatus { _scst_succeeded = Just 2
280 , _scst_failed = Just 0
281 , _scst_remaining = Just 0
282 , _scst_events = Just []