2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
25 module Gargantext.API.Corpus.New
28 --import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat(..))
29 import Control.Lens hiding (elements)
30 import Control.Monad.IO.Class (liftIO)
31 import Control.Concurrent
33 import Data.Aeson.TH (deriveJSON)
34 import Data.Maybe (fromMaybe)
37 import Data.Text (Text)
38 import GHC.Generics (Generic)
39 import Gargantext.API.Corpus.New.File
40 import Gargantext.API.Orchestrator.Types
41 import Gargantext.Core (Lang(..))
42 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
43 import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
44 import Gargantext.Database.Flow (flowCorpusSearchInDatabase)
45 import Gargantext.Database.Types.Node (CorpusId)
46 import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
47 import Gargantext.Database.Types.Node (UserId)
48 import Gargantext.Prelude
49 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
50 import Gargantext.Text.Terms (TermType(..))
52 import Servant.API.Flatten (Flat)
53 import Servant.Job.Core
54 import Servant.Job.Types
55 import Servant.Job.Utils (jsonOptions)
56 import Servant.Multipart
57 import Test.QuickCheck (elements)
58 import Test.QuickCheck.Arbitrary
59 import Web.FormUrlEncoded (FromForm)
60 import qualified Gargantext.Text.Corpus.API as API
62 data Query = Query { query_query :: Text
63 , query_corpus_id :: Int
64 , query_databases :: [API.ExternalAPIs]
66 deriving (Eq, Show, Generic)
68 deriveJSON (unPrefix "query_") 'Query
71 instance Arbitrary Query where
72 arbitrary = elements [ Query q n fs
75 , fs <- take 3 $ repeat API.externalAPIs
78 instance ToSchema Query where
79 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
81 ------------------------------------------------------------------------
86 type PostApi = Summary "New Corpus endpoint"
87 :> ReqBody '[JSON] Query
88 :> Post '[JSON] CorpusId
89 type GetApi = Get '[JSON] ApiInfo
91 -- | TODO manage several apis
93 -- TODO this is only the POST
94 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
95 api _uId (Query q _ as) = do
96 cId <- case head as of
97 Nothing -> flowCorpusSearchInDatabase "user1" EN q
98 Just API.All -> flowCorpusSearchInDatabase "user1" EN q
100 docs <- liftIO $ API.get a q (Just 1000)
101 cId' <- flowCorpus "user1" (Left q) (Multi EN) [docs]
106 ------------------------------------------------
107 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
109 instance Arbitrary ApiInfo where
110 arbitrary = ApiInfo <$> arbitrary
112 deriveJSON (unPrefix "") 'ApiInfo
114 instance ToSchema ApiInfo
116 info :: FlowCmdM env err m => UserId -> m ApiInfo
117 info _u = pure $ ApiInfo API.externalAPIs
119 ------------------------------------------------------------------------
120 ------------------------------------------------------------------------
121 data WithQuery = WithQuery
123 , _wq_databases :: ![ExternalAPIs]
124 , _wq_lang :: !(Maybe Lang)
128 makeLenses ''WithQuery
129 instance FromJSON WithQuery where
130 parseJSON = genericParseJSON $ jsonOptions "_wq_"
131 instance ToSchema WithQuery where
132 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
134 -------------------------------------------------------
135 data WithForm = WithForm
136 { _wf_filetype :: !FileType
138 , _wf_lang :: !(Maybe Lang)
140 } deriving (Eq, Show, Generic)
142 makeLenses ''WithForm
143 instance FromForm WithForm
144 instance FromJSON WithForm where
145 parseJSON = genericParseJSON $ jsonOptions "_wf_"
146 instance ToSchema WithForm where
147 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
149 ------------------------------------------------------------------------
150 type AsyncJobs event ctI input output =
151 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
152 ------------------------------------------------------------------------
154 type Upload = Summary "Corpus Upload endpoint"
156 :> Capture "corpus_id" CorpusId
157 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
158 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
161 type AddWithQuery = Summary "Add with Query to corpus endpoint"
163 :> Capture "corpus_id" CorpusId
167 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
169 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
171 :> Capture "corpus_id" CorpusId
174 :> MultipartForm Mem (MultipartData Mem)
175 :> QueryParam "fileType" FileType
177 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
179 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
181 :> Capture "corpus_id" CorpusId
185 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
187 ------------------------------------------------------------------------
188 -- TODO WithQuery also has a corpus id
189 addToCorpusJobFunction :: FlowCmdM env err m
192 -> (ScraperStatus -> m ())
194 addToCorpusJobFunction _cid (WithQuery _q _dbs _l) logStatus = do
196 logStatus ScraperStatus { _scst_succeeded = Just 10
197 , _scst_failed = Just 2
198 , _scst_remaining = Just 138
199 , _scst_events = Just []
202 pure ScraperStatus { _scst_succeeded = Just 137
203 , _scst_failed = Just 13
204 , _scst_remaining = Just 0
205 , _scst_events = Just []
209 addToCorpusWithFile :: FlowCmdM env err m
213 -> (ScraperStatus -> m ())
215 addToCorpusWithFile cid input filetype logStatus = do
216 logStatus ScraperStatus { _scst_succeeded = Just 10
217 , _scst_failed = Just 2
218 , _scst_remaining = Just 138
219 , _scst_events = Just []
221 _h <- postUpload cid filetype input
223 pure ScraperStatus { _scst_succeeded = Just 137
224 , _scst_failed = Just 13
225 , _scst_remaining = Just 0
226 , _scst_events = Just []
229 {- | Model to fork the flow
230 -- This is not really optimized since it increases the need RAM
231 -- and freezes the whole system
232 -- This is mainly for documentation (see a better solution in the function below)
233 -- Each process has to be tailored
234 addToCorpusWithForm' :: FlowCmdM env err m
237 -> (ScraperStatus -> m ())
239 addToCorpusWithForm' cid (WithForm ft d l) logStatus = do
240 newStatus <- liftIO newEmptyMVar
241 s <- addToCorpusWithForm cid (WithForm ft d l) logStatus
242 _ <- liftIO $ forkIO $ putMVar newStatus s
243 s' <- liftIO $ takeMVar newStatus
246 addToCorpusWithForm :: FlowCmdM env err m
249 -> (ScraperStatus -> m ())
251 addToCorpusWithForm cid (WithForm ft d l _n) logStatus = do
257 CSV_HAL -> Parser.parseFormat Parser.CsvHal
258 CSV -> Parser.parseFormat Parser.CsvGargV3
259 WOS -> Parser.parseFormat Parser.WOS
260 PresseRIS -> Parser.parseFormat Parser.RisPresse
262 newDocs <- liftIO newEmptyMVar
267 _ <- liftIO $ forkIO $ putMVar newDocs docs
269 logStatus ScraperStatus { _scst_succeeded = Just 1
270 , _scst_failed = Just 0
271 , _scst_remaining = Just 1
272 , _scst_events = Just []
275 docs' <- liftIO $ takeMVar newDocs
276 newCid <- liftIO newEmptyMVar
277 cid' <- flowCorpus "user1"
279 (Multi $ fromMaybe EN l)
280 (map (map toHyperdataDocument) docs')
281 _ <- liftIO $ forkIO $ putMVar newCid cid'
283 cid'' <- liftIO $ takeMVar newCid
284 printDebug "cid'" cid''
286 pure ScraperStatus { _scst_succeeded = Just 2
287 , _scst_failed = Just 0
288 , _scst_remaining = Just 0
289 , _scst_events = Just []