2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
24 module Gargantext.API.Corpus.New
27 --import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat(..))
28 import Control.Lens hiding (elements)
29 import Control.Monad.IO.Class (liftIO)
31 import Data.Aeson.TH (deriveJSON)
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Corpus.New.File
37 import Gargantext.API.Orchestrator.Types
38 import Gargantext.Core (Lang(..))
39 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
40 import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
41 import Gargantext.Database.Flow (flowCorpusSearchInDatabase)
42 import Gargantext.Database.Types.Node (CorpusId)
43 import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
44 import Gargantext.Database.Types.Node (UserId)
45 import Gargantext.Prelude
46 import Gargantext.Text.Corpus.Parsers.CSV (parseHal')
47 import Gargantext.Text.Terms (TermType(..))
49 import Servant.API.Flatten (Flat)
50 import Servant.Job.Core
51 import Servant.Job.Types
52 import Servant.Job.Utils (jsonOptions)
53 import Servant.Multipart
54 import Test.QuickCheck (elements)
55 import Test.QuickCheck.Arbitrary
56 import Web.FormUrlEncoded (FromForm)
57 import qualified Gargantext.Text.Corpus.API as API
59 data Query = Query { query_query :: Text
60 , query_corpus_id :: Int
61 , query_databases :: [API.ExternalAPIs]
63 deriving (Eq, Show, Generic)
65 deriveJSON (unPrefix "query_") 'Query
68 instance Arbitrary Query where
69 arbitrary = elements [ Query q n fs
72 , fs <- take 3 $ repeat API.externalAPIs
75 instance ToSchema Query where
76 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
78 type Api = Summary "New Corpus endpoint"
79 :> ReqBody '[JSON] Query
80 :> Post '[JSON] CorpusId
81 :<|> Get '[JSON] ApiInfo
83 -- | TODO manage several apis
85 -- TODO this is only the POST
86 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
87 api _uId (Query q _ as) = do
88 cId <- case head as of
89 Nothing -> flowCorpusSearchInDatabase "user1" EN q
90 Just API.All -> flowCorpusSearchInDatabase "user1" EN q
92 docs <- liftIO $ API.get a q (Just 1000)
93 cId' <- flowCorpus "user1" (Left q) (Multi EN) [docs]
98 ------------------------------------------------
99 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
101 instance Arbitrary ApiInfo where
102 arbitrary = ApiInfo <$> arbitrary
104 deriveJSON (unPrefix "") 'ApiInfo
106 instance ToSchema ApiInfo
108 info :: FlowCmdM env err m => UserId -> m ApiInfo
109 info _u = pure $ ApiInfo API.externalAPIs
111 ------------------------------------------------------------------------
112 ------------------------------------------------------------------------
113 data WithQuery = WithQuery
115 , _wq_databases :: ![ExternalAPIs]
119 makeLenses ''WithQuery
120 instance FromJSON WithQuery where
121 parseJSON = genericParseJSON $ jsonOptions "_wq_"
122 instance ToSchema WithQuery where
123 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
125 -------------------------------------------------------
126 data WithForm = WithForm
127 { _wf_filetype :: !FileType
129 } deriving (Eq, Show, Generic)
131 makeLenses ''WithForm
132 instance FromForm WithForm
133 instance FromJSON WithForm where
134 parseJSON = genericParseJSON $ jsonOptions "_wf_"
135 instance ToSchema WithForm where
136 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
138 ------------------------------------------------------------------------
139 type AsyncJobs event ctI input output =
140 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
141 ------------------------------------------------------------------------
142 type AddWithQuery = Summary "Add with Query to corpus endpoint"
144 :> Capture "corpus_id" CorpusId
148 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
150 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
152 :> Capture "corpus_id" CorpusId
155 :> MultipartForm Mem (MultipartData Mem)
156 :> QueryParam "fileType" FileType
158 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
160 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
162 :> Capture "corpus_id" CorpusId
166 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
168 ------------------------------------------------------------------------
169 -- TODO WithQuery also has a corpus id
170 addToCorpusJobFunction :: FlowCmdM env err m
173 -> (ScraperStatus -> m ())
175 addToCorpusJobFunction _cid (WithQuery _q _dbs) logStatus = do
177 logStatus ScraperStatus { _scst_succeeded = Just 10
178 , _scst_failed = Just 2
179 , _scst_remaining = Just 138
180 , _scst_events = Just []
183 pure ScraperStatus { _scst_succeeded = Just 137
184 , _scst_failed = Just 13
185 , _scst_remaining = Just 0
186 , _scst_events = Just []
190 addToCorpusWithFile :: FlowCmdM env err m
194 -> (ScraperStatus -> m ())
196 addToCorpusWithFile cid input filetype logStatus = do
197 logStatus ScraperStatus { _scst_succeeded = Just 10
198 , _scst_failed = Just 2
199 , _scst_remaining = Just 138
200 , _scst_events = Just []
202 _h <- postUpload cid filetype input
204 pure ScraperStatus { _scst_succeeded = Just 137
205 , _scst_failed = Just 13
206 , _scst_remaining = Just 0
207 , _scst_events = Just []
210 addToCorpusWithForm :: FlowCmdM env err m
213 -> (ScraperStatus -> m ())
215 addToCorpusWithForm cid (WithForm _ft d) logStatus = do
217 let docs = splitEvery 500
221 logStatus ScraperStatus { _scst_succeeded = Just 1
222 , _scst_failed = Just 0
223 , _scst_remaining = Just 1
224 , _scst_events = Just []
227 cid' <- flowCorpus "user1" (Right [cid]) (Multi EN) (map (map toHyperdataDocument) docs)
228 printDebug "cid'" cid'
230 pure ScraperStatus { _scst_succeeded = Just 2
231 , _scst_failed = Just 0
232 , _scst_remaining = Just 0
233 , _scst_events = Just []