2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
24 module Gargantext.API.Corpus.New
27 --import Debug.Trace (trace)
28 import Web.FormUrlEncoded (FromForm)
30 import Control.Monad.IO.Class (liftIO)
31 import Data.Aeson.TH (deriveJSON)
33 import Servant.Job.Utils (jsonOptions)
34 import Control.Lens hiding (elements)
35 import Servant.Multipart
37 import Data.Text (Text)
38 import GHC.Generics (Generic)
39 import Servant.Job.Types
40 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
41 import Gargantext.Database.Flow (flowCorpusSearchInDatabase)
42 import Gargantext.Database.Types.Node (CorpusId)
43 import Gargantext.Text.Terms (TermType(..))
44 import Gargantext.Prelude
45 import Gargantext.API.Orchestrator.Types
47 -- import Servant.Job.Server
48 import Test.QuickCheck (elements)
49 import Test.QuickCheck.Arbitrary
50 import Gargantext.Core (Lang(..))
51 import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
52 import qualified Gargantext.Text.Corpus.API as API
53 --import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat(..))
54 import Gargantext.Text.Corpus.Parsers.CSV (parseHal')
55 import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
56 import Gargantext.Database.Types.Node (UserId)
57 import Gargantext.API.Corpus.New.File
59 data Query = Query { query_query :: Text
60 , query_corpus_id :: Int
61 , query_databases :: [API.ExternalAPIs]
63 deriving (Eq, Show, Generic)
65 deriveJSON (unPrefix "query_") 'Query
68 instance Arbitrary Query where
69 arbitrary = elements [ Query q n fs
72 , fs <- take 3 $ repeat API.externalAPIs
75 instance ToSchema Query where
76 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
78 type Api = Summary "New Corpus endpoint"
79 :> ReqBody '[JSON] Query
80 :> Post '[JSON] CorpusId
81 :<|> Get '[JSON] ApiInfo
83 -- | TODO manage several apis
85 -- TODO this is only the POST
86 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
87 api _uId (Query q _ as) = do
88 cId <- case head as of
89 Nothing -> flowCorpusSearchInDatabase "user1" EN q
90 Just API.All -> flowCorpusSearchInDatabase "user1" EN q
92 docs <- liftIO $ API.get a q (Just 1000)
93 cId' <- flowCorpus "user1" (Left q) (Multi EN) [docs]
98 ------------------------------------------------
99 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
101 instance Arbitrary ApiInfo where
102 arbitrary = ApiInfo <$> arbitrary
104 deriveJSON (unPrefix "") 'ApiInfo
106 instance ToSchema ApiInfo
108 info :: FlowCmdM env err m => UserId -> m ApiInfo
109 info _u = pure $ ApiInfo API.externalAPIs
112 -- Proposal to replace the Query type which seems to generically named.
113 data ScraperInput = ScraperInput
114 { _scin_query :: !Text
115 , _scin_corpus_id :: !Int
116 , _scin_databases :: [API.ExternalAPIs]
118 deriving (Eq, Show, Generic)
120 makeLenses ''ScraperInput
122 deriveJSON (unPrefix "_scin_") 'ScraperInput
124 data ScraperEvent = ScraperEvent
125 { _scev_message :: !(Maybe Text)
126 , _scev_level :: !(Maybe Text)
127 , _scev_date :: !(Maybe Text)
131 deriveJSON (unPrefix "_scev_") 'ScraperEvent
133 data ScraperStatus = ScraperStatus
134 { _scst_succeeded :: !(Maybe Int)
135 , _scst_failed :: !(Maybe Int)
136 , _scst_remaining :: !(Maybe Int)
137 , _scst_events :: !(Maybe [ScraperEvent])
141 deriveJSON (unPrefix "_scst_") 'ScraperStatus
146 ------------------------------------------------------------------------
147 ------------------------------------------------------------------------
148 data WithQuery = WithQuery
150 , _wq_databases :: ![ExternalAPIs]
154 makeLenses ''WithQuery
156 instance FromJSON WithQuery where
157 parseJSON = genericParseJSON $ jsonOptions "_wq_"
159 instance ToSchema WithQuery
160 -------------------------------------------------------
161 data WithForm = WithForm
162 { _wf_filetype :: !FileType
164 } deriving (Eq, Show, Generic)
166 makeLenses ''WithForm
168 instance FromJSON WithForm where
169 parseJSON = genericParseJSON $ jsonOptions "_wf_"
170 instance ToSchema WithForm
171 instance FromForm WithForm
174 ------------------------------------------------------------------------
176 AddAPI withInput = AsyncJobsAPI ScraperStatus withInput ScraperStatus
177 ------------------------------------------------------------------------
179 type AddWithQuery = Summary "Add with Query to corpus endpoint"
181 :> Capture "corpus_id" CorpusId
187 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
189 :> Capture "corpus_id" CorpusId
192 :> MultipartForm Mem (MultipartData Mem)
193 :> QueryParam "fileType" FileType
197 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
199 :> Capture "corpus_id" CorpusId
202 :> ReqBody '[FormUrlEncoded] WithForm
206 ------------------------------------------------------------------------
207 -- TODO WithQuery also has a corpus id
208 addToCorpusJobFunction :: FlowCmdM env err m
211 -> (ScraperStatus -> m ())
213 addToCorpusJobFunction _cid _input logStatus = do
215 logStatus ScraperStatus { _scst_succeeded = Just 10
216 , _scst_failed = Just 2
217 , _scst_remaining = Just 138
218 , _scst_events = Just []
221 pure ScraperStatus { _scst_succeeded = Just 137
222 , _scst_failed = Just 13
223 , _scst_remaining = Just 0
224 , _scst_events = Just []
228 addToCorpusWithFile :: FlowCmdM env err m
232 -> (ScraperStatus -> m ())
234 addToCorpusWithFile cid input filetype logStatus = do
235 logStatus ScraperStatus { _scst_succeeded = Just 10
236 , _scst_failed = Just 2
237 , _scst_remaining = Just 138
238 , _scst_events = Just []
240 _h <- postUpload cid filetype input
242 pure ScraperStatus { _scst_succeeded = Just 137
243 , _scst_failed = Just 13
244 , _scst_remaining = Just 0
245 , _scst_events = Just []
248 addToCorpusWithForm :: FlowCmdM env err m
251 -> (ScraperStatus -> m ())
253 addToCorpusWithForm cid (WithForm _ft d) logStatus = do
255 let docs = splitEvery 500
259 cid' <- flowCorpus "user1" (Right [cid]) (Multi EN) (map (map toHyperdataDocument) docs)
260 printDebug "cid'" cid'
262 logStatus ScraperStatus { _scst_succeeded = Just 10
263 , _scst_failed = Just 2
264 , _scst_remaining = Just 138
265 , _scst_events = Just []
268 pure ScraperStatus { _scst_succeeded = Just 137
269 , _scst_failed = Just 13
270 , _scst_remaining = Just 0
271 , _scst_events = Just []