2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
25 module Gargantext.API.Corpus.New
28 import Control.Lens hiding (elements)
30 import Data.Aeson.TH (deriveJSON)
32 import Data.Maybe (fromMaybe)
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Admin.Orchestrator.Types
37 import Gargantext.API.Corpus.New.File
38 import Gargantext.Core (Lang(..), allLangs)
39 import Gargantext.Core.Types.Individu (UserId, User(..))
40 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
41 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..), allDataOrigins)
42 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..))
43 import Gargantext.Prelude
45 import Servant.API.Flatten (Flat)
46 import Servant.Job.Core
47 import Servant.Job.Types
48 import Servant.Job.Utils (jsonOptions)
49 import Servant.Multipart
50 import Test.QuickCheck (elements)
51 import Test.QuickCheck.Arbitrary
52 import Web.FormUrlEncoded (FromForm)
53 import qualified Gargantext.Text.Corpus.API as API
54 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
56 ------------------------------------------------------------------------
57 data Query = Query { query_query :: Text
58 , query_node_id :: Int
60 , query_databases :: [DataOrigin]
62 deriving (Eq, Generic)
64 deriveJSON (unPrefix "query_") 'Query
66 instance Arbitrary Query where
67 arbitrary = elements [ Query q n la fs
68 | q <- ["honeybee* AND collapse"
73 , fs <- take 3 $ repeat allDataOrigins
76 instance ToSchema Query where
77 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
79 ------------------------------------------------------------------------
84 type PostApi = Summary "New Corpus endpoint"
85 :> ReqBody '[JSON] Query
86 :> Post '[JSON] CorpusId
87 type GetApi = Get '[JSON] ApiInfo
89 -- | TODO manage several apis
91 -- TODO this is only the POST
93 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
94 api uid (Query q _ as) = do
95 cId <- case head as of
96 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
97 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
99 docs <- liftBase $ API.get a q (Just 1000)
100 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
106 ------------------------------------------------
107 -- TODO use this route for Client implementation
108 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
110 instance Arbitrary ApiInfo where
111 arbitrary = ApiInfo <$> arbitrary
113 deriveJSON (unPrefix "") 'ApiInfo
115 instance ToSchema ApiInfo
117 info :: FlowCmdM env err m => UserId -> m ApiInfo
118 info _u = pure $ ApiInfo API.externalAPIs
120 ------------------------------------------------------------------------
121 ------------------------------------------------------------------------
122 data WithQuery = WithQuery
124 , _wq_databases :: ![DataOrigin]
125 , _wq_lang :: !(Maybe (TermType Lang))
129 makeLenses ''WithQuery
130 instance FromJSON WithQuery where
131 parseJSON = genericParseJSON $ jsonOptions "_wq_"
132 instance ToSchema WithQuery where
133 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
135 -------------------------------------------------------
136 data WithForm = WithForm
137 { _wf_filetype :: !FileType
139 , _wf_lang :: !(Maybe Lang)
141 } deriving (Eq, Show, Generic)
143 makeLenses ''WithForm
144 instance FromForm WithForm
145 instance FromJSON WithForm where
146 parseJSON = genericParseJSON $ jsonOptions "_wf_"
147 instance ToSchema WithForm where
148 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
150 ------------------------------------------------------------------------
151 type AsyncJobs event ctI input output =
152 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
153 ------------------------------------------------------------------------
155 type Upload = Summary "Corpus Upload endpoint"
157 :> Capture "corpus_id" CorpusId
159 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
161 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
163 type AddWithQuery = Summary "Add with Query to corpus endpoint"
165 :> Capture "corpus_id" CorpusId
169 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
171 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
173 :> Capture "corpus_id" CorpusId
176 :> MultipartForm Mem (MultipartData Mem)
177 :> QueryParam "fileType" FileType
179 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
181 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
183 :> Capture "corpus_id" CorpusId
187 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
189 ------------------------------------------------------------------------
190 -- TODO WithQuery also has a corpus id
191 addToCorpusWithQuery :: FlowCmdM env err m
195 -> (ScraperStatus -> m ())
197 addToCorpusWithQuery u cid (WithQuery q dbs l) logStatus = do
199 logStatus ScraperStatus { _scst_succeeded = Just 10
200 , _scst_failed = Just 2
201 , _scst_remaining = Just 138
202 , _scst_events = Just []
204 printDebug "addToCorpusWithQuery" cid
206 -- TODO if cid is folder -> create Corpus
207 -- if cid is corpus -> add to corpus
208 -- if cid is root -> create corpus in Private
209 txts <- mapM (\db -> getDataText db (fromMaybe (Multi EN) l) q (Just 10000)) dbs
210 cids <- mapM (\txt -> flowDataText u txt (fromMaybe (Multi EN) l) cid) txts
211 printDebug "corpus id" cids
213 pure ScraperStatus { _scst_succeeded = Just 137
214 , _scst_failed = Just 13
215 , _scst_remaining = Just 0
216 , _scst_events = Just []
219 addToCorpusWithFile :: FlowCmdM env err m
223 -> (ScraperStatus -> m ())
225 addToCorpusWithFile cid input filetype logStatus = do
226 logStatus ScraperStatus { _scst_succeeded = Just 10
227 , _scst_failed = Just 2
228 , _scst_remaining = Just 138
229 , _scst_events = Just []
231 printDebug "addToCorpusWithFile" cid
232 _h <- postUpload cid filetype input
234 pure ScraperStatus { _scst_succeeded = Just 137
235 , _scst_failed = Just 13
236 , _scst_remaining = Just 0
237 , _scst_events = Just []
240 addToCorpusWithForm :: FlowCmdM env err m
244 -> (ScraperStatus -> m ())
246 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
250 CSV_HAL -> Parser.parseFormat Parser.CsvHal
251 CSV -> Parser.parseFormat Parser.CsvGargV3
252 WOS -> Parser.parseFormat Parser.WOS
253 PresseRIS -> Parser.parseFormat Parser.RisPresse
255 logStatus ScraperStatus { _scst_succeeded = Just 1
256 , _scst_failed = Just 0
257 , _scst_remaining = Just 1
258 , _scst_events = Just []
261 printDebug "Parsing corpus: " cid
263 -- TODO granularity of the logStatus
264 docs <- liftBase $ splitEvery 500
268 printDebug "Parsing corpus finished : " cid
269 printDebug "Starting extraction : " cid
271 -- TODO granularity of the logStatus
272 _cid' <- flowCorpus user
274 (Multi $ fromMaybe EN l)
275 (map (map toHyperdataDocument) docs)
277 printDebug "Extraction finished : " cid
279 pure ScraperStatus { _scst_succeeded = Just 2
280 , _scst_failed = Just 0
281 , _scst_remaining = Just 0
282 , _scst_events = Just []