2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17 {-# OPTIONS_GHC -fno-warn-orphans #-}
19 module Gargantext.API.Node.Corpus.New
22 import Control.Lens hiding (elements, Empty)
24 import Data.Aeson.TH (deriveJSON)
26 import Data.Maybe (fromMaybe)
28 import Data.Text (Text)
29 import GHC.Generics (Generic)
30 import Gargantext.API.Admin.Orchestrator.Types (ScraperStatus(..))
31 import qualified Gargantext.API.Admin.Orchestrator.Types as T
32 import Gargantext.API.Node.Corpus.New.File
33 import Gargantext.Core (Lang(..){-, allLangs-})
34 import Gargantext.Core.Types.Individu (User(..))
35 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
36 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
37 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..), UserId)
38 import Gargantext.Prelude
40 import Servant.Job.Core
41 import Servant.Job.Types
42 import Servant.Job.Utils (jsonOptions)
43 -- import Servant.Multipart
44 -- import Test.QuickCheck (elements)
45 import Test.QuickCheck.Arbitrary
46 import Web.FormUrlEncoded (FromForm)
47 import qualified Gargantext.Text.Corpus.API as API
48 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
50 ------------------------------------------------------------------------
52 data Query = Query { query_query :: Text
53 , query_node_id :: Int
55 , query_databases :: [DataOrigin]
57 deriving (Eq, Generic)
59 deriveJSON (unPrefix "query_") 'Query
61 instance Arbitrary Query where
62 arbitrary = elements [ Query q n la fs
63 | q <- ["honeybee* AND collapse"
68 , fs <- take 3 $ repeat allDataOrigins
71 instance ToSchema Query where
72 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
75 ------------------------------------------------------------------------
81 type PostApi = Summary "New Corpus endpoint"
82 :> ReqBody '[JSON] Query
83 :> Post '[JSON] CorpusId
84 type GetApi = Get '[JSON] ApiInfo
87 -- | TODO manage several apis
89 -- TODO this is only the POST
91 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
92 api uid (Query q _ as) = do
93 cId <- case head as of
94 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
95 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
97 docs <- liftBase $ API.get a q (Just 1000)
98 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
104 ------------------------------------------------
105 -- TODO use this route for Client implementation
106 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
108 instance Arbitrary ApiInfo where
109 arbitrary = ApiInfo <$> arbitrary
111 deriveJSON (unPrefix "") 'ApiInfo
113 instance ToSchema ApiInfo
115 info :: FlowCmdM env err m => UserId -> m ApiInfo
116 info _u = pure $ ApiInfo API.externalAPIs
118 ------------------------------------------------------------------------
120 data Database = Empty
125 deriving (Eq, Show, Generic)
127 deriveJSON (unPrefix "") ''Database
128 instance ToSchema Database
130 database2origin :: Database -> DataOrigin
131 database2origin Empty = InternalOrigin T.IsTex
132 database2origin PubMed = ExternalOrigin T.PubMed
133 database2origin HAL = ExternalOrigin T.HAL
134 database2origin IsTex = ExternalOrigin T.IsTex
135 database2origin Isidore = ExternalOrigin T.Isidore
137 ------------------------------------------------------------------------
138 data WithQuery = WithQuery
140 , _wq_databases :: !Database
142 , _wq_node_id :: !Int
146 makeLenses ''WithQuery
147 instance FromJSON WithQuery where
148 parseJSON = genericParseJSON $ jsonOptions "_wq_"
149 instance ToSchema WithQuery where
150 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
152 -------------------------------------------------------
153 data NewWithForm = NewWithForm
154 { _wf_filetype :: !FileType
156 , _wf_lang :: !(Maybe Lang)
158 } deriving (Eq, Show, Generic)
160 makeLenses ''NewWithForm
161 instance FromForm NewWithForm
162 instance FromJSON NewWithForm where
163 parseJSON = genericParseJSON $ jsonOptions "_wf_"
164 instance ToSchema NewWithForm where
165 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
167 ------------------------------------------------------------------------
168 type AsyncJobs event ctI input output =
169 AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
170 ------------------------------------------------------------------------
172 type AddWithQuery = Summary "Add with Query to corpus endpoint"
174 :> Capture "corpus_id" CorpusId
176 :> AsyncJobs ScraperStatus '[JSON] WithQuery ScraperStatus
179 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
181 :> Capture "corpus_id" CorpusId
184 :> MultipartForm Mem (MultipartData Mem)
185 :> QueryParam "fileType" FileType
187 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
190 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
192 :> Capture "corpus_id" CorpusId
196 :> AsyncJobs ScraperStatus '[FormUrlEncoded] NewWithForm ScraperStatus
199 ------------------------------------------------------------------------
200 -- TODO WithQuery also has a corpus id
201 addToCorpusWithQuery :: FlowCmdM env err m
205 -> (ScraperStatus -> m ())
207 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
209 logStatus ScraperStatus { _scst_succeeded = Just 10
210 , _scst_failed = Just 2
211 , _scst_remaining = Just 138
212 , _scst_events = Just []
214 printDebug "addToCorpusWithQuery" cid
216 -- TODO if cid is folder -> create Corpus
217 -- if cid is corpus -> add to corpus
218 -- if cid is root -> create corpus in Private
219 txts <- mapM (\db -> getDataText db (Multi l) q (Just 10000)) [database2origin dbs]
220 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
221 printDebug "corpus id" cids
223 pure ScraperStatus { _scst_succeeded = Just 137
224 , _scst_failed = Just 13
225 , _scst_remaining = Just 0
226 , _scst_events = Just []
229 addToCorpusWithForm :: FlowCmdM env err m
233 -> (ScraperStatus -> m ())
235 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
239 CSV_HAL -> Parser.parseFormat Parser.CsvHal
240 CSV -> Parser.parseFormat Parser.CsvGargV3
241 WOS -> Parser.parseFormat Parser.WOS
242 PresseRIS -> Parser.parseFormat Parser.RisPresse
244 logStatus ScraperStatus { _scst_succeeded = Just 1
245 , _scst_failed = Just 0
246 , _scst_remaining = Just 1
247 , _scst_events = Just []
250 printDebug "Parsing corpus: " cid
252 -- TODO granularity of the logStatus
253 docs <- liftBase $ splitEvery 500
257 printDebug "Parsing corpus finished : " cid
258 printDebug "Starting extraction : " cid
260 -- TODO granularity of the logStatus
261 _cid' <- flowCorpus user
263 (Multi $ fromMaybe EN l)
264 (map (map toHyperdataDocument) docs)
266 printDebug "Extraction finished : " cid
268 pure ScraperStatus { _scst_succeeded = Just 2
269 , _scst_failed = Just 0
270 , _scst_remaining = Just 0
271 , _scst_events = Just []
275 addToCorpusWithFile :: FlowCmdM env err m
279 -> (ScraperStatus -> m ())
281 addToCorpusWithFile cid input filetype logStatus = do
282 logStatus ScraperStatus { _scst_succeeded = Just 10
283 , _scst_failed = Just 2
284 , _scst_remaining = Just 138
285 , _scst_events = Just []
287 printDebug "addToCorpusWithFile" cid
288 _h <- postUpload cid filetype input
290 pure ScraperStatus { _scst_succeeded = Just 137
291 , _scst_failed = Just 13
292 , _scst_remaining = Just 0
293 , _scst_events = Just []