]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Corpus/New.hs
[API] Facto and mkdir Admin
[gargantext.git] / src / Gargantext / API / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
24
25 module Gargantext.API.Corpus.New
26 where
27
28 import Control.Lens hiding (elements)
29 import Data.Aeson
30 import Data.Aeson.TH (deriveJSON)
31 import Data.Either
32 import Data.Maybe (fromMaybe)
33 import Data.Swagger
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Admin.Orchestrator.Types
37 import Gargantext.API.Corpus.New.File
38 import Gargantext.Core (Lang(..))
39 import Gargantext.Core.Types.Individu (UserId, User(..))
40 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
41 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, flowCorpusSearchInDatabase)
42 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..))
43 import Gargantext.Prelude
44 import Gargantext.Text.Terms (TermType(..))
45 import Servant
46 import Servant.API.Flatten (Flat)
47 import Servant.Job.Core
48 import Servant.Job.Types
49 import Servant.Job.Utils (jsonOptions)
50 import Servant.Multipart
51 import Test.QuickCheck (elements)
52 import Test.QuickCheck.Arbitrary
53 import Web.FormUrlEncoded (FromForm)
54 import qualified Gargantext.Text.Corpus.API as API
55 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
56
57 ------------------------------------------------------------------------
58 data Query = Query { query_query :: Text
59 , query_corpus_id :: Int
60 , query_databases :: [API.ExternalAPIs]
61 }
62 deriving (Eq, Show, Generic)
63
64 deriveJSON (unPrefix "query_") 'Query
65
66 instance Arbitrary Query where
67 arbitrary = elements [ Query q n fs
68 | q <- ["honeybee* AND collopase"
69 ,"covid 19"]
70 , n <- [0..10]
71 , fs <- take 3 $ repeat API.externalAPIs
72 ]
73
74 instance ToSchema Query where
75 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
76
77 ------------------------------------------------------------------------
78
79 type Api = PostApi
80 :<|> GetApi
81
82 type PostApi = Summary "New Corpus endpoint"
83 :> ReqBody '[JSON] Query
84 :> Post '[JSON] CorpusId
85 type GetApi = Get '[JSON] ApiInfo
86
87 -- | TODO manage several apis
88 -- TODO-ACCESS
89 -- TODO this is only the POST
90 {-
91 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
92 api uid (Query q _ as) = do
93 cId <- case head as of
94 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
95 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
96 Just a -> do
97 docs <- liftBase $ API.get a q (Just 1000)
98 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
99 pure cId'
100
101 pure cId
102 -}
103
104 ------------------------------------------------
105 -- TODO use this route for Client implementation
106 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
107 deriving (Generic)
108 instance Arbitrary ApiInfo where
109 arbitrary = ApiInfo <$> arbitrary
110
111 deriveJSON (unPrefix "") 'ApiInfo
112
113 instance ToSchema ApiInfo
114
115 info :: FlowCmdM env err m => UserId -> m ApiInfo
116 info _u = pure $ ApiInfo API.externalAPIs
117
118 ------------------------------------------------------------------------
119 ------------------------------------------------------------------------
120 data WithQuery = WithQuery
121 { _wq_query :: !Text
122 , _wq_databases :: ![ExternalAPIs]
123 , _wq_lang :: !(Maybe Lang)
124 }
125 deriving Generic
126
127 makeLenses ''WithQuery
128 instance FromJSON WithQuery where
129 parseJSON = genericParseJSON $ jsonOptions "_wq_"
130 instance ToSchema WithQuery where
131 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
132
133 -------------------------------------------------------
134 data WithForm = WithForm
135 { _wf_filetype :: !FileType
136 , _wf_data :: !Text
137 , _wf_lang :: !(Maybe Lang)
138 , _wf_name :: !Text
139 } deriving (Eq, Show, Generic)
140
141 makeLenses ''WithForm
142 instance FromForm WithForm
143 instance FromJSON WithForm where
144 parseJSON = genericParseJSON $ jsonOptions "_wf_"
145 instance ToSchema WithForm where
146 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
147
148 ------------------------------------------------------------------------
149 type AsyncJobs event ctI input output =
150 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
151 ------------------------------------------------------------------------
152
153 type Upload = Summary "Corpus Upload endpoint"
154 :> "corpus"
155 :> Capture "corpus_id" CorpusId
156 :<|> "addWithquery"
157 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
158 :<|> "addWithfile"
159 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
160
161 type AddWithQuery = Summary "Add with Query to corpus endpoint"
162 :> "corpus"
163 :> Capture "corpus_id" CorpusId
164 :> "add"
165 :> "query"
166 :> "async"
167 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
168
169 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
170 :> "corpus"
171 :> Capture "corpus_id" CorpusId
172 :> "add"
173 :> "file"
174 :> MultipartForm Mem (MultipartData Mem)
175 :> QueryParam "fileType" FileType
176 :> "async"
177 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
178
179 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
180 :> "corpus"
181 :> Capture "corpus_id" CorpusId
182 :> "add"
183 :> "form"
184 :> "async"
185 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
186
187 ------------------------------------------------------------------------
188 -- TODO WithQuery also has a corpus id
189 addToCorpusWithQuery :: FlowCmdM env err m
190 => User
191 -> CorpusId
192 -> WithQuery
193 -> (ScraperStatus -> m ())
194 -> m ScraperStatus
195 addToCorpusWithQuery u cid (WithQuery q _dbs l) logStatus = do
196 -- TODO ...
197 logStatus ScraperStatus { _scst_succeeded = Just 10
198 , _scst_failed = Just 2
199 , _scst_remaining = Just 138
200 , _scst_events = Just []
201 }
202 printDebug "addToCorpusWithQuery" cid
203 -- TODO add cid
204 -- TODO if cid is folder -> create Corpus
205 -- if cid is corpus -> add to corpus
206 -- if cid is root -> create corpus in Private
207 cids <- flowCorpusSearchInDatabase u (maybe EN identity l) q
208 printDebug "corpus id" cids
209 -- TODO ...
210 pure ScraperStatus { _scst_succeeded = Just 137
211 , _scst_failed = Just 13
212 , _scst_remaining = Just 0
213 , _scst_events = Just []
214 }
215
216 addToCorpusWithFile :: FlowCmdM env err m
217 => CorpusId
218 -> MultipartData Mem
219 -> Maybe FileType
220 -> (ScraperStatus -> m ())
221 -> m ScraperStatus
222 addToCorpusWithFile cid input filetype logStatus = do
223 logStatus ScraperStatus { _scst_succeeded = Just 10
224 , _scst_failed = Just 2
225 , _scst_remaining = Just 138
226 , _scst_events = Just []
227 }
228 printDebug "addToCorpusWithFile" cid
229 _h <- postUpload cid filetype input
230
231 pure ScraperStatus { _scst_succeeded = Just 137
232 , _scst_failed = Just 13
233 , _scst_remaining = Just 0
234 , _scst_events = Just []
235 }
236
237 addToCorpusWithForm :: FlowCmdM env err m
238 => User
239 -> CorpusId
240 -> WithForm
241 -> (ScraperStatus -> m ())
242 -> m ScraperStatus
243 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
244
245 let
246 parse = case ft of
247 CSV_HAL -> Parser.parseFormat Parser.CsvHal
248 CSV -> Parser.parseFormat Parser.CsvGargV3
249 WOS -> Parser.parseFormat Parser.WOS
250 PresseRIS -> Parser.parseFormat Parser.RisPresse
251
252 logStatus ScraperStatus { _scst_succeeded = Just 1
253 , _scst_failed = Just 0
254 , _scst_remaining = Just 1
255 , _scst_events = Just []
256 }
257
258 printDebug "Parsing corpus: " cid
259
260 -- TODO granularity of the logStatus
261 docs <- liftBase $ splitEvery 500
262 <$> take 1000000
263 <$> parse (cs d)
264
265 printDebug "Parsing corpus finished : " cid
266 printDebug "Starting extraction : " cid
267
268 -- TODO granularity of the logStatus
269 _cid' <- flowCorpus user
270 (Right [cid])
271 (Multi $ fromMaybe EN l)
272 (map (map toHyperdataDocument) docs)
273
274 printDebug "Extraction finished : " cid
275
276 pure ScraperStatus { _scst_succeeded = Just 2
277 , _scst_failed = Just 0
278 , _scst_remaining = Just 0
279 , _scst_events = Just []
280 }
281