{-|
Module      : Gargantext.API.Node.Corpus.New
Description : New corpus API
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

New corpus means either:
- a new corpus
- new data in an existing corpus
-}

{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE DataKinds         #-}
{-# LANGUAGE TypeOperators     #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts  #-}
{-# LANGUAGE RankNTypes        #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}

module Gargantext.API.Node.Corpus.New
  where

import Control.Lens hiding (elements, Empty)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.Maybe (fromMaybe)
import Data.Swagger
import Data.Text (Text)
import GHC.Generics (Generic)
import Gargantext.API.Admin.Orchestrator.Types (ScraperStatus(..))
import qualified Gargantext.API.Admin.Orchestrator.Types as T
import Gargantext.API.Node.Corpus.New.File
import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..), UserId)
import Gargantext.Prelude
import Servant
import Servant.API.Flatten (Flat)
import Servant.Job.Core
import Servant.Job.Types
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary
import Web.FormUrlEncoded (FromForm)
import qualified Gargantext.Text.Corpus.API as API
import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)

------------------------------------------------------------------------
{-
data Query = Query { query_query     :: Text
                   , query_node_id   :: Int
                   , query_lang      :: Lang
                   , query_databases :: [DataOrigin]
                   }
  deriving (Eq, Generic)

deriveJSON (unPrefix "query_") 'Query

instance Arbitrary Query where
  arbitrary = elements [ Query q n la fs
                       | q  <- ["honeybee* AND collapse"
                               ,"covid 19"
                               ]
                       , n  <- [0..10]
                       , la <- allLangs
                       , fs <- take 3 $ repeat allDataOrigins
                       ]

instance ToSchema Query where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
-}

------------------------------------------------------------------------

{-
type Api = PostApi
      :<|> GetApi

type PostApi = Summary "New Corpus endpoint"
             :> ReqBody '[JSON] Query
             :> Post '[JSON] CorpusId
type GetApi = Get '[JSON] ApiInfo
-}

-- | TODO manage several apis
-- TODO-ACCESS
-- TODO this is only the POST
{-
api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
api uid (Query q _ as) = do
  cId <- case head as of
    Nothing      -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just a       -> do
      docs <- liftBase $ API.get a q (Just 1000)
      cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
      pure cId'

  pure cId
-}

------------------------------------------------
-- TODO use this route for Client implementation
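-- | 'ApiInfo' lists the external document sources ('API.externalAPIs')
-- available on this instance, so that a client can know which databases it
-- may query (served by 'info' below).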
data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
  deriving (Generic)
instance Arbitrary ApiInfo where
  arbitrary = ApiInfo <$> arbitrary

deriveJSON (unPrefix "") 'ApiInfo

instance ToSchema ApiInfo

info :: FlowCmdM env err m => UserId -> m ApiInfo
info _u = pure $ ApiInfo API.externalAPIs

------------------------------------------------------------------------

data Database = Empty
              | PubMed
              | HAL
              | IsTex
              | Isidore
  deriving (Eq, Show, Generic)

deriveJSON (unPrefix "") ''Database
instance ToSchema Database

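-- | Map the user-facing 'Database' selector onto a 'DataOrigin'.
-- External databases are fetched through their respective APIs; note that
-- 'Empty' falls back to an 'InternalOrigin' (currently tagged 'T.IsTex'),
-- i.e. data handled inside Gargantext rather than an external source.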
database2origin :: Database -> DataOrigin
database2origin Empty   = InternalOrigin T.IsTex
database2origin PubMed  = ExternalOrigin T.PubMed
database2origin HAL     = ExternalOrigin T.HAL
database2origin IsTex   = ExternalOrigin T.IsTex
database2origin Isidore = ExternalOrigin T.Isidore

------------------------------------------------------------------------
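-- | Payload of the 'AddWithQuery' route: a search query, the 'Database' to
-- search, the language of the query and the target node id.
--
-- A sketch of the expected JSON body, assuming 'jsonOptions "_wq_"' simply
-- strips the @_wq_@ prefix from the field names and that 'Database' and
-- 'Lang' are encoded as their constructor names (the node id 42 is only an
-- example value):
--
-- > { "query": "covid 19", "databases": "PubMed", "lang": "EN", "node_id": 42 }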
data WithQuery = WithQuery
  { _wq_query     :: !Text
  , _wq_databases :: !Database
  , _wq_lang      :: !Lang
  , _wq_node_id   :: !Int
  }
  deriving Generic

makeLenses ''WithQuery
instance FromJSON WithQuery where
  parseJSON = genericParseJSON $ jsonOptions "_wq_"
instance ToSchema WithQuery where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")

-------------------------------------------------------
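-- | Payload of the 'AddWithForm' route, sent form-urlencoded: the file
-- type, the raw file content ('_wf_data'), an optional language and a name
-- for the upload.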
data WithForm = WithForm
  { _wf_filetype :: !FileType
  , _wf_data     :: !Text
  , _wf_lang     :: !(Maybe Lang)
  , _wf_name     :: !Text
  } deriving (Eq, Show, Generic)

makeLenses ''WithForm
instance FromForm WithForm
instance FromJSON WithForm where
  parseJSON = genericParseJSON $ jsonOptions "_wf_"
instance ToSchema WithForm where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")

------------------------------------------------------------------------
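-- | Shorthand over servant-job's 'AsyncJobsAPI'': an asynchronous job
-- endpoint taking an @input@ body (content types @ctI@), reporting
-- progress with @event@ values and eventually returning an @output@ as
-- JSON.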
type AsyncJobs event ctI input output =
  Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
------------------------------------------------------------------------

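-- | Add documents matching an external query to an existing corpus.
-- Mounted under @corpus/<corpus_id>/query@ (the async-job machinery adds
-- its own sub-routes for posting and polling the job); the request body is
-- a 'WithQuery' and progress is reported as 'ScraperStatus'.  See
-- 'addToCorpusWithQuery'.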
type AddWithQuery = Summary "Add with Query to corpus endpoint"
   :> "corpus"
   :> Capture "corpus_id" CorpusId
   :> "query"
   :> AsyncJobs ScraperStatus '[JSON] WithQuery ScraperStatus

{-
type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
   :> "corpus"
   :> Capture "corpus_id" CorpusId
   :> "add"
   :> "file"
   :> MultipartForm Mem (MultipartData Mem)
   :> QueryParam "fileType" FileType
   :> "async"
   :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
-}

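-- | Add documents uploaded through a form ('WithForm') to an existing
-- corpus.  Mounted under @corpus/<corpus_id>/add/form/async@; see
-- 'addToCorpusWithForm'.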
type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
   :> "corpus"
   :> Capture "corpus_id" CorpusId
   :> "add"
   :> "form"
   :> "async"
   :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus


------------------------------------------------------------------------
-- TODO WithQuery also has a corpus id
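-- | Job handler for 'AddWithQuery': fetch the documents matching the query
-- from the selected origin ('getDataText', capped at 10000 results here),
-- flow them into the corpus with 'flowDataText', and report progress
-- through 'logStatus'.  The status figures are still placeholders (cf. the
-- TODOs below).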
addToCorpusWithQuery :: FlowCmdM env err m
                     => User
                     -> CorpusId
                     -> WithQuery
                     -> (ScraperStatus -> m ())
                     -> m ScraperStatus
addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
  -- TODO ...
  logStatus ScraperStatus { _scst_succeeded = Just 10
                          , _scst_failed    = Just 2
                          , _scst_remaining = Just 138
                          , _scst_events    = Just []
                          }
  printDebug "addToCorpusWithQuery" cid
  -- TODO add cid
  -- TODO if cid is folder -> create Corpus
  --      if cid is corpus -> add to corpus
  --      if cid is root   -> create corpus in Private
  txts <- mapM (\db -> getDataText db (Multi l) q (Just 10000)) [database2origin dbs]
  cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
  printDebug "corpus id" cids
  -- TODO ...
  pure ScraperStatus { _scst_succeeded = Just 137
                     , _scst_failed    = Just 13
                     , _scst_remaining = Just 0
                     , _scst_events    = Just []
                     }

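-- | Job handler for 'AddWithForm': parse the posted payload according to
-- its 'FileType', batch the resulting documents (500 per batch, capped at
-- 1000000 documents), and run 'flowCorpus' over the batches, logging a few
-- coarse-grained 'ScraperStatus' steps along the way.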
addToCorpusWithForm :: FlowCmdM env err m
                    => User
                    -> CorpusId
                    -> WithForm
                    -> (ScraperStatus -> m ())
                    -> m ScraperStatus
addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do

  let
    parse = case ft of
      CSV_HAL   -> Parser.parseFormat Parser.CsvHal
      CSV       -> Parser.parseFormat Parser.CsvGargV3
      WOS       -> Parser.parseFormat Parser.WOS
      PresseRIS -> Parser.parseFormat Parser.RisPresse

  logStatus ScraperStatus { _scst_succeeded = Just 1
                          , _scst_failed    = Just 0
                          , _scst_remaining = Just 1
                          , _scst_events    = Just []
                          }

  printDebug "Parsing corpus: " cid

  -- TODO granularity of the logStatus
  docs <- liftBase $ splitEvery 500
                 <$> take 1000000
                 <$> parse (cs d)

  printDebug "Parsing corpus finished : " cid
  printDebug "Starting extraction : " cid

  -- TODO granularity of the logStatus
  _cid' <- flowCorpus user
                      (Right [cid])
                      (Multi $ fromMaybe EN l)
                      (map (map toHyperdataDocument) docs)

  printDebug "Extraction finished : " cid

  pure ScraperStatus { _scst_succeeded = Just 2
                     , _scst_failed    = Just 0
                     , _scst_remaining = Just 0
                     , _scst_events    = Just []
                     }

{-
addToCorpusWithFile :: FlowCmdM env err m
                    => CorpusId
                    -> MultipartData Mem
                    -> Maybe FileType
                    -> (ScraperStatus -> m ())
                    -> m ScraperStatus
addToCorpusWithFile cid input filetype logStatus = do
  logStatus ScraperStatus { _scst_succeeded = Just 10
                          , _scst_failed    = Just 2
                          , _scst_remaining = Just 138
                          , _scst_events    = Just []
                          }
  printDebug "addToCorpusWithFile" cid
  _h <- postUpload cid filetype input

  pure ScraperStatus { _scst_succeeded = Just 137
                     , _scst_failed    = Just 13
                     , _scst_remaining = Just 0
                     , _scst_events    = Just []
                     }
-}
