]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Corpus/New.hs
[DB/REFACT] getNodeWith Proxy
[gargantext.git] / src / Gargantext / API / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
24
25 module Gargantext.API.Corpus.New
26 where
27
28 import Control.Lens hiding (elements, Empty)
29 import Data.Aeson
30 import Data.Aeson.TH (deriveJSON)
31 import Data.Either
32 import Data.Maybe (fromMaybe)
33 import Data.Swagger
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Admin.Orchestrator.Types (ScraperStatus(..))
37 import qualified Gargantext.API.Admin.Orchestrator.Types as T
38 import Gargantext.API.Corpus.New.File
39 import Gargantext.Core (Lang(..){-, allLangs-})
40 import Gargantext.Core.Types.Individu (UserId, User(..))
41 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
42 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
43 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..))
44 import Gargantext.Prelude
45 import Servant
46 import Servant.API.Flatten (Flat)
47 import Servant.Job.Core
48 import Servant.Job.Types
49 import Servant.Job.Utils (jsonOptions)
50 -- import Servant.Multipart
51 -- import Test.QuickCheck (elements)
52 import Test.QuickCheck.Arbitrary
53 import Web.FormUrlEncoded (FromForm)
54 import qualified Gargantext.Text.Corpus.API as API
55 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
56
57 ------------------------------------------------------------------------
58 {-
59 data Query = Query { query_query :: Text
60 , query_node_id :: Int
61 , query_lang :: Lang
62 , query_databases :: [DataOrigin]
63 }
64 deriving (Eq, Generic)
65
66 deriveJSON (unPrefix "query_") 'Query
67
68 instance Arbitrary Query where
69 arbitrary = elements [ Query q n la fs
70 | q <- ["honeybee* AND collapse"
71 ,"covid 19"
72 ]
73 , n <- [0..10]
74 , la <- allLangs
75 , fs <- take 3 $ repeat allDataOrigins
76 ]
77
78 instance ToSchema Query where
79 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
80 -}
81
82 ------------------------------------------------------------------------
83
84 {-
85 type Api = PostApi
86 :<|> GetApi
87
88 type PostApi = Summary "New Corpus endpoint"
89 :> ReqBody '[JSON] Query
90 :> Post '[JSON] CorpusId
91 type GetApi = Get '[JSON] ApiInfo
92 -}
93
94 -- | TODO manage several apis
95 -- TODO-ACCESS
96 -- TODO this is only the POST
97 {-
98 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
99 api uid (Query q _ as) = do
100 cId <- case head as of
101 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
102 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
103 Just a -> do
104 docs <- liftBase $ API.get a q (Just 1000)
105 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
106 pure cId'
107
108 pure cId
109 -}
110
111 ------------------------------------------------
112 -- TODO use this route for Client implementation
113 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
114 deriving (Generic)
115 instance Arbitrary ApiInfo where
116 arbitrary = ApiInfo <$> arbitrary
117
118 deriveJSON (unPrefix "") 'ApiInfo
119
120 instance ToSchema ApiInfo
121
122 info :: FlowCmdM env err m => UserId -> m ApiInfo
123 info _u = pure $ ApiInfo API.externalAPIs
124
125 ------------------------------------------------------------------------
126
127 data Database = Empty
128 | PubMed
129 | HAL
130 | IsTex
131 | Isidore
132 deriving (Eq, Show, Generic)
133
134 deriveJSON (unPrefix "") ''Database
135 instance ToSchema Database
136
137 database2origin :: Database -> DataOrigin
138 database2origin Empty = InternalOrigin T.IsTex
139 database2origin PubMed = ExternalOrigin T.PubMed
140 database2origin HAL = ExternalOrigin T.HAL
141 database2origin IsTex = ExternalOrigin T.IsTex
142 database2origin Isidore = ExternalOrigin T.Isidore
143
144 ------------------------------------------------------------------------
145 data WithQuery = WithQuery
146 { _wq_query :: !Text
147 , _wq_databases :: !Database
148 , _wq_lang :: !Lang
149 , _wq_node_id :: !Int
150 }
151 deriving Generic
152
153 makeLenses ''WithQuery
154 instance FromJSON WithQuery where
155 parseJSON = genericParseJSON $ jsonOptions "_wq_"
156 instance ToSchema WithQuery where
157 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
158
159 -------------------------------------------------------
160 data WithForm = WithForm
161 { _wf_filetype :: !FileType
162 , _wf_data :: !Text
163 , _wf_lang :: !(Maybe Lang)
164 , _wf_name :: !Text
165 } deriving (Eq, Show, Generic)
166
167 makeLenses ''WithForm
168 instance FromForm WithForm
169 instance FromJSON WithForm where
170 parseJSON = genericParseJSON $ jsonOptions "_wf_"
171 instance ToSchema WithForm where
172 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
173
174 ------------------------------------------------------------------------
175 type AsyncJobs event ctI input output =
176 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
177 ------------------------------------------------------------------------
178
179 type AddWithQuery = Summary "Add with Query to corpus endpoint"
180 :> "corpus"
181 :> Capture "corpus_id" CorpusId
182 :> "query"
183 :> AsyncJobs ScraperStatus '[JSON] WithQuery ScraperStatus
184
185 {-
186 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
187 :> "corpus"
188 :> Capture "corpus_id" CorpusId
189 :> "add"
190 :> "file"
191 :> MultipartForm Mem (MultipartData Mem)
192 :> QueryParam "fileType" FileType
193 :> "async"
194 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
195 -}
196
197 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
198 :> "corpus"
199 :> Capture "corpus_id" CorpusId
200 :> "add"
201 :> "form"
202 :> "async"
203 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
204
205 ------------------------------------------------------------------------
206 -- TODO WithQuery also has a corpus id
207 addToCorpusWithQuery :: FlowCmdM env err m
208 => User
209 -> CorpusId
210 -> WithQuery
211 -> (ScraperStatus -> m ())
212 -> m ScraperStatus
213 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
214 -- TODO ...
215 logStatus ScraperStatus { _scst_succeeded = Just 10
216 , _scst_failed = Just 2
217 , _scst_remaining = Just 138
218 , _scst_events = Just []
219 }
220 printDebug "addToCorpusWithQuery" cid
221 -- TODO add cid
222 -- TODO if cid is folder -> create Corpus
223 -- if cid is corpus -> add to corpus
224 -- if cid is root -> create corpus in Private
225 txts <- mapM (\db -> getDataText db (Multi l) q (Just 10000)) [database2origin dbs]
226 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
227 printDebug "corpus id" cids
228 -- TODO ...
229 pure ScraperStatus { _scst_succeeded = Just 137
230 , _scst_failed = Just 13
231 , _scst_remaining = Just 0
232 , _scst_events = Just []
233 }
234
235 addToCorpusWithForm :: FlowCmdM env err m
236 => User
237 -> CorpusId
238 -> WithForm
239 -> (ScraperStatus -> m ())
240 -> m ScraperStatus
241 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
242
243 let
244 parse = case ft of
245 CSV_HAL -> Parser.parseFormat Parser.CsvHal
246 CSV -> Parser.parseFormat Parser.CsvGargV3
247 WOS -> Parser.parseFormat Parser.WOS
248 PresseRIS -> Parser.parseFormat Parser.RisPresse
249
250 logStatus ScraperStatus { _scst_succeeded = Just 1
251 , _scst_failed = Just 0
252 , _scst_remaining = Just 1
253 , _scst_events = Just []
254 }
255
256 printDebug "Parsing corpus: " cid
257
258 -- TODO granularity of the logStatus
259 docs <- liftBase $ splitEvery 500
260 <$> take 1000000
261 <$> parse (cs d)
262
263 printDebug "Parsing corpus finished : " cid
264 printDebug "Starting extraction : " cid
265
266 -- TODO granularity of the logStatus
267 _cid' <- flowCorpus user
268 (Right [cid])
269 (Multi $ fromMaybe EN l)
270 (map (map toHyperdataDocument) docs)
271
272 printDebug "Extraction finished : " cid
273
274 pure ScraperStatus { _scst_succeeded = Just 2
275 , _scst_failed = Just 0
276 , _scst_remaining = Just 0
277 , _scst_events = Just []
278 }
279
280 {-
281 addToCorpusWithFile :: FlowCmdM env err m
282 => CorpusId
283 -> MultipartData Mem
284 -> Maybe FileType
285 -> (ScraperStatus -> m ())
286 -> m ScraperStatus
287 addToCorpusWithFile cid input filetype logStatus = do
288 logStatus ScraperStatus { _scst_succeeded = Just 10
289 , _scst_failed = Just 2
290 , _scst_remaining = Just 138
291 , _scst_events = Just []
292 }
293 printDebug "addToCorpusWithFile" cid
294 _h <- postUpload cid filetype input
295
296 pure ScraperStatus { _scst_succeeded = Just 137
297 , _scst_failed = Just 13
298 , _scst_remaining = Just 0
299 , _scst_events = Just []
300 }
301 -}
302
303