{-|
Module      : Gargantext.API.Node.Corpus.New
Description : New corpus API
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

New corpus means either:
- a new corpus
- new data in an existing corpus
-}

{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE DataKinds         #-}
{-# LANGUAGE TypeOperators     #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts  #-}
{-# LANGUAGE RankNTypes        #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}

module Gargantext.API.Node.Corpus.New
  where

import Control.Lens hiding (elements, Empty)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.Maybe (fromMaybe)
import Data.Swagger
import Data.Text (Text)
import GHC.Generics (Generic)
import Gargantext.API.Admin.Orchestrator.Types (ScraperStatus(..))
import qualified Gargantext.API.Admin.Orchestrator.Types as T
import Gargantext.API.Node.Corpus.New.File
import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..), UserId)
import Gargantext.Prelude
import Servant
import Servant.Job.Core
import Servant.Job.Types
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary
import Web.FormUrlEncoded (FromForm)
import qualified Gargantext.Text.Corpus.API as API
import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)

------------------------------------------------------------------------
{-
data Query = Query { query_query     :: Text
                   , query_node_id   :: Int
                   , query_lang      :: Lang
                   , query_databases :: [DataOrigin]
                   }
                   deriving (Eq, Generic)

deriveJSON (unPrefix "query_") 'Query

instance Arbitrary Query where
    arbitrary = elements [ Query q n la fs
                         | q <- ["honeybee* AND collapse"
                                ,"covid 19"
                                ]
                         , n  <- [0..10]
                         , la <- allLangs
                         , fs <- take 3 $ repeat allDataOrigins
                         ]

instance ToSchema Query where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
-}

------------------------------------------------------------------------

{-
type Api = PostApi
      :<|> GetApi

type PostApi = Summary "New Corpus endpoint"
             :> ReqBody '[JSON] Query
             :> Post '[JSON] CorpusId
type GetApi = Get '[JSON] ApiInfo
-}

-- | TODO manage several apis
-- TODO-ACCESS
-- TODO this is only the POST
{-
api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
api uid (Query q _ as) = do
  cId <- case head as of
    Nothing      -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just a       -> do
      docs <- liftBase $ API.get a q (Just 1000)
      cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
      pure cId'

  pure cId
-}

------------------------------------------------
-- TODO use this route for Client implementation
data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs] }
  deriving (Generic)
instance Arbitrary ApiInfo where
  arbitrary = ApiInfo <$> arbitrary

deriveJSON (unPrefix "") 'ApiInfo

instance ToSchema ApiInfo

info :: FlowCmdM env err m => UserId -> m ApiInfo
info _u = pure $ ApiInfo API.externalAPIs
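-- Note: with 'unPrefix ""' the field name is kept as-is, so the JSON returned by
-- 'info' should look roughly like {"api_info": [...]}; the exact encoding of each
-- 'ExternalAPIs' value depends on its own JSON instance.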

------------------------------------------------------------------------

data Database = Empty
              | PubMed
              | HAL
              | IsTex
              | Isidore
  deriving (Eq, Show, Generic)

deriveJSON (unPrefix "") ''Database
instance ToSchema Database

database2origin :: Database -> DataOrigin
database2origin Empty   = InternalOrigin T.IsTex
database2origin PubMed  = ExternalOrigin T.PubMed
database2origin HAL     = ExternalOrigin T.HAL
database2origin IsTex   = ExternalOrigin T.IsTex
database2origin Isidore = ExternalOrigin T.Isidore
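-- For example, 'database2origin PubMed == ExternalOrigin T.PubMed'. Note that
-- 'Empty' does not map to an external API: it falls back to an internal origin
-- ('InternalOrigin T.IsTex'), i.e. data already available inside Gargantext.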

------------------------------------------------------------------------
data WithQuery = WithQuery
  { _wq_query     :: !Text
  , _wq_databases :: !Database
  , _wq_lang      :: !Lang
  , _wq_node_id   :: !Int
  }
  deriving Generic

makeLenses ''WithQuery
instance FromJSON WithQuery where
  parseJSON = genericParseJSON $ jsonOptions "_wq_"
instance ToSchema WithQuery where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
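-- With 'jsonOptions "_wq_"' the prefix is stripped from the field names, so the
-- expected request body looks roughly like (illustrative values only):
--   {"query": "covid 19", "databases": "PubMed", "lang": "EN", "node_id": 0}
-- The exact encoding of 'Database' and 'Lang' depends on their JSON instances.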

-------------------------------------------------------
data WithForm = WithForm
  { _wf_filetype :: !FileType
  , _wf_data     :: !Text
  , _wf_lang     :: !(Maybe Lang)
  , _wf_name     :: !Text
  } deriving (Eq, Show, Generic)

makeLenses ''WithForm
instance FromForm WithForm
instance FromJSON WithForm where
  parseJSON = genericParseJSON $ jsonOptions "_wf_"
instance ToSchema WithForm where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
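-- 'WithForm' is accepted both as JSON (with the "_wf_" prefix stripped by
-- 'jsonOptions') and as form-url-encoded data via the generic 'FromForm'
-- instance, which by default is expected to use the record selector names
-- (e.g. "_wf_filetype", "_wf_data") as form keys.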

------------------------------------------------------------------------
type AsyncJobs event ctI input output =
  AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
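-- Roughly speaking, 'AsyncJobs' specialises servant-job's 'AsyncJobsAPI'': an
-- endpoint that accepts an 'input' (in the 'ctI' content types), starts a job
-- asynchronously, and lets the client poll it for 'event' updates until the
-- final 'output' is available.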
------------------------------------------------------------------------

type AddWithQuery = Summary "Add with Query to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "query"
     :> AsyncJobs ScraperStatus '[JSON] WithQuery ScraperStatus
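-- Roughly: POST /corpus/<corpus_id>/query starts an asynchronous scraping job
-- from a 'WithQuery' JSON body; the job is then polled for 'ScraperStatus'
-- updates (the precise sub-routes come from servant-job's 'AsyncJobsAPI'').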

{-
type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "add"
     :> "file"
       :> MultipartForm Mem (MultipartData Mem)
       :> QueryParam "fileType" FileType
     :> "async"
       :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
-}

type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "add"
     :> "form"
       :> "async"
         :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
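-- Roughly: POST /corpus/<corpus_id>/add/form/async uploads a 'WithForm' payload
-- (form-url-encoded) and starts an asynchronous parsing/indexing job that
-- reports 'ScraperStatus' updates.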

------------------------------------------------------------------------
-- TODO WithQuery also has a corpus id
addToCorpusWithQuery :: FlowCmdM env err m
                     => User
                     -> CorpusId
                     -> WithQuery
                     -> (ScraperStatus -> m ())
                     -> m ScraperStatus
addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
  -- TODO ...
  logStatus ScraperStatus { _scst_succeeded = Just 10
                          , _scst_failed    = Just 2
                          , _scst_remaining = Just 138
                          , _scst_events    = Just []
                          }
  printDebug "addToCorpusWithQuery" cid
  -- TODO add cid
  -- TODO if cid is folder -> create Corpus
  --      if cid is corpus -> add to corpus
  --      if cid is root   -> create corpus in Private
  txts <- mapM (\db -> getDataText db (Multi l) q (Just 10000)) [database2origin dbs]
  cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
  printDebug "corpus id" cids
  -- TODO ...
  pure ScraperStatus { _scst_succeeded = Just 137
                     , _scst_failed    = Just 13
                     , _scst_remaining = Just 0
                     , _scst_events    = Just []
                     }
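-- Illustrative call only (the 'UserName' constructor from
-- Gargantext.Core.Types.Individu and the field values are example assumptions):
--   addToCorpusWithQuery (UserName "alice") corpusId
--                        (WithQuery "covid 19" PubMed EN 0)
--                        logStatus
-- where 'logStatus' pushes intermediate 'ScraperStatus' values to the job log.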

addToCorpusWithForm :: FlowCmdM env err m
                    => User
                    -> CorpusId
                    -> WithForm
                    -> (ScraperStatus -> m ())
                    -> m ScraperStatus
addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do

  let
    parse = case ft of
      CSV_HAL   -> Parser.parseFormat Parser.CsvHal
      CSV       -> Parser.parseFormat Parser.CsvGargV3
      WOS       -> Parser.parseFormat Parser.WOS
      PresseRIS -> Parser.parseFormat Parser.RisPresse

  logStatus ScraperStatus { _scst_succeeded = Just 1
                          , _scst_failed    = Just 0
                          , _scst_remaining = Just 1
                          , _scst_events    = Just []
                          }

  printDebug "Parsing corpus: " cid

  -- TODO granularity of the logStatus
  docs <- liftBase $ splitEvery 500
                 <$> take 1000000
                 <$> parse (cs d)

  printDebug "Parsing corpus finished : " cid
  printDebug "Starting extraction     : " cid

  -- TODO granularity of the logStatus
  _cid' <- flowCorpus user
                      (Right [cid])
                      (Multi $ fromMaybe EN l)
                      (map (map toHyperdataDocument) docs)

  printDebug "Extraction finished     : " cid

  pure ScraperStatus { _scst_succeeded = Just 2
                     , _scst_failed    = Just 0
                     , _scst_remaining = Just 0
                     , _scst_events    = Just []
                     }
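-- Illustrative call only (the 'UserName' constructor and the field values are
-- example assumptions):
--   addToCorpusWithForm (UserName "alice") corpusId
--                       (WithForm CSV csvContents (Just EN) "my corpus")
--                       logStatus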

{-
addToCorpusWithFile :: FlowCmdM env err m
                    => CorpusId
                    -> MultipartData Mem
                    -> Maybe FileType
                    -> (ScraperStatus -> m ())
                    -> m ScraperStatus
addToCorpusWithFile cid input filetype logStatus = do
  logStatus ScraperStatus { _scst_succeeded = Just 10
                          , _scst_failed    = Just 2
                          , _scst_remaining = Just 138
                          , _scst_events    = Just []
                          }
  printDebug "addToCorpusWithFile" cid
  _h <- postUpload cid filetype input

  pure ScraperStatus { _scst_succeeded = Just 137
                     , _scst_failed    = Just 13
                     , _scst_remaining = Just 0
                     , _scst_events    = Just []
                     }
-}