]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[frameCalc upload] fetches node now with its ethercalc url
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Control.Lens hiding (elements, Empty)
22 import Data.Aeson
23 import Data.Aeson.TH (deriveJSON)
24 import Data.Either
25 import Data.Maybe (fromMaybe)
26 import Data.Swagger
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import Servant
31 import Servant.Job.Utils (jsonOptions)
32 -- import Servant.Multipart
33 -- import Test.QuickCheck (elements)
34 import Test.QuickCheck.Arbitrary
35
36 import Gargantext.Prelude
37
38 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
39 import qualified Gargantext.API.Admin.Orchestrator.Types as T
40 import Gargantext.API.Admin.Types (HasSettings)
41 import Gargantext.API.Node.Corpus.New.File
42 import Gargantext.API.Node.Types
43 import Gargantext.Core (Lang(..){-, allLangs-})
44 import Gargantext.Database.Action.Mail (sendMail)
45 import Gargantext.Core.Types.Individu (User(..))
46 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
47 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
48 import Gargantext.Database.Action.User (getUserId)
49 import Gargantext.Database.Action.Node (mkNodeWithParent)
50 import Gargantext.Database.Admin.Types.Hyperdata
51 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
52 import Gargantext.Database.Query.Table.Node (getNodeWith)
53 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
54 import Gargantext.Database.Schema.Node (node_hyperdata)
55 import qualified Gargantext.Database.GargDB as GargDB
56 import qualified Gargantext.Core.Text.Corpus.API as API
57 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
58
59 ------------------------------------------------------------------------
60 {-
61 data Query = Query { query_query :: Text
62 , query_node_id :: Int
63 , query_lang :: Lang
64 , query_databases :: [DataOrigin]
65 }
66 deriving (Eq, Generic)
67
68 deriveJSON (unPrefix "query_") 'Query
69
70 instance Arbitrary Query where
71 arbitrary = elements [ Query q n la fs
72 | q <- ["honeybee* AND collapse"
73 ,"covid 19"
74 ]
75 , n <- [0..10]
76 , la <- allLangs
77 , fs <- take 3 $ repeat allDataOrigins
78 ]
79
80 instance ToSchema Query where
81 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
82 -}
83
84 ------------------------------------------------------------------------
85
86 {-
87 type Api = PostApi
88 :<|> GetApi
89
90 type PostApi = Summary "New Corpus endpoint"
91 :> ReqBody '[JSON] Query
92 :> Post '[JSON] CorpusId
93 type GetApi = Get '[JSON] ApiInfo
94 -}
95
96 -- | TODO manage several apis
97 -- TODO-ACCESS
98 -- TODO this is only the POST
99 {-
100 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
101 api uid (Query q _ as) = do
102 cId <- case head as of
103 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
104 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
105 Just a -> do
106 docs <- liftBase $ API.get a q (Just 1000)
107 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
108 pure cId'
109
110 pure cId
111 -}
112
113 ------------------------------------------------
114 -- TODO use this route for Client implementation
115 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
116 deriving (Generic)
117 instance Arbitrary ApiInfo where
118 arbitrary = ApiInfo <$> arbitrary
119
120 deriveJSON (unPrefix "") 'ApiInfo
121
122 instance ToSchema ApiInfo
123
124 info :: FlowCmdM env err m => UserId -> m ApiInfo
125 info _u = pure $ ApiInfo API.externalAPIs
126
127 ------------------------------------------------------------------------
128
129 data Database = Empty
130 | PubMed
131 | HAL
132 | IsTex
133 | Isidore
134 deriving (Eq, Show, Generic)
135
136 deriveJSON (unPrefix "") ''Database
137 instance ToSchema Database
138
139 database2origin :: Database -> DataOrigin
140 database2origin Empty = InternalOrigin T.IsTex
141 database2origin PubMed = ExternalOrigin T.PubMed
142 database2origin HAL = ExternalOrigin T.HAL
143 database2origin IsTex = ExternalOrigin T.IsTex
144 database2origin Isidore = ExternalOrigin T.Isidore
145
146 ------------------------------------------------------------------------
147 data WithQuery = WithQuery
148 { _wq_query :: !Text
149 , _wq_databases :: !Database
150 , _wq_lang :: !Lang
151 , _wq_node_id :: !Int
152 }
153 deriving Generic
154
155 makeLenses ''WithQuery
156 instance FromJSON WithQuery where
157 parseJSON = genericParseJSON $ jsonOptions "_wq_"
158 instance ToJSON WithQuery where
159 toJSON = genericToJSON $ jsonOptions "_wq_"
160 instance ToSchema WithQuery where
161 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
162
163 ------------------------------------------------------------------------
164
165 type AddWithQuery = Summary "Add with Query to corpus endpoint"
166 :> "corpus"
167 :> Capture "corpus_id" CorpusId
168 :> "query"
169 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
170
171 {-
172 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
173 :> "corpus"
174 :> Capture "corpus_id" CorpusId
175 :> "add"
176 :> "file"
177 :> MultipartForm Mem (MultipartData Mem)
178 :> QueryParam "fileType" FileType
179 :> "async"
180 :> AsyncJobs JobLog '[JSON] () JobLog
181 -}
182
183
184 ------------------------------------------------------------------------
185 -- TODO WithQuery also has a corpus id
186 addToCorpusWithQuery :: FlowCmdM env err m
187 => User
188 -> CorpusId
189 -> WithQuery
190 -> Maybe Integer
191 -> (JobLog -> m ())
192 -> m JobLog
193 addToCorpusWithQuery user cid (WithQuery q dbs l _nid) maybeLimit logStatus = do
194 -- TODO ...
195 logStatus JobLog { _scst_succeeded = Just 0
196 , _scst_failed = Just 0
197 , _scst_remaining = Just 5
198 , _scst_events = Just []
199 }
200 printDebug "addToCorpusWithQuery" (cid, dbs)
201 -- TODO add cid
202 -- TODO if cid is folder -> create Corpus
203 -- if cid is corpus -> add to corpus
204 -- if cid is root -> create corpus in Private
205 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
206
207 logStatus JobLog { _scst_succeeded = Just 2
208 , _scst_failed = Just 0
209 , _scst_remaining = Just 1
210 , _scst_events = Just []
211 }
212
213 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
214 printDebug "corpus id" cids
215 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
216 sendMail user
217 -- TODO ...
218 pure JobLog { _scst_succeeded = Just 3
219 , _scst_failed = Just 0
220 , _scst_remaining = Just 0
221 , _scst_events = Just []
222 }
223
224
225 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
226 :> "corpus"
227 :> Capture "corpus_id" CorpusId
228 :> "add"
229 :> "form"
230 :> "async"
231 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
232
233 addToCorpusWithForm :: FlowCmdM env err m
234 => User
235 -> CorpusId
236 -> NewWithForm
237 -> (JobLog -> m ())
238 -> m JobLog
239 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
240 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
241 printDebug "[addToCorpusWithForm] fileType" ft
242 logStatus JobLog { _scst_succeeded = Just 0
243 , _scst_failed = Just 0
244 , _scst_remaining = Just 2
245 , _scst_events = Just []
246 }
247 let
248 parse = case ft of
249 CSV_HAL -> Parser.parseFormat Parser.CsvHal
250 CSV -> Parser.parseFormat Parser.CsvGargV3
251 WOS -> Parser.parseFormat Parser.WOS
252 PresseRIS -> Parser.parseFormat Parser.RisPresse
253
254 -- TODO granularity of the logStatus
255 docs <- liftBase $ splitEvery 500
256 <$> take 1000000
257 <$> parse (cs d)
258
259 printDebug "Parsing corpus finished : " cid
260 logStatus JobLog { _scst_succeeded = Just 1
261 , _scst_failed = Just 0
262 , _scst_remaining = Just 1
263 , _scst_events = Just []
264 }
265
266
267 printDebug "Starting extraction : " cid
268 -- TODO granularity of the logStatus
269 _cid' <- flowCorpus user
270 (Right [cid])
271 (Multi $ fromMaybe EN l)
272 (map (map toHyperdataDocument) docs)
273
274 printDebug "Extraction finished : " cid
275 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
276 sendMail user
277
278 pure JobLog { _scst_succeeded = Just 2
279 , _scst_failed = Just 0
280 , _scst_remaining = Just 0
281 , _scst_events = Just []
282 }
283
284 {-
285 addToCorpusWithFile :: FlowCmdM env err m
286 => CorpusId
287 -> MultipartData Mem
288 -> Maybe FileType
289 -> (JobLog -> m ())
290 -> m JobLog
291 addToCorpusWithFile cid input filetype logStatus = do
292 logStatus JobLog { _scst_succeeded = Just 10
293 , _scst_failed = Just 2
294 , _scst_remaining = Just 138
295 , _scst_events = Just []
296 }
297 printDebug "addToCorpusWithFile" cid
298 _h <- postUpload cid filetype input
299
300 pure JobLog { _scst_succeeded = Just 137
301 , _scst_failed = Just 13
302 , _scst_remaining = Just 0
303 , _scst_events = Just []
304 }
305 -}
306
307
308
309 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
310 :> "corpus"
311 :> Capture "corpus_id" CorpusId
312 :> "add"
313 :> "file"
314 :> "async"
315 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
316
317 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
318 => User
319 -> CorpusId
320 -> NewWithFile
321 -> (JobLog -> m ())
322 -> m JobLog
323 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
324
325 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
326 logStatus JobLog { _scst_succeeded = Just 0
327 , _scst_failed = Just 0
328 , _scst_remaining = Just 1
329 , _scst_events = Just []
330 }
331
332 fPath <- GargDB.writeFile nwf
333 printDebug "[addToCorpusWithFile] File saved as: " fPath
334
335 uId <- getUserId user
336 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
337
338 _ <- case nIds of
339 [nId] -> do
340 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
341 let hl = node ^. node_hyperdata
342 _ <- updateHyperdata nId $ hl { _hff_name = fName
343 , _hff_path = T.pack fPath }
344
345 printDebug "[addToCorpusWithFile] Created node with id: " nId
346 _ -> pure ()
347
348 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
349
350 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
351 sendMail user
352
353 pure $ JobLog { _scst_succeeded = Just 1
354 , _scst_failed = Just 0
355 , _scst_remaining = Just 0
356 , _scst_events = Just []
357 }