]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[CLEAN] fix imports / explicit instances / warnings
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 -- import Servant.Multipart
22 -- import Test.QuickCheck (elements)
23 import Control.Lens hiding (elements, Empty)
24 import Data.Aeson
25 import Data.Aeson.TH (deriveJSON)
26 import Data.Either
27 import Data.Maybe (fromMaybe)
28 import Data.Swagger
29 import Data.Text (Text)
30 import GHC.Generics (Generic)
31 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
32 import Gargantext.API.Admin.Types (HasSettings)
33 import Gargantext.API.Node.Corpus.New.File
34 import Gargantext.API.Node.Types
35 import Gargantext.Core (Lang(..){-, allLangs-})
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
38 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
39 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
40 import Gargantext.Database.Action.Mail (sendMail)
41 import Gargantext.Database.Action.Node (mkNodeWithParent)
42 import Gargantext.Database.Action.User (getUserId)
43 import Gargantext.Database.Admin.Types.Hyperdata
44 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
45 import Gargantext.Database.Query.Table.Node (getNodeWith)
46 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
47 import Gargantext.Database.Schema.Node (node_hyperdata)
48 import Gargantext.Prelude
49 import Servant
50 import Servant.Job.Utils (jsonOptions)
51 import Test.QuickCheck.Arbitrary
52 import qualified Data.Text as T
53 import qualified Gargantext.API.Admin.Orchestrator.Types as T
54 import qualified Gargantext.Core.Text.Corpus.API as API
55 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
56 import qualified Gargantext.Database.GargDB as GargDB
57
58 ------------------------------------------------------------------------
59 {-
60 data Query = Query { query_query :: Text
61 , query_node_id :: Int
62 , query_lang :: Lang
63 , query_databases :: [DataOrigin]
64 }
65 deriving (Eq, Generic)
66
67 deriveJSON (unPrefix "query_") 'Query
68
69 instance Arbitrary Query where
70 arbitrary = elements [ Query q n la fs
71 | q <- ["honeybee* AND collapse"
72 ,"covid 19"
73 ]
74 , n <- [0..10]
75 , la <- allLangs
76 , fs <- take 3 $ repeat allDataOrigins
77 ]
78
79 instance ToSchema Query where
80 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
81 -}
82
83 ------------------------------------------------------------------------
84
85 {-
86 type Api = PostApi
87 :<|> GetApi
88
89 type PostApi = Summary "New Corpus endpoint"
90 :> ReqBody '[JSON] Query
91 :> Post '[JSON] CorpusId
92 type GetApi = Get '[JSON] ApiInfo
93 -}
94
95 -- | TODO manage several apis
96 -- TODO-ACCESS
97 -- TODO this is only the POST
98 {-
99 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
100 api uid (Query q _ as) = do
101 cId <- case head as of
102 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
103 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
104 Just a -> do
105 docs <- liftBase $ API.get a q (Just 1000)
106 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
107 pure cId'
108
109 pure cId
110 -}
111
112 ------------------------------------------------
113 -- TODO use this route for Client implementation
114 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
115 deriving (Generic)
116 instance Arbitrary ApiInfo where
117 arbitrary = ApiInfo <$> arbitrary
118
119 deriveJSON (unPrefix "") 'ApiInfo
120
121 instance ToSchema ApiInfo
122
123 info :: FlowCmdM env err m => UserId -> m ApiInfo
124 info _u = pure $ ApiInfo API.externalAPIs
125
126 ------------------------------------------------------------------------
127
128 data Database = Empty
129 | PubMed
130 | HAL
131 | IsTex
132 | Isidore
133 deriving (Eq, Show, Generic)
134
135 deriveJSON (unPrefix "") ''Database
136 instance ToSchema Database
137
138 database2origin :: Database -> DataOrigin
139 database2origin Empty = InternalOrigin T.IsTex
140 database2origin PubMed = ExternalOrigin T.PubMed
141 database2origin HAL = ExternalOrigin T.HAL
142 database2origin IsTex = ExternalOrigin T.IsTex
143 database2origin Isidore = ExternalOrigin T.Isidore
144
145 ------------------------------------------------------------------------
146 data WithQuery = WithQuery
147 { _wq_query :: !Text
148 , _wq_databases :: !Database
149 , _wq_lang :: !Lang
150 , _wq_node_id :: !Int
151 }
152 deriving Generic
153
154 makeLenses ''WithQuery
155 instance FromJSON WithQuery where
156 parseJSON = genericParseJSON $ jsonOptions "_wq_"
157 instance ToJSON WithQuery where
158 toJSON = genericToJSON $ jsonOptions "_wq_"
159 instance ToSchema WithQuery where
160 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
161
162 ------------------------------------------------------------------------
163
164 type AddWithQuery = Summary "Add with Query to corpus endpoint"
165 :> "corpus"
166 :> Capture "corpus_id" CorpusId
167 :> "query"
168 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
169
170 {-
171 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
172 :> "corpus"
173 :> Capture "corpus_id" CorpusId
174 :> "add"
175 :> "file"
176 :> MultipartForm Mem (MultipartData Mem)
177 :> QueryParam "fileType" FileType
178 :> "async"
179 :> AsyncJobs JobLog '[JSON] () JobLog
180 -}
181
182
183 ------------------------------------------------------------------------
184 -- TODO WithQuery also has a corpus id
185 addToCorpusWithQuery :: FlowCmdM env err m
186 => User
187 -> CorpusId
188 -> WithQuery
189 -> Maybe Integer
190 -> (JobLog -> m ())
191 -> m JobLog
192 addToCorpusWithQuery user cid (WithQuery q dbs l _nid) maybeLimit logStatus = do
193 -- TODO ...
194 logStatus JobLog { _scst_succeeded = Just 0
195 , _scst_failed = Just 0
196 , _scst_remaining = Just 5
197 , _scst_events = Just []
198 }
199 printDebug "addToCorpusWithQuery" (cid, dbs)
200 -- TODO add cid
201 -- TODO if cid is folder -> create Corpus
202 -- if cid is corpus -> add to corpus
203 -- if cid is root -> create corpus in Private
204 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
205
206 logStatus JobLog { _scst_succeeded = Just 2
207 , _scst_failed = Just 0
208 , _scst_remaining = Just 1
209 , _scst_events = Just []
210 }
211
212 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
213 printDebug "corpus id" cids
214 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
215 sendMail user
216 -- TODO ...
217 pure JobLog { _scst_succeeded = Just 3
218 , _scst_failed = Just 0
219 , _scst_remaining = Just 0
220 , _scst_events = Just []
221 }
222
223
224 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
225 :> "corpus"
226 :> Capture "corpus_id" CorpusId
227 :> "add"
228 :> "form"
229 :> "async"
230 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
231
232 addToCorpusWithForm :: FlowCmdM env err m
233 => User
234 -> CorpusId
235 -> NewWithForm
236 -> (JobLog -> m ())
237 -> m JobLog
238 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
239
240 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
241 printDebug "[addToCorpusWithForm] fileType" ft
242 logStatus JobLog { _scst_succeeded = Just 0
243 , _scst_failed = Just 0
244 , _scst_remaining = Just 2
245 , _scst_events = Just []
246 }
247 let
248 parse = case ft of
249 CSV_HAL -> Parser.parseFormat Parser.CsvHal
250 CSV -> Parser.parseFormat Parser.CsvGargV3
251 WOS -> Parser.parseFormat Parser.WOS
252 PresseRIS -> Parser.parseFormat Parser.RisPresse
253
254 -- TODO granularity of the logStatus
255 docs <- liftBase $ splitEvery 500
256 <$> take 1000000
257 <$> parse (cs d)
258
259 printDebug "Parsing corpus finished : " cid
260 logStatus JobLog { _scst_succeeded = Just 1
261 , _scst_failed = Just 0
262 , _scst_remaining = Just 1
263 , _scst_events = Just []
264 }
265
266
267 printDebug "Starting extraction : " cid
268 -- TODO granularity of the logStatus
269 _cid' <- flowCorpus user
270 (Right [cid])
271 (Multi $ fromMaybe EN l)
272 (map (map toHyperdataDocument) docs)
273
274 printDebug "Extraction finished : " cid
275 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
276 sendMail user
277
278 pure JobLog { _scst_succeeded = Just 2
279 , _scst_failed = Just 0
280 , _scst_remaining = Just 0
281 , _scst_events = Just []
282 }
283
284 {-
285 addToCorpusWithFile :: FlowCmdM env err m
286 => CorpusId
287 -> MultipartData Mem
288 -> Maybe FileType
289 -> (JobLog -> m ())
290 -> m JobLog
291 addToCorpusWithFile cid input filetype logStatus = do
292 logStatus JobLog { _scst_succeeded = Just 10
293 , _scst_failed = Just 2
294 , _scst_remaining = Just 138
295 , _scst_events = Just []
296 }
297 printDebug "addToCorpusWithFile" cid
298 _h <- postUpload cid filetype input
299
300 pure JobLog { _scst_succeeded = Just 137
301 , _scst_failed = Just 13
302 , _scst_remaining = Just 0
303 , _scst_events = Just []
304 }
305 -}
306
307
308
309 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
310 :> "corpus"
311 :> Capture "corpus_id" CorpusId
312 :> "add"
313 :> "file"
314 :> "async"
315 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
316
317 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
318 => User
319 -> CorpusId
320 -> NewWithFile
321 -> (JobLog -> m ())
322 -> m JobLog
323 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
324
325 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
326 logStatus JobLog { _scst_succeeded = Just 0
327 , _scst_failed = Just 0
328 , _scst_remaining = Just 1
329 , _scst_events = Just []
330 }
331
332 fPath <- GargDB.writeFile nwf
333 printDebug "[addToCorpusWithFile] File saved as: " fPath
334
335 uId <- getUserId user
336 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
337
338 _ <- case nIds of
339 [nId] -> do
340 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
341 let hl = node ^. node_hyperdata
342 _ <- updateHyperdata nId $ hl { _hff_name = fName
343 , _hff_path = T.pack fPath }
344
345 printDebug "[addToCorpusWithFile] Created node with id: " nId
346 _ -> pure ()
347
348 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
349
350 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
351 sendMail user
352
353 pure $ JobLog { _scst_succeeded = Just 1
354 , _scst_failed = Just 0
355 , _scst_remaining = Just 0
356 , _scst_events = Just []
357 }