]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[FIX] merge with dev-phylo
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Control.Lens hiding (elements, Empty)
22 import Data.Aeson
23 import Data.Aeson.TH (deriveJSON)
24 import Data.Either
25 import Data.Maybe (fromMaybe)
26 import Data.Swagger
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import Servant
31 import Servant.Job.Utils (jsonOptions)
32 -- import Servant.Multipart
33 -- import Test.QuickCheck (elements)
34 import Test.QuickCheck.Arbitrary
35
36 import Gargantext.Prelude
37
38 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
39 import qualified Gargantext.API.Admin.Orchestrator.Types as T
40 import Gargantext.API.Admin.Types (HasSettings)
41 import Gargantext.API.Node.Corpus.New.File
42 import Gargantext.API.Node.Types
43 import Gargantext.Core (Lang(..){-, allLangs-})
44 import Gargantext.Database.Action.Mail (sendMail)
45 import Gargantext.Core.Types.Individu (User(..))
46 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
47 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
48 import Gargantext.Database.Action.User (getUserId)
49 import Gargantext.Database.Action.Node (mkNodeWithParent)
50 import Gargantext.Database.Admin.Types.Hyperdata
51 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
52 import Gargantext.Database.Query.Table.Node (getNodeWith)
53 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
54 import Gargantext.Database.Schema.Node (node_hyperdata)
55 import qualified Gargantext.Prelude.GargDB as GargDB
56 import qualified Gargantext.Core.Text.Corpus.API as API
57 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
58
59 ------------------------------------------------------------------------
60 {-
61 data Query = Query { query_query :: Text
62 , query_node_id :: Int
63 , query_lang :: Lang
64 , query_databases :: [DataOrigin]
65 }
66 deriving (Eq, Generic)
67
68 deriveJSON (unPrefix "query_") 'Query
69
70 instance Arbitrary Query where
71 arbitrary = elements [ Query q n la fs
72 | q <- ["honeybee* AND collapse"
73 ,"covid 19"
74 ]
75 , n <- [0..10]
76 , la <- allLangs
77 , fs <- take 3 $ repeat allDataOrigins
78 ]
79
80 instance ToSchema Query where
81 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
82 -}
83
84 ------------------------------------------------------------------------
85
86 {-
87 type Api = PostApi
88 :<|> GetApi
89
90 type PostApi = Summary "New Corpus endpoint"
91 :> ReqBody '[JSON] Query
92 :> Post '[JSON] CorpusId
93 type GetApi = Get '[JSON] ApiInfo
94 -}
95
96 -- | TODO manage several apis
97 -- TODO-ACCESS
98 -- TODO this is only the POST
99 {-
100 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
101 api uid (Query q _ as) = do
102 cId <- case head as of
103 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
104 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
105 Just a -> do
106 docs <- liftBase $ API.get a q (Just 1000)
107 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
108 pure cId'
109
110 pure cId
111 -}
112
113 ------------------------------------------------
114 -- TODO use this route for Client implementation
115 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
116 deriving (Generic)
117 instance Arbitrary ApiInfo where
118 arbitrary = ApiInfo <$> arbitrary
119
120 deriveJSON (unPrefix "") 'ApiInfo
121
122 instance ToSchema ApiInfo
123
124 info :: FlowCmdM env err m => UserId -> m ApiInfo
125 info _u = pure $ ApiInfo API.externalAPIs
126
127 ------------------------------------------------------------------------
128
129 data Database = Empty
130 | PubMed
131 | HAL
132 | IsTex
133 | Isidore
134 deriving (Eq, Show, Generic)
135
136 deriveJSON (unPrefix "") ''Database
137 instance ToSchema Database
138
139 database2origin :: Database -> DataOrigin
140 database2origin Empty = InternalOrigin T.IsTex
141 database2origin PubMed = ExternalOrigin T.PubMed
142 database2origin HAL = ExternalOrigin T.HAL
143 database2origin IsTex = ExternalOrigin T.IsTex
144 database2origin Isidore = ExternalOrigin T.Isidore
145
146 ------------------------------------------------------------------------
147 data WithQuery = WithQuery
148 { _wq_query :: !Text
149 , _wq_databases :: !Database
150 , _wq_lang :: !Lang
151 , _wq_node_id :: !Int
152 }
153 deriving Generic
154
155 makeLenses ''WithQuery
156 instance FromJSON WithQuery where
157 parseJSON = genericParseJSON $ jsonOptions "_wq_"
158 instance ToJSON WithQuery where
159 toJSON = genericToJSON $ jsonOptions "_wq_"
160 instance ToSchema WithQuery where
161 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
162
163 ------------------------------------------------------------------------
164
165 type AddWithQuery = Summary "Add with Query to corpus endpoint"
166 :> "corpus"
167 :> Capture "corpus_id" CorpusId
168 :> "query"
169 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
170
171 {-
172 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
173 :> "corpus"
174 :> Capture "corpus_id" CorpusId
175 :> "add"
176 :> "file"
177 :> MultipartForm Mem (MultipartData Mem)
178 :> QueryParam "fileType" FileType
179 :> "async"
180 :> AsyncJobs JobLog '[JSON] () JobLog
181 -}
182
183
184 ------------------------------------------------------------------------
185 -- TODO WithQuery also has a corpus id
186 addToCorpusWithQuery :: FlowCmdM env err m
187 => User
188 -> CorpusId
189 -> WithQuery
190 -> Maybe Integer
191 -> (JobLog -> m ())
192 -> m JobLog
193 addToCorpusWithQuery user cid (WithQuery q dbs l _nid) maybeLimit logStatus = do
194 -- TODO ...
195 logStatus JobLog { _scst_succeeded = Just 0
196 , _scst_failed = Just 0
197 , _scst_remaining = Just 5
198 , _scst_events = Just []
199 }
200 printDebug "addToCorpusWithQuery" (cid, dbs)
201 -- TODO add cid
202 -- TODO if cid is folder -> create Corpus
203 -- if cid is corpus -> add to corpus
204 -- if cid is root -> create corpus in Private
205 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
206
207 logStatus JobLog { _scst_succeeded = Just 2
208 , _scst_failed = Just 0
209 , _scst_remaining = Just 1
210 , _scst_events = Just []
211 }
212
213 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
214 printDebug "corpus id" cids
215 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
216 sendMail user
217 -- TODO ...
218 pure JobLog { _scst_succeeded = Just 3
219 , _scst_failed = Just 0
220 , _scst_remaining = Just 0
221 , _scst_events = Just []
222 }
223
224
225 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
226 :> "corpus"
227 :> Capture "corpus_id" CorpusId
228 :> "add"
229 :> "form"
230 :> "async"
231 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
232
233 addToCorpusWithForm :: FlowCmdM env err m
234 => User
235 -> CorpusId
236 -> NewWithForm
237 -> (JobLog -> m ())
238 -> m JobLog
239 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
240
241 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
242 printDebug "[addToCorpusWithForm] fileType" ft
243 logStatus JobLog { _scst_succeeded = Just 0
244 , _scst_failed = Just 0
245 , _scst_remaining = Just 2
246 , _scst_events = Just []
247 }
248 let
249 parse = case ft of
250 CSV_HAL -> Parser.parseFormat Parser.CsvHal
251 CSV -> Parser.parseFormat Parser.CsvGargV3
252 WOS -> Parser.parseFormat Parser.WOS
253 PresseRIS -> Parser.parseFormat Parser.RisPresse
254
255 -- TODO granularity of the logStatus
256 docs <- liftBase $ splitEvery 500
257 <$> take 1000000
258 <$> parse (cs d)
259
260 printDebug "Parsing corpus finished : " cid
261 logStatus JobLog { _scst_succeeded = Just 1
262 , _scst_failed = Just 0
263 , _scst_remaining = Just 1
264 , _scst_events = Just []
265 }
266
267
268 printDebug "Starting extraction : " cid
269 -- TODO granularity of the logStatus
270 _cid' <- flowCorpus user
271 (Right [cid])
272 (Multi $ fromMaybe EN l)
273 (map (map toHyperdataDocument) docs)
274
275 printDebug "Extraction finished : " cid
276 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
277 sendMail user
278
279 pure JobLog { _scst_succeeded = Just 2
280 , _scst_failed = Just 0
281 , _scst_remaining = Just 0
282 , _scst_events = Just []
283 }
284
285 {-
286 addToCorpusWithFile :: FlowCmdM env err m
287 => CorpusId
288 -> MultipartData Mem
289 -> Maybe FileType
290 -> (JobLog -> m ())
291 -> m JobLog
292 addToCorpusWithFile cid input filetype logStatus = do
293 logStatus JobLog { _scst_succeeded = Just 10
294 , _scst_failed = Just 2
295 , _scst_remaining = Just 138
296 , _scst_events = Just []
297 }
298 printDebug "addToCorpusWithFile" cid
299 _h <- postUpload cid filetype input
300
301 pure JobLog { _scst_succeeded = Just 137
302 , _scst_failed = Just 13
303 , _scst_remaining = Just 0
304 , _scst_events = Just []
305 }
306 -}
307
308
309
310 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
311 :> "corpus"
312 :> Capture "corpus_id" CorpusId
313 :> "add"
314 :> "file"
315 :> "async"
316 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
317
318 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
319 => User
320 -> CorpusId
321 -> NewWithFile
322 -> (JobLog -> m ())
323 -> m JobLog
324 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
325
326 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
327 logStatus JobLog { _scst_succeeded = Just 0
328 , _scst_failed = Just 0
329 , _scst_remaining = Just 1
330 , _scst_events = Just []
331 }
332
333 fPath <- GargDB.writeFile nwf
334 printDebug "[addToCorpusWithFile] File saved as: " fPath
335
336 uId <- getUserId user
337 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
338
339 _ <- case nIds of
340 [nId] -> do
341 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
342 let hl = node ^. node_hyperdata
343 _ <- updateHyperdata nId $ hl { _hff_name = fName
344 , _hff_path = T.pack fPath }
345
346 printDebug "[addToCorpusWithFile] Created node with id: " nId
347 _ -> pure ()
348
349 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
350
351 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
352 sendMail user
353
354 pure $ JobLog { _scst_succeeded = Just 1
355 , _scst_failed = Just 0
356 , _scst_remaining = Just 0
357 , _scst_events = Just []
358 }