refactoring 2021, remove old phylo files, keep API
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Control.Lens hiding (elements, Empty)
22 import Data.Aeson
23 import Data.Aeson.TH (deriveJSON)
24 import Data.Either
25 import Data.Maybe (fromMaybe)
26 import Data.Swagger
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import Servant
31 import Servant.Job.Utils (jsonOptions)
32 -- import Servant.Multipart
33 -- import Test.QuickCheck (elements)
34 import Test.QuickCheck.Arbitrary
35
36 import Gargantext.Prelude
37
38 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
39 import qualified Gargantext.API.Admin.Orchestrator.Types as T
40 import Gargantext.API.Admin.Types (HasSettings)
41 import Gargantext.API.Node.Corpus.New.File
42 import Gargantext.API.Node.Types
43 import Gargantext.Core (Lang(..){-, allLangs-})
44 import Gargantext.Core.Types.Individu (User(..))
45 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
46 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
47 import Gargantext.Database.Action.User (getUserId)
48 import Gargantext.Database.Action.Node (mkNodeWithParent)
49 import Gargantext.Database.Admin.Types.Hyperdata
50 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
51 import Gargantext.Database.Query.Table.Node (getNodeWith)
52 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
53 import Gargantext.Database.Schema.Node (node_hyperdata)
54 import qualified Gargantext.Prelude.Utils as GPU
55 import qualified Gargantext.Core.Text.Corpus.API as API
56 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
57
58 ------------------------------------------------------------------------
59 {-
60 data Query = Query { query_query :: Text
61 , query_node_id :: Int
62 , query_lang :: Lang
63 , query_databases :: [DataOrigin]
64 }
65 deriving (Eq, Generic)
66
67 deriveJSON (unPrefix "query_") 'Query
68
69 instance Arbitrary Query where
70 arbitrary = elements [ Query q n la fs
71 | q <- ["honeybee* AND collapse"
72 ,"covid 19"
73 ]
74 , n <- [0..10]
75 , la <- allLangs
76 , fs <- take 3 $ repeat allDataOrigins
77 ]
78
79 instance ToSchema Query where
80 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
81 -}
82
83 ------------------------------------------------------------------------
84
85 {-
86 type Api = PostApi
87 :<|> GetApi
88
89 type PostApi = Summary "New Corpus endpoint"
90 :> ReqBody '[JSON] Query
91 :> Post '[JSON] CorpusId
92 type GetApi = Get '[JSON] ApiInfo
93 -}
94
95 -- | TODO manage several apis
96 -- TODO-ACCESS
97 -- TODO this is only the POST
98 {-
99 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
100 api uid (Query q _ as) = do
101 cId <- case head as of
102 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
103 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
104 Just a -> do
105 docs <- liftBase $ API.get a q (Just 1000)
106 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
107 pure cId'
108
109 pure cId
110 -}
111
112 ------------------------------------------------
113 -- TODO use this route for Client implementation
114 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
115 deriving (Generic)
116 instance Arbitrary ApiInfo where
117 arbitrary = ApiInfo <$> arbitrary
118
119 deriveJSON (unPrefix "") 'ApiInfo
120
121 instance ToSchema ApiInfo
122
123 info :: FlowCmdM env err m => UserId -> m ApiInfo
124 info _u = pure $ ApiInfo API.externalAPIs
125
126 ------------------------------------------------------------------------
127
128 data Database = Empty
129 | PubMed
130 | HAL
131 | IsTex
132 | Isidore
133 deriving (Eq, Show, Generic)
134
135 deriveJSON (unPrefix "") ''Database
136 instance ToSchema Database
137
138 database2origin :: Database -> DataOrigin
139 database2origin Empty = InternalOrigin T.IsTex
140 database2origin PubMed = ExternalOrigin T.PubMed
141 database2origin HAL = ExternalOrigin T.HAL
142 database2origin IsTex = ExternalOrigin T.IsTex
143 database2origin Isidore = ExternalOrigin T.Isidore
144
145 ------------------------------------------------------------------------
146 data WithQuery = WithQuery
147 { _wq_query :: !Text
148 , _wq_databases :: !Database
149 , _wq_lang :: !Lang
150 , _wq_node_id :: !Int
151 }
152 deriving Generic
153
154 makeLenses ''WithQuery
155 instance FromJSON WithQuery where
156 parseJSON = genericParseJSON $ jsonOptions "_wq_"
157 instance ToSchema WithQuery where
158 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
159
160 ------------------------------------------------------------------------
161
162 type AddWithQuery = Summary "Add with Query to corpus endpoint"
163 :> "corpus"
164 :> Capture "corpus_id" CorpusId
165 :> "query"
166 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
167
168 {-
169 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
170 :> "corpus"
171 :> Capture "corpus_id" CorpusId
172 :> "add"
173 :> "file"
174 :> MultipartForm Mem (MultipartData Mem)
175 :> QueryParam "fileType" FileType
176 :> "async"
177 :> AsyncJobs JobLog '[JSON] () JobLog
178 -}
179
180
181 ------------------------------------------------------------------------
182 -- TODO WithQuery also has a corpus id
183 addToCorpusWithQuery :: FlowCmdM env err m
184 => User
185 -> CorpusId
186 -> WithQuery
187 -> Maybe Integer
188 -> (JobLog -> m ())
189 -> m JobLog
190 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) maybeLimit logStatus = do
191 -- TODO ...
192 logStatus JobLog { _scst_succeeded = Just 0
193 , _scst_failed = Just 0
194 , _scst_remaining = Just 5
195 , _scst_events = Just []
196 }
197 printDebug "addToCorpusWithQuery" (cid, dbs)
198 -- TODO add cid
199 -- TODO if cid is folder -> create Corpus
200 -- if cid is corpus -> add to corpus
201 -- if cid is root -> create corpus in Private
202 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
203
204 logStatus JobLog { _scst_succeeded = Just 2
205 , _scst_failed = Just 0
206 , _scst_remaining = Just 1
207 , _scst_events = Just []
208 }
209
210 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
211 printDebug "corpus id" cids
212 -- TODO ...
213 pure JobLog { _scst_succeeded = Just 3
214 , _scst_failed = Just 0
215 , _scst_remaining = Just 0
216 , _scst_events = Just []
217 }
218
219
220 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
221 :> "corpus"
222 :> Capture "corpus_id" CorpusId
223 :> "add"
224 :> "form"
225 :> "async"
226 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
227
228 addToCorpusWithForm :: FlowCmdM env err m
229 => User
230 -> CorpusId
231 -> NewWithForm
232 -> (JobLog -> m ())
233 -> m JobLog
234 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
235
236 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
237 printDebug "[addToCorpusWithForm] fileType" ft
238 logStatus JobLog { _scst_succeeded = Just 0
239 , _scst_failed = Just 0
240 , _scst_remaining = Just 2
241 , _scst_events = Just []
242 }
243 let
244 parse = case ft of
245 CSV_HAL -> Parser.parseFormat Parser.CsvHal
246 CSV -> Parser.parseFormat Parser.CsvGargV3
247 WOS -> Parser.parseFormat Parser.WOS
248 PresseRIS -> Parser.parseFormat Parser.RisPresse
249
250 -- TODO granularity of the logStatus
251 docs <- liftBase $ splitEvery 500
252 <$> take 1000000
253 <$> parse (cs d)
254
255 printDebug "Parsing corpus finished : " cid
256 logStatus JobLog { _scst_succeeded = Just 1
257 , _scst_failed = Just 0
258 , _scst_remaining = Just 1
259 , _scst_events = Just []
260 }
261
262
263 printDebug "Starting extraction : " cid
264 -- TODO granularity of the logStatus
265 _cid' <- flowCorpus user
266 (Right [cid])
267 (Multi $ fromMaybe EN l)
268 (map (map toHyperdataDocument) docs)
269
270 printDebug "Extraction finished : " cid
271 pure JobLog { _scst_succeeded = Just 2
272 , _scst_failed = Just 0
273 , _scst_remaining = Just 0
274 , _scst_events = Just []
275 }
276
277 {-
278 addToCorpusWithFile :: FlowCmdM env err m
279 => CorpusId
280 -> MultipartData Mem
281 -> Maybe FileType
282 -> (JobLog -> m ())
283 -> m JobLog
284 addToCorpusWithFile cid input filetype logStatus = do
285 logStatus JobLog { _scst_succeeded = Just 10
286 , _scst_failed = Just 2
287 , _scst_remaining = Just 138
288 , _scst_events = Just []
289 }
290 printDebug "addToCorpusWithFile" cid
291 _h <- postUpload cid filetype input
292
293 pure JobLog { _scst_succeeded = Just 137
294 , _scst_failed = Just 13
295 , _scst_remaining = Just 0
296 , _scst_events = Just []
297 }
298 -}
299
300
301
302 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
303 :> "corpus"
304 :> Capture "corpus_id" CorpusId
305 :> "add"
306 :> "file"
307 :> "async"
308 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
309
310 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
311 => User
312 -> CorpusId
313 -> NewWithFile
314 -> (JobLog -> m ())
315 -> m JobLog
316 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
317
318 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
319 logStatus JobLog { _scst_succeeded = Just 0
320 , _scst_failed = Just 0
321 , _scst_remaining = Just 1
322 , _scst_events = Just []
323 }
324
325 fPath <- GPU.writeFile nwf
326 printDebug "[addToCorpusWithFile] File saved as: " fPath
327
328 uId <- getUserId user
329 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
330
331 _ <- case nIds of
332 [nId] -> do
333 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
334 let hl = node ^. node_hyperdata
335 _ <- updateHyperdata nId $ hl { _hff_name = fName
336 , _hff_path = T.pack fPath }
337
338 printDebug "[addToCorpusWithFile] Created node with id: " nId
339 _ -> pure ()
340
341 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
342 pure $ JobLog { _scst_succeeded = Just 1
343 , _scst_failed = Just 0
344 , _scst_remaining = Just 0
345 , _scst_events = Just []
346 }