]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[Merge] dev -> dev-phylo ready
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Control.Lens hiding (elements, Empty)
22 import Data.Aeson
23 import Data.Aeson.TH (deriveJSON)
24 import Data.Either
25 import Data.Maybe (fromMaybe)
26 import Data.Swagger
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import Servant
31 import Servant.Job.Utils (jsonOptions)
32 -- import Servant.Multipart
33 -- import Test.QuickCheck (elements)
34 import Test.QuickCheck.Arbitrary
35
36 import Gargantext.Prelude
37
38 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
39 import qualified Gargantext.API.Admin.Orchestrator.Types as T
40 import Gargantext.API.Admin.Settings (HasSettings)
41 import Gargantext.API.Node.Corpus.New.File
42 import Gargantext.API.Node.Types
43 import Gargantext.Core (Lang(..){-, allLangs-})
44 import Gargantext.Core.Types.Individu (User(..))
45 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
46 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
47 import Gargantext.Database.Action.Flow.Utils (getUserId)
48 import Gargantext.Database.Action.Node (mkNodeWithParent)
49 import Gargantext.Database.Admin.Types.Hyperdata
50 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
51 import Gargantext.Database.Query.Table.Node (getNodeWith)
52 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
53 import Gargantext.Database.Schema.Node (node_hyperdata)
54 import qualified Gargantext.Prelude.Utils as GPU
55 import qualified Gargantext.Core.Text.Corpus.API as API
56 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
57
58 ------------------------------------------------------------------------
59 {-
60 data Query = Query { query_query :: Text
61 , query_node_id :: Int
62 , query_lang :: Lang
63 , query_databases :: [DataOrigin]
64 }
65 deriving (Eq, Generic)
66
67 deriveJSON (unPrefix "query_") 'Query
68
69 instance Arbitrary Query where
70 arbitrary = elements [ Query q n la fs
71 | q <- ["honeybee* AND collapse"
72 ,"covid 19"
73 ]
74 , n <- [0..10]
75 , la <- allLangs
76 , fs <- take 3 $ repeat allDataOrigins
77 ]
78
79 instance ToSchema Query where
80 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
81 -}
82
83 ------------------------------------------------------------------------
84
85 {-
86 type Api = PostApi
87 :<|> GetApi
88
89 type PostApi = Summary "New Corpus endpoint"
90 :> ReqBody '[JSON] Query
91 :> Post '[JSON] CorpusId
92 type GetApi = Get '[JSON] ApiInfo
93 -}
94
95 -- | TODO manage several apis
96 -- TODO-ACCESS
97 -- TODO this is only the POST
98 {-
99 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
100 api uid (Query q _ as) = do
101 cId <- case head as of
102 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
103 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
104 Just a -> do
105 docs <- liftBase $ API.get a q (Just 1000)
106 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
107 pure cId'
108
109 pure cId
110 -}
111
112 ------------------------------------------------
113 -- TODO use this route for Client implementation
114 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
115 deriving (Generic)
116 instance Arbitrary ApiInfo where
117 arbitrary = ApiInfo <$> arbitrary
118
119 deriveJSON (unPrefix "") 'ApiInfo
120
121 instance ToSchema ApiInfo
122
123 info :: FlowCmdM env err m => UserId -> m ApiInfo
124 info _u = pure $ ApiInfo API.externalAPIs
125
126 ------------------------------------------------------------------------
127
128 data Database = Empty
129 | PubMed
130 | HAL
131 | IsTex
132 | Isidore
133 deriving (Eq, Show, Generic)
134
135 deriveJSON (unPrefix "") ''Database
136 instance ToSchema Database
137
138 database2origin :: Database -> DataOrigin
139 database2origin Empty = InternalOrigin T.IsTex
140 database2origin PubMed = ExternalOrigin T.PubMed
141 database2origin HAL = ExternalOrigin T.HAL
142 database2origin IsTex = ExternalOrigin T.IsTex
143 database2origin Isidore = ExternalOrigin T.Isidore
144
145 ------------------------------------------------------------------------
146 data WithQuery = WithQuery
147 { _wq_query :: !Text
148 , _wq_databases :: !Database
149 , _wq_lang :: !Lang
150 , _wq_node_id :: !Int
151 }
152 deriving Generic
153
154 makeLenses ''WithQuery
155 instance FromJSON WithQuery where
156 parseJSON = genericParseJSON $ jsonOptions "_wq_"
157 instance ToSchema WithQuery where
158 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
159
160 ------------------------------------------------------------------------
161
162 type AddWithQuery = Summary "Add with Query to corpus endpoint"
163 :> "corpus"
164 :> Capture "corpus_id" CorpusId
165 :> "query"
166 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
167
168 {-
169 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
170 :> "corpus"
171 :> Capture "corpus_id" CorpusId
172 :> "add"
173 :> "file"
174 :> MultipartForm Mem (MultipartData Mem)
175 :> QueryParam "fileType" FileType
176 :> "async"
177 :> AsyncJobs JobLog '[JSON] () JobLog
178 -}
179
180
181 ------------------------------------------------------------------------
182 -- TODO WithQuery also has a corpus id
183 addToCorpusWithQuery :: FlowCmdM env err m
184 => User
185 -> CorpusId
186 -> WithQuery
187 -> (JobLog -> m ())
188 -> m JobLog
189 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
190 -- TODO ...
191 logStatus JobLog { _scst_succeeded = Just 0
192 , _scst_failed = Just 0
193 , _scst_remaining = Just 5
194 , _scst_events = Just []
195 }
196 printDebug "addToCorpusWithQuery" (cid, dbs)
197 -- TODO add cid
198 -- TODO if cid is folder -> create Corpus
199 -- if cid is corpus -> add to corpus
200 -- if cid is root -> create corpus in Private
201 txts <- mapM (\db -> getDataText db (Multi l) q Nothing) [database2origin dbs]
202
203 logStatus JobLog { _scst_succeeded = Just 2
204 , _scst_failed = Just 0
205 , _scst_remaining = Just 1
206 , _scst_events = Just []
207 }
208
209 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
210 printDebug "corpus id" cids
211 -- TODO ...
212 pure JobLog { _scst_succeeded = Just 3
213 , _scst_failed = Just 0
214 , _scst_remaining = Just 0
215 , _scst_events = Just []
216 }
217
218
219 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
220 :> "corpus"
221 :> Capture "corpus_id" CorpusId
222 :> "add"
223 :> "form"
224 :> "async"
225 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
226
227 addToCorpusWithForm :: FlowCmdM env err m
228 => User
229 -> CorpusId
230 -> NewWithForm
231 -> (JobLog -> m ())
232 -> m JobLog
233 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
234
235 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
236 printDebug "[addToCorpusWithForm] fileType" ft
237 logStatus JobLog { _scst_succeeded = Just 0
238 , _scst_failed = Just 0
239 , _scst_remaining = Just 2
240 , _scst_events = Just []
241 }
242 let
243 parse = case ft of
244 CSV_HAL -> Parser.parseFormat Parser.CsvHal
245 CSV -> Parser.parseFormat Parser.CsvGargV3
246 WOS -> Parser.parseFormat Parser.WOS
247 PresseRIS -> Parser.parseFormat Parser.RisPresse
248
249 -- TODO granularity of the logStatus
250 docs <- liftBase $ splitEvery 500
251 <$> take 1000000
252 <$> parse (cs d)
253
254 printDebug "Parsing corpus finished : " cid
255 logStatus JobLog { _scst_succeeded = Just 1
256 , _scst_failed = Just 0
257 , _scst_remaining = Just 1
258 , _scst_events = Just []
259 }
260
261
262 printDebug "Starting extraction : " cid
263 -- TODO granularity of the logStatus
264 _cid' <- flowCorpus user
265 (Right [cid])
266 (Multi $ fromMaybe EN l)
267 (map (map toHyperdataDocument) docs)
268
269 printDebug "Extraction finished : " cid
270 pure JobLog { _scst_succeeded = Just 2
271 , _scst_failed = Just 0
272 , _scst_remaining = Just 0
273 , _scst_events = Just []
274 }
275
276 {-
277 addToCorpusWithFile :: FlowCmdM env err m
278 => CorpusId
279 -> MultipartData Mem
280 -> Maybe FileType
281 -> (JobLog -> m ())
282 -> m JobLog
283 addToCorpusWithFile cid input filetype logStatus = do
284 logStatus JobLog { _scst_succeeded = Just 10
285 , _scst_failed = Just 2
286 , _scst_remaining = Just 138
287 , _scst_events = Just []
288 }
289 printDebug "addToCorpusWithFile" cid
290 _h <- postUpload cid filetype input
291
292 pure JobLog { _scst_succeeded = Just 137
293 , _scst_failed = Just 13
294 , _scst_remaining = Just 0
295 , _scst_events = Just []
296 }
297 -}
298
299
300
301 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
302 :> "corpus"
303 :> Capture "corpus_id" CorpusId
304 :> "add"
305 :> "file"
306 :> "async"
307 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
308
309 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
310 => User
311 -> CorpusId
312 -> NewWithFile
313 -> (JobLog -> m ())
314 -> m JobLog
315 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
316
317 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
318 logStatus JobLog { _scst_succeeded = Just 0
319 , _scst_failed = Just 0
320 , _scst_remaining = Just 1
321 , _scst_events = Just []
322 }
323
324 fPath <- GPU.writeFile nwf
325 printDebug "[addToCorpusWithFile] File saved as: " fPath
326
327 uId <- getUserId user
328 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
329
330 _ <- case nIds of
331 [nId] -> do
332 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
333 let hl = node ^. node_hyperdata
334 _ <- updateHyperdata nId $ hl { _hff_name = fName
335 , _hff_path = T.pack fPath }
336
337 printDebug "[addToCorpusWithFile] Created node with id: " nId
338 _ -> pure ()
339
340 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
341 pure $ JobLog { _scst_succeeded = Just 1
342 , _scst_failed = Just 0
343 , _scst_remaining = Just 0
344 , _scst_events = Just []
345 }