]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[FIX] clustering, order 2 similarity, ok
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Control.Lens hiding (elements, Empty)
22 import Data.Aeson
23 import Data.Aeson.TH (deriveJSON)
24 import Data.Either
25 import Data.Maybe (fromMaybe)
26 import Data.Swagger
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import Servant
31 import Servant.Job.Utils (jsonOptions)
32 -- import Servant.Multipart
33 -- import Test.QuickCheck (elements)
34 import Test.QuickCheck.Arbitrary
35
36 import Gargantext.Prelude
37
38 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
39 import qualified Gargantext.API.Admin.Orchestrator.Types as T
40 import Gargantext.API.Admin.Types (HasSettings)
41 import Gargantext.API.Node.Corpus.New.File
42 import Gargantext.API.Node.Types
43 import Gargantext.Core (Lang(..){-, allLangs-})
44 import Gargantext.Database.Action.Mail (sendMail)
45 import Gargantext.Core.Types.Individu (User(..))
46 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
47 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
48 import Gargantext.Database.Action.User (getUserId)
49 import Gargantext.Database.Action.Node (mkNodeWithParent)
50 import Gargantext.Database.Admin.Types.Hyperdata
51 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
52 import Gargantext.Database.Query.Table.Node (getNodeWith)
53 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
54 import Gargantext.Database.Schema.Node (node_hyperdata)
55 import qualified Gargantext.Prelude.Utils as GPU
56 import qualified Gargantext.Core.Text.Corpus.API as API
57 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
58
59 ------------------------------------------------------------------------
60 {-
61 data Query = Query { query_query :: Text
62 , query_node_id :: Int
63 , query_lang :: Lang
64 , query_databases :: [DataOrigin]
65 }
66 deriving (Eq, Generic)
67
68 deriveJSON (unPrefix "query_") 'Query
69
70 instance Arbitrary Query where
71 arbitrary = elements [ Query q n la fs
72 | q <- ["honeybee* AND collapse"
73 ,"covid 19"
74 ]
75 , n <- [0..10]
76 , la <- allLangs
77 , fs <- take 3 $ repeat allDataOrigins
78 ]
79
80 instance ToSchema Query where
81 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
82 -}
83
84 ------------------------------------------------------------------------
85
86 {-
87 type Api = PostApi
88 :<|> GetApi
89
90 type PostApi = Summary "New Corpus endpoint"
91 :> ReqBody '[JSON] Query
92 :> Post '[JSON] CorpusId
93 type GetApi = Get '[JSON] ApiInfo
94 -}
95
96 -- | TODO manage several apis
97 -- TODO-ACCESS
98 -- TODO this is only the POST
99 {-
100 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
101 api uid (Query q _ as) = do
102 cId <- case head as of
103 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
104 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
105 Just a -> do
106 docs <- liftBase $ API.get a q (Just 1000)
107 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
108 pure cId'
109
110 pure cId
111 -}
112
113 ------------------------------------------------
114 -- TODO use this route for Client implementation
115 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
116 deriving (Generic)
117 instance Arbitrary ApiInfo where
118 arbitrary = ApiInfo <$> arbitrary
119
120 deriveJSON (unPrefix "") 'ApiInfo
121
122 instance ToSchema ApiInfo
123
124 info :: FlowCmdM env err m => UserId -> m ApiInfo
125 info _u = pure $ ApiInfo API.externalAPIs
126
127 ------------------------------------------------------------------------
128
129 data Database = Empty
130 | PubMed
131 | HAL
132 | IsTex
133 | Isidore
134 deriving (Eq, Show, Generic)
135
136 deriveJSON (unPrefix "") ''Database
137 instance ToSchema Database
138
139 database2origin :: Database -> DataOrigin
140 database2origin Empty = InternalOrigin T.IsTex
141 database2origin PubMed = ExternalOrigin T.PubMed
142 database2origin HAL = ExternalOrigin T.HAL
143 database2origin IsTex = ExternalOrigin T.IsTex
144 database2origin Isidore = ExternalOrigin T.Isidore
145
146 ------------------------------------------------------------------------
147 data WithQuery = WithQuery
148 { _wq_query :: !Text
149 , _wq_databases :: !Database
150 , _wq_lang :: !Lang
151 , _wq_node_id :: !Int
152 }
153 deriving Generic
154
155 makeLenses ''WithQuery
156 instance FromJSON WithQuery where
157 parseJSON = genericParseJSON $ jsonOptions "_wq_"
158 instance ToSchema WithQuery where
159 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
160
161 ------------------------------------------------------------------------
162
163 type AddWithQuery = Summary "Add with Query to corpus endpoint"
164 :> "corpus"
165 :> Capture "corpus_id" CorpusId
166 :> "query"
167 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
168
169 {-
170 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
171 :> "corpus"
172 :> Capture "corpus_id" CorpusId
173 :> "add"
174 :> "file"
175 :> MultipartForm Mem (MultipartData Mem)
176 :> QueryParam "fileType" FileType
177 :> "async"
178 :> AsyncJobs JobLog '[JSON] () JobLog
179 -}
180
181
182 ------------------------------------------------------------------------
183 -- TODO WithQuery also has a corpus id
184 addToCorpusWithQuery :: FlowCmdM env err m
185 => User
186 -> CorpusId
187 -> WithQuery
188 -> Maybe Integer
189 -> (JobLog -> m ())
190 -> m JobLog
191 addToCorpusWithQuery user cid (WithQuery q dbs l _nid) maybeLimit logStatus = do
192 -- TODO ...
193 logStatus JobLog { _scst_succeeded = Just 0
194 , _scst_failed = Just 0
195 , _scst_remaining = Just 5
196 , _scst_events = Just []
197 }
198 printDebug "addToCorpusWithQuery" (cid, dbs)
199 -- TODO add cid
200 -- TODO if cid is folder -> create Corpus
201 -- if cid is corpus -> add to corpus
202 -- if cid is root -> create corpus in Private
203 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
204
205 logStatus JobLog { _scst_succeeded = Just 2
206 , _scst_failed = Just 0
207 , _scst_remaining = Just 1
208 , _scst_events = Just []
209 }
210
211 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
212 printDebug "corpus id" cids
213 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
214 sendMail user
215 -- TODO ...
216 pure JobLog { _scst_succeeded = Just 3
217 , _scst_failed = Just 0
218 , _scst_remaining = Just 0
219 , _scst_events = Just []
220 }
221
222
223 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
224 :> "corpus"
225 :> Capture "corpus_id" CorpusId
226 :> "add"
227 :> "form"
228 :> "async"
229 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
230
231 addToCorpusWithForm :: FlowCmdM env err m
232 => User
233 -> CorpusId
234 -> NewWithForm
235 -> (JobLog -> m ())
236 -> m JobLog
237 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
238
239 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
240 printDebug "[addToCorpusWithForm] fileType" ft
241 logStatus JobLog { _scst_succeeded = Just 0
242 , _scst_failed = Just 0
243 , _scst_remaining = Just 2
244 , _scst_events = Just []
245 }
246 let
247 parse = case ft of
248 CSV_HAL -> Parser.parseFormat Parser.CsvHal
249 CSV -> Parser.parseFormat Parser.CsvGargV3
250 WOS -> Parser.parseFormat Parser.WOS
251 PresseRIS -> Parser.parseFormat Parser.RisPresse
252
253 -- TODO granularity of the logStatus
254 docs <- liftBase $ splitEvery 500
255 <$> take 1000000
256 <$> parse (cs d)
257
258 printDebug "Parsing corpus finished : " cid
259 logStatus JobLog { _scst_succeeded = Just 1
260 , _scst_failed = Just 0
261 , _scst_remaining = Just 1
262 , _scst_events = Just []
263 }
264
265
266 printDebug "Starting extraction : " cid
267 -- TODO granularity of the logStatus
268 _cid' <- flowCorpus user
269 (Right [cid])
270 (Multi $ fromMaybe EN l)
271 (map (map toHyperdataDocument) docs)
272
273 printDebug "Extraction finished : " cid
274 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
275 sendMail user
276
277 pure JobLog { _scst_succeeded = Just 2
278 , _scst_failed = Just 0
279 , _scst_remaining = Just 0
280 , _scst_events = Just []
281 }
282
283 {-
284 addToCorpusWithFile :: FlowCmdM env err m
285 => CorpusId
286 -> MultipartData Mem
287 -> Maybe FileType
288 -> (JobLog -> m ())
289 -> m JobLog
290 addToCorpusWithFile cid input filetype logStatus = do
291 logStatus JobLog { _scst_succeeded = Just 10
292 , _scst_failed = Just 2
293 , _scst_remaining = Just 138
294 , _scst_events = Just []
295 }
296 printDebug "addToCorpusWithFile" cid
297 _h <- postUpload cid filetype input
298
299 pure JobLog { _scst_succeeded = Just 137
300 , _scst_failed = Just 13
301 , _scst_remaining = Just 0
302 , _scst_events = Just []
303 }
304 -}
305
306
307
308 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
309 :> "corpus"
310 :> Capture "corpus_id" CorpusId
311 :> "add"
312 :> "file"
313 :> "async"
314 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
315
316 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
317 => User
318 -> CorpusId
319 -> NewWithFile
320 -> (JobLog -> m ())
321 -> m JobLog
322 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
323
324 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
325 logStatus JobLog { _scst_succeeded = Just 0
326 , _scst_failed = Just 0
327 , _scst_remaining = Just 1
328 , _scst_events = Just []
329 }
330
331 fPath <- GPU.writeFile nwf
332 printDebug "[addToCorpusWithFile] File saved as: " fPath
333
334 uId <- getUserId user
335 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
336
337 _ <- case nIds of
338 [nId] -> do
339 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
340 let hl = node ^. node_hyperdata
341 _ <- updateHyperdata nId $ hl { _hff_name = fName
342 , _hff_path = T.pack fPath }
343
344 printDebug "[addToCorpusWithFile] Created node with id: " nId
345 _ -> pure ()
346
347 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
348
349 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
350 sendMail user
351
352 pure $ JobLog { _scst_succeeded = Just 1
353 , _scst_failed = Just 0
354 , _scst_remaining = Just 0
355 , _scst_events = Just []
356 }