]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[FIX] merge
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Control.Lens hiding (elements, Empty)
22 import Data.Aeson
23 import Data.Aeson.TH (deriveJSON)
24 import Data.Either
25 import Data.Maybe (fromMaybe)
26 import Data.Swagger
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import qualified Prelude as Prelude
31 import Protolude (readFile)
32 import Servant
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
37
38 import Gargantext.Prelude
39
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
43 import Gargantext.API.Node.Corpus.New.File
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import qualified Gargantext.Core.Text.Corpus.API as API
49 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
50 import Gargantext.Core.Types.Individu (User(..))
51 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
52 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
53 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
54 import Gargantext.Database.Action.Mail (sendMail)
55 import Gargantext.Database.Action.Node (mkNodeWithParent)
56 import Gargantext.Database.Action.User (getUserId)
57 import Gargantext.Database.Admin.Types.Hyperdata
58 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
59 import Gargantext.Database.Query.Table.Node (getNodeWith)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import qualified Gargantext.Database.GargDB as GargDB
63
64 ------------------------------------------------------------------------
65 {-
66 data Query = Query { query_query :: Text
67 , query_node_id :: Int
68 , query_lang :: Lang
69 , query_databases :: [DataOrigin]
70 }
71 deriving (Eq, Generic)
72
73 deriveJSON (unPrefix "query_") 'Query
74
75 instance Arbitrary Query where
76 arbitrary = elements [ Query q n la fs
77 | q <- ["honeybee* AND collapse"
78 ,"covid 19"
79 ]
80 , n <- [0..10]
81 , la <- allLangs
82 , fs <- take 3 $ repeat allDataOrigins
83 ]
84
85 instance ToSchema Query where
86 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
87 -}
88
89 ------------------------------------------------------------------------
90
91 {-
92 type Api = PostApi
93 :<|> GetApi
94
95 type PostApi = Summary "New Corpus endpoint"
96 :> ReqBody '[JSON] Query
97 :> Post '[JSON] CorpusId
98 type GetApi = Get '[JSON] ApiInfo
99 -}
100
101 -- | TODO manage several apis
102 -- TODO-ACCESS
103 -- TODO this is only the POST
104 {-
105 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
106 api uid (Query q _ as) = do
107 cId <- case head as of
108 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
109 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
110 Just a -> do
111 docs <- liftBase $ API.get a q (Just 1000)
112 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
113 pure cId'
114
115 pure cId
116 -}
117
118 ------------------------------------------------
119 -- TODO use this route for Client implementation
120 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
121 deriving (Generic)
122 instance Arbitrary ApiInfo where
123 arbitrary = ApiInfo <$> arbitrary
124
125 deriveJSON (unPrefix "") 'ApiInfo
126
127 instance ToSchema ApiInfo
128
129 info :: FlowCmdM env err m => UserId -> m ApiInfo
130 info _u = pure $ ApiInfo API.externalAPIs
131
132 ------------------------------------------------------------------------
133 ------------------------------------------------------------------------
134 data WithQuery = WithQuery
135 { _wq_query :: !Text
136 , _wq_databases :: !Database
137 , _wq_datafield :: !Datafield
138 , _wq_lang :: !Lang
139 , _wq_node_id :: !Int
140 }
141 deriving Generic
142
143 makeLenses ''WithQuery
144 instance FromJSON WithQuery where
145 parseJSON = genericParseJSON $ jsonOptions "_wq_"
146 instance ToJSON WithQuery where
147 toJSON = genericToJSON $ jsonOptions "_wq_"
148 instance ToSchema WithQuery where
149 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
150
151 ------------------------------------------------------------------------
152
153 type AddWithQuery = Summary "Add with Query to corpus endpoint"
154 :> "corpus"
155 :> Capture "corpus_id" CorpusId
156 :> "query"
157 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
158
159 {-
160 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
161 :> "corpus"
162 :> Capture "corpus_id" CorpusId
163 :> "add"
164 :> "file"
165 :> MultipartForm Mem (MultipartData Mem)
166 :> QueryParam "fileType" FileType
167 :> "async"
168 :> AsyncJobs JobLog '[JSON] () JobLog
169 -}
170
171
172 ------------------------------------------------------------------------
173 -- TODO WithQuery also has a corpus id
174 addToCorpusWithQuery :: FlowCmdM env err m
175 => User
176 -> CorpusId
177 -> WithQuery
178 -> Maybe Integer
179 -> (JobLog -> m ())
180 -> m JobLog
181 addToCorpusWithQuery user cid (WithQuery q dbs datafield l _nid) maybeLimit logStatus = do
182 -- TODO ...
183 logStatus JobLog { _scst_succeeded = Just 0
184 , _scst_failed = Just 0
185 , _scst_remaining = Just 3
186 , _scst_events = Just []
187 }
188 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
189 printDebug "[addToCorpusWithQuery] datafield" datafield
190
191 case datafield of
192 Web -> do
193 printDebug "[addToCorpusWithQuery] processing web request" datafield
194
195 _ <- triggerSearxSearch cid q l
196
197 pure JobLog { _scst_succeeded = Just 3
198 , _scst_failed = Just 0
199 , _scst_remaining = Just 0
200 , _scst_events = Just []
201 }
202
203 _ -> do
204 -- TODO add cid
205 -- TODO if cid is folder -> create Corpus
206 -- if cid is corpus -> add to corpus
207 -- if cid is root -> create corpus in Private
208 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
209
210 logStatus JobLog { _scst_succeeded = Just 2
211 , _scst_failed = Just 0
212 , _scst_remaining = Just 1
213 , _scst_events = Just []
214 }
215
216 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
217 printDebug "corpus id" cids
218 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
219 sendMail user
220 -- TODO ...
221 pure JobLog { _scst_succeeded = Just 3
222 , _scst_failed = Just 0
223 , _scst_remaining = Just 0
224 , _scst_events = Just []
225 }
226
227
228 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
229 :> "corpus"
230 :> Capture "corpus_id" CorpusId
231 :> "add"
232 :> "form"
233 :> "async"
234 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
235
236 addToCorpusWithForm :: FlowCmdM env err m
237 => User
238 -> CorpusId
239 -> NewWithForm
240 -> (JobLog -> m ())
241 -> JobLog
242 -> m JobLog
243 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
244 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
245 printDebug "[addToCorpusWithForm] fileType" ft
246 logStatus jobLog
247 let
248 parse = case ft of
249 CSV_HAL -> Parser.parseFormat Parser.CsvHal
250 CSV -> Parser.parseFormat Parser.CsvGargV3
251 WOS -> Parser.parseFormat Parser.WOS
252 PresseRIS -> Parser.parseFormat Parser.RisPresse
253
254 -- TODO granularity of the logStatus
255 eDocs <- liftBase $ parse $ cs d
256 case eDocs of
257 Right docs' -> do
258 let docs = splitEvery 500 $ take 1000000 docs'
259
260 printDebug "Parsing corpus finished : " cid
261 logStatus jobLog2
262
263 printDebug "Starting extraction : " cid
264 -- TODO granularity of the logStatus
265 _cid' <- flowCorpus user
266 (Right [cid])
267 (Multi $ fromMaybe EN l)
268 (map (map toHyperdataDocument) docs)
269
270 printDebug "Extraction finished : " cid
271 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
272 sendMail user
273
274 logStatus jobLog3
275 pure $ jobLog3
276 Left e -> do
277 printDebug "Error" e
278
279 logStatus jobLogE
280 pure jobLogE
281 where
282 jobLog2 = jobLogSuccess jobLog
283 jobLog3 = jobLogSuccess jobLog2
284 jobLogE = jobLogFailTotal jobLog
285
286 parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
287 parseCsvGargV3Path fp = do
288 contents <- readFile fp
289 Parser.parseFormat Parser.CsvGargV3 $ cs contents
290
291 {-
292 addToCorpusWithFile :: FlowCmdM env err m
293 => CorpusId
294 -> MultipartData Mem
295 -> Maybe FileType
296 -> (JobLog -> m ())
297 -> m JobLog
298 addToCorpusWithFile cid input filetype logStatus = do
299 logStatus JobLog { _scst_succeeded = Just 10
300 , _scst_failed = Just 2
301 , _scst_remaining = Just 138
302 , _scst_events = Just []
303 }
304 printDebug "addToCorpusWithFile" cid
305 _h <- postUpload cid filetype input
306
307 pure JobLog { _scst_succeeded = Just 137
308 , _scst_failed = Just 13
309 , _scst_remaining = Just 0
310 , _scst_events = Just []
311 }
312 -}
313
314
315
316 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
317 :> "corpus"
318 :> Capture "corpus_id" CorpusId
319 :> "add"
320 :> "file"
321 :> "async"
322 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
323
324 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
325 => User
326 -> CorpusId
327 -> NewWithFile
328 -> (JobLog -> m ())
329 -> m JobLog
330 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
331
332 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
333 logStatus JobLog { _scst_succeeded = Just 0
334 , _scst_failed = Just 0
335 , _scst_remaining = Just 1
336 , _scst_events = Just []
337 }
338
339 fPath <- GargDB.writeFile nwf
340 printDebug "[addToCorpusWithFile] File saved as: " fPath
341
342 uId <- getUserId user
343 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
344
345 _ <- case nIds of
346 [nId] -> do
347 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
348 let hl = node ^. node_hyperdata
349 _ <- updateHyperdata nId $ hl { _hff_name = fName
350 , _hff_path = T.pack fPath }
351
352 printDebug "[addToCorpusWithFile] Created node with id: " nId
353 _ -> pure ()
354
355 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
356
357 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
358 sendMail user
359
360 pure $ JobLog { _scst_succeeded = Just 1
361 , _scst_failed = Just 0
362 , _scst_remaining = Just 0
363 , _scst_events = Just []
364 }