]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
Merge branch 'dev-optim' into dev
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Control.Lens hiding (elements, Empty)
22 import Data.Aeson
23 import Data.Aeson.TH (deriveJSON)
24 import Data.Either
25 import Data.Maybe (fromMaybe)
26 import Data.Swagger
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import qualified Prelude as Prelude
31 import Protolude (readFile)
32 import Servant
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
37
38 import Gargantext.Prelude
39
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
43 import Gargantext.API.Node.Corpus.New.File
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import qualified Gargantext.Core.Text.Corpus.API as API
49 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
50 import Gargantext.Core.Types.Individu (User(..))
51 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
52 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
53 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
54 import Gargantext.Database.Action.Mail (sendMail)
55 import Gargantext.Database.Action.Node (mkNodeWithParent)
56 import Gargantext.Database.Action.User (getUserId)
57 import Gargantext.Database.Admin.Types.Hyperdata
58 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
59 import Gargantext.Database.Query.Table.Node (getNodeWith)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import qualified Gargantext.Database.GargDB as GargDB
63
64 ------------------------------------------------------------------------
65 {-
66 data Query = Query { query_query :: Text
67 , query_node_id :: Int
68 , query_lang :: Lang
69 , query_databases :: [DataOrigin]
70 }
71 deriving (Eq, Generic)
72
73 deriveJSON (unPrefix "query_") 'Query
74
75 instance Arbitrary Query where
76 arbitrary = elements [ Query q n la fs
77 | q <- ["honeybee* AND collapse"
78 ,"covid 19"
79 ]
80 , n <- [0..10]
81 , la <- allLangs
82 , fs <- take 3 $ repeat allDataOrigins
83 ]
84
85 instance ToSchema Query where
86 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
87 -}
88
89 ------------------------------------------------------------------------
90
91 {-
92 type Api = PostApi
93 :<|> GetApi
94
95 type PostApi = Summary "New Corpus endpoint"
96 :> ReqBody '[JSON] Query
97 :> Post '[JSON] CorpusId
98 type GetApi = Get '[JSON] ApiInfo
99 -}
100
101 -- | TODO manage several apis
102 -- TODO-ACCESS
103 -- TODO this is only the POST
104 {-
105 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
106 api uid (Query q _ as) = do
107 cId <- case head as of
108 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
109 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
110 Just a -> do
111 docs <- liftBase $ API.get a q (Just 1000)
112 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
113 pure cId'
114
115 pure cId
116 -}
117
118 ------------------------------------------------
119 -- TODO use this route for Client implementation
120 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
121 deriving (Generic)
122 instance Arbitrary ApiInfo where
123 arbitrary = ApiInfo <$> arbitrary
124
125 deriveJSON (unPrefix "") 'ApiInfo
126
127 instance ToSchema ApiInfo
128
129 info :: FlowCmdM env err m => UserId -> m ApiInfo
130 info _u = pure $ ApiInfo API.externalAPIs
131
132 ------------------------------------------------------------------------
133 ------------------------------------------------------------------------
134 data WithQuery = WithQuery
135 { _wq_query :: !Text
136 , _wq_databases :: !Database
137 , _wq_datafield :: !Datafield
138 , _wq_lang :: !Lang
139 , _wq_node_id :: !Int
140 -- , _wq_flowListWith :: !FlowSocialListWith
141 }
142 deriving Generic
143
144 makeLenses ''WithQuery
145 instance FromJSON WithQuery where
146 parseJSON = genericParseJSON $ jsonOptions "_wq_"
147 instance ToJSON WithQuery where
148 toJSON = genericToJSON $ jsonOptions "_wq_"
149 instance ToSchema WithQuery where
150 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
151
152 ------------------------------------------------------------------------
153
154 type AddWithQuery = Summary "Add with Query to corpus endpoint"
155 :> "corpus"
156 :> Capture "corpus_id" CorpusId
157 :> "query"
158 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
159
160 {-
161 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
162 :> "corpus"
163 :> Capture "corpus_id" CorpusId
164 :> "add"
165 :> "file"
166 :> MultipartForm Mem (MultipartData Mem)
167 :> QueryParam "fileType" FileType
168 :> "async"
169 :> AsyncJobs JobLog '[JSON] () JobLog
170 -}
171
172
173 ------------------------------------------------------------------------
174 -- TODO WithQuery also has a corpus id
175 addToCorpusWithQuery :: FlowCmdM env err m
176 => User
177 -> CorpusId
178 -> WithQuery
179 -> Maybe Integer
180 -> (JobLog -> m ())
181 -> m JobLog
182 addToCorpusWithQuery user cid (WithQuery q dbs datafield l _nid) maybeLimit logStatus = do
183 -- TODO ...
184 logStatus JobLog { _scst_succeeded = Just 0
185 , _scst_failed = Just 0
186 , _scst_remaining = Just 3
187 , _scst_events = Just []
188 }
189 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
190 printDebug "[addToCorpusWithQuery] datafield" datafield
191
192 case datafield of
193 Web -> do
194 printDebug "[addToCorpusWithQuery] processing web request" datafield
195
196 _ <- triggerSearxSearch cid q l
197
198 pure JobLog { _scst_succeeded = Just 3
199 , _scst_failed = Just 0
200 , _scst_remaining = Just 0
201 , _scst_events = Just []
202 }
203
204 _ -> do
205 -- TODO add cid
206 -- TODO if cid is folder -> create Corpus
207 -- if cid is corpus -> add to corpus
208 -- if cid is root -> create corpus in Private
209 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
210
211 logStatus JobLog { _scst_succeeded = Just 2
212 , _scst_failed = Just 0
213 , _scst_remaining = Just 1
214 , _scst_events = Just []
215 }
216
217 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing) txts
218 printDebug "corpus id" cids
219 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
220 sendMail user
221 -- TODO ...
222 pure JobLog { _scst_succeeded = Just 3
223 , _scst_failed = Just 0
224 , _scst_remaining = Just 0
225 , _scst_events = Just []
226 }
227
228
229 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
230 :> "corpus"
231 :> Capture "corpus_id" CorpusId
232 :> "add"
233 :> "form"
234 :> "async"
235 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
236
237 addToCorpusWithForm :: FlowCmdM env err m
238 => User
239 -> CorpusId
240 -> NewWithForm
241 -> (JobLog -> m ())
242 -> JobLog
243 -> m JobLog
244 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
245 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
246 printDebug "[addToCorpusWithForm] fileType" ft
247 logStatus jobLog
248 let
249 parse = case ft of
250 CSV_HAL -> Parser.parseFormat Parser.CsvHal
251 CSV -> Parser.parseFormat Parser.CsvGargV3
252 WOS -> Parser.parseFormat Parser.WOS
253 PresseRIS -> Parser.parseFormat Parser.RisPresse
254
255 -- TODO granularity of the logStatus
256 eDocs <- liftBase $ parse $ cs d
257 case eDocs of
258 Right docs' -> do
259 let docs = splitEvery 500 $ take 1000000 docs'
260
261 printDebug "Parsing corpus finished : " cid
262 logStatus jobLog2
263
264 printDebug "Starting extraction : " cid
265 -- TODO granularity of the logStatus
266 _cid' <- flowCorpus user
267 (Right [cid])
268 (Multi $ fromMaybe EN l)
269 Nothing
270 (map (map toHyperdataDocument) docs)
271
272 printDebug "Extraction finished : " cid
273 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
274 sendMail user
275
276 logStatus jobLog3
277 pure $ jobLog3
278 Left e -> do
279 printDebug "Error" e
280
281 logStatus jobLogE
282 pure jobLogE
283 where
284 jobLog2 = jobLogSuccess jobLog
285 jobLog3 = jobLogSuccess jobLog2
286 jobLogE = jobLogFailTotal jobLog
287
288 parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
289 parseCsvGargV3Path fp = do
290 contents <- readFile fp
291 Parser.parseFormat Parser.CsvGargV3 $ cs contents
292
293 {-
294 addToCorpusWithFile :: FlowCmdM env err m
295 => CorpusId
296 -> MultipartData Mem
297 -> Maybe FileType
298 -> (JobLog -> m ())
299 -> m JobLog
300 addToCorpusWithFile cid input filetype logStatus = do
301 logStatus JobLog { _scst_succeeded = Just 10
302 , _scst_failed = Just 2
303 , _scst_remaining = Just 138
304 , _scst_events = Just []
305 }
306 printDebug "addToCorpusWithFile" cid
307 _h <- postUpload cid filetype input
308
309 pure JobLog { _scst_succeeded = Just 137
310 , _scst_failed = Just 13
311 , _scst_remaining = Just 0
312 , _scst_events = Just []
313 }
314 -}
315
316
317
318 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
319 :> "corpus"
320 :> Capture "corpus_id" CorpusId
321 :> "add"
322 :> "file"
323 :> "async"
324 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
325
326 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
327 => User
328 -> CorpusId
329 -> NewWithFile
330 -> (JobLog -> m ())
331 -> m JobLog
332 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
333
334 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
335 logStatus JobLog { _scst_succeeded = Just 0
336 , _scst_failed = Just 0
337 , _scst_remaining = Just 1
338 , _scst_events = Just []
339 }
340
341 fPath <- GargDB.writeFile nwf
342 printDebug "[addToCorpusWithFile] File saved as: " fPath
343
344 uId <- getUserId user
345 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
346
347 _ <- case nIds of
348 [nId] -> do
349 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
350 let hl = node ^. node_hyperdata
351 _ <- updateHyperdata nId $ hl { _hff_name = fName
352 , _hff_path = T.pack fPath }
353
354 printDebug "[addToCorpusWithFile] Created node with id: " nId
355 _ -> pure ()
356
357 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
358
359 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
360 sendMail user
361
362 pure $ JobLog { _scst_succeeded = Just 1
363 , _scst_failed = Just 0
364 , _scst_remaining = Just 0
365 , _scst_events = Just []
366 }