]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[list selection] some initial work (doesn't compile)
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Control.Lens hiding (elements, Empty)
22 import Data.Aeson
23 import Data.Aeson.TH (deriveJSON)
24 import Data.Either
25 import Data.Maybe (fromMaybe)
26 import Data.Swagger
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import qualified Prelude as Prelude
31 import Protolude (readFile)
32 import Servant
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
37
38 import Gargantext.Prelude
39
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
43 import Gargantext.API.Node.Corpus.New.File
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import qualified Gargantext.Core.Text.Corpus.API as API
50 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
51 import Gargantext.Core.Types.Individu (User(..))
52 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
53 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
54 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
55 import Gargantext.Database.Action.Mail (sendMail)
56 import Gargantext.Database.Action.Node (mkNodeWithParent)
57 import Gargantext.Database.Action.User (getUserId)
58 import Gargantext.Database.Admin.Types.Hyperdata
59 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
60 import Gargantext.Database.Query.Table.Node (getNodeWith)
61 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
62 import Gargantext.Database.Schema.Node (node_hyperdata)
63 import qualified Gargantext.Database.GargDB as GargDB
64
65 ------------------------------------------------------------------------
66 {-
67 data Query = Query { query_query :: Text
68 , query_node_id :: Int
69 , query_lang :: Lang
70 , query_databases :: [DataOrigin]
71 }
72 deriving (Eq, Generic)
73
74 deriveJSON (unPrefix "query_") 'Query
75
76 instance Arbitrary Query where
77 arbitrary = elements [ Query q n la fs
78 | q <- ["honeybee* AND collapse"
79 ,"covid 19"
80 ]
81 , n <- [0..10]
82 , la <- allLangs
83 , fs <- take 3 $ repeat allDataOrigins
84 ]
85
86 instance ToSchema Query where
87 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
88 -}
89
90 ------------------------------------------------------------------------
91
92 {-
93 type Api = PostApi
94 :<|> GetApi
95
96 type PostApi = Summary "New Corpus endpoint"
97 :> ReqBody '[JSON] Query
98 :> Post '[JSON] CorpusId
99 type GetApi = Get '[JSON] ApiInfo
100 -}
101
102 -- | TODO manage several apis
103 -- TODO-ACCESS
104 -- TODO this is only the POST
105 {-
106 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
107 api uid (Query q _ as) = do
108 cId <- case head as of
109 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
110 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
111 Just a -> do
112 docs <- liftBase $ API.get a q (Just 1000)
113 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
114 pure cId'
115
116 pure cId
117 -}
118
119 ------------------------------------------------
120 -- TODO use this route for Client implementation
121 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
122 deriving (Generic)
123 instance Arbitrary ApiInfo where
124 arbitrary = ApiInfo <$> arbitrary
125
126 deriveJSON (unPrefix "") 'ApiInfo
127
128 instance ToSchema ApiInfo
129
130 info :: FlowCmdM env err m => UserId -> m ApiInfo
131 info _u = pure $ ApiInfo API.externalAPIs
132
133 ------------------------------------------------------------------------
134 ------------------------------------------------------------------------
135 data WithQuery = WithQuery
136 { _wq_query :: !Text
137 , _wq_databases :: !Database
138 , _wq_datafield :: !(Maybe Datafield)
139 , _wq_lang :: !Lang
140 , _wq_node_id :: !Int
141 , _wq_flowListWith :: !FlowSocialListWith
142 }
143 deriving Generic
144
145 makeLenses ''WithQuery
146 instance FromJSON WithQuery where
147 parseJSON = genericParseJSON $ jsonOptions "_wq_"
148 instance ToJSON WithQuery where
149 toJSON = genericToJSON $ jsonOptions "_wq_"
150 instance ToSchema WithQuery where
151 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
152
153 ------------------------------------------------------------------------
154
155 type AddWithQuery = Summary "Add with Query to corpus endpoint"
156 :> "corpus"
157 :> Capture "corpus_id" CorpusId
158 :> "query"
159 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
160
161 {-
162 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
163 :> "corpus"
164 :> Capture "corpus_id" CorpusId
165 :> "add"
166 :> "file"
167 :> MultipartForm Mem (MultipartData Mem)
168 :> QueryParam "fileType" FileType
169 :> "async"
170 :> AsyncJobs JobLog '[JSON] () JobLog
171 -}
172
173
174 ------------------------------------------------------------------------
175 -- TODO WithQuery also has a corpus id
176 addToCorpusWithQuery :: FlowCmdM env err m
177 => User
178 -> CorpusId
179 -> WithQuery
180 -> Maybe Integer
181 -> (JobLog -> m ())
182 -> m JobLog
183 addToCorpusWithQuery user cid (WithQuery q dbs datafield l _nid) maybeLimit logStatus = do
184 -- TODO ...
185 logStatus JobLog { _scst_succeeded = Just 0
186 , _scst_failed = Just 0
187 , _scst_remaining = Just 3
188 , _scst_events = Just []
189 }
190 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
191 printDebug "[addToCorpusWithQuery] datafield" datafield
192
193 case datafield of
194 Just Web -> do
195 printDebug "[addToCorpusWithQuery] processing web request" datafield
196
197 _ <- triggerSearxSearch cid q l
198
199 pure JobLog { _scst_succeeded = Just 3
200 , _scst_failed = Just 0
201 , _scst_remaining = Just 0
202 , _scst_events = Just []
203 }
204
205 _ -> do
206 -- TODO add cid
207 -- TODO if cid is folder -> create Corpus
208 -- if cid is corpus -> add to corpus
209 -- if cid is root -> create corpus in Private
210 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
211
212 logStatus JobLog { _scst_succeeded = Just 2
213 , _scst_failed = Just 0
214 , _scst_remaining = Just 1
215 , _scst_events = Just []
216 }
217
218 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing) txts
219 printDebug "corpus id" cids
220 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
221 sendMail user
222 -- TODO ...
223 pure JobLog { _scst_succeeded = Just 3
224 , _scst_failed = Just 0
225 , _scst_remaining = Just 0
226 , _scst_events = Just []
227 }
228
229
230 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
231 :> "corpus"
232 :> Capture "corpus_id" CorpusId
233 :> "add"
234 :> "form"
235 :> "async"
236 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
237
238 addToCorpusWithForm :: FlowCmdM env err m
239 => User
240 -> CorpusId
241 -> NewWithForm
242 -> (JobLog -> m ())
243 -> JobLog
244 -> m JobLog
245 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
246 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
247 printDebug "[addToCorpusWithForm] fileType" ft
248 logStatus jobLog
249 let
250 parse = case ft of
251 CSV_HAL -> Parser.parseFormat Parser.CsvHal
252 CSV -> Parser.parseFormat Parser.CsvGargV3
253 WOS -> Parser.parseFormat Parser.WOS
254 PresseRIS -> Parser.parseFormat Parser.RisPresse
255
256 -- TODO granularity of the logStatus
257 eDocs <- liftBase $ parse $ cs d
258 case eDocs of
259 Right docs' -> do
260 let docs = splitEvery 500 $ take 1000000 docs'
261
262 printDebug "Parsing corpus finished : " cid
263 logStatus jobLog2
264
265 printDebug "Starting extraction : " cid
266 -- TODO granularity of the logStatus
267 _cid' <- flowCorpus user
268 (Right [cid])
269 (Multi $ fromMaybe EN l)
270 Nothing
271 (map (map toHyperdataDocument) docs)
272
273 printDebug "Extraction finished : " cid
274 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
275 sendMail user
276
277 logStatus jobLog3
278 pure $ jobLog3
279 Left e -> do
280 printDebug "Error" e
281
282 logStatus jobLogE
283 pure jobLogE
284 where
285 jobLog2 = jobLogSuccess jobLog
286 jobLog3 = jobLogSuccess jobLog2
287 jobLogE = jobLogFailTotal jobLog
288
289 parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
290 parseCsvGargV3Path fp = do
291 contents <- readFile fp
292 Parser.parseFormat Parser.CsvGargV3 $ cs contents
293
294 {-
295 addToCorpusWithFile :: FlowCmdM env err m
296 => CorpusId
297 -> MultipartData Mem
298 -> Maybe FileType
299 -> (JobLog -> m ())
300 -> m JobLog
301 addToCorpusWithFile cid input filetype logStatus = do
302 logStatus JobLog { _scst_succeeded = Just 10
303 , _scst_failed = Just 2
304 , _scst_remaining = Just 138
305 , _scst_events = Just []
306 }
307 printDebug "addToCorpusWithFile" cid
308 _h <- postUpload cid filetype input
309
310 pure JobLog { _scst_succeeded = Just 137
311 , _scst_failed = Just 13
312 , _scst_remaining = Just 0
313 , _scst_events = Just []
314 }
315 -}
316
317
318
319 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
320 :> "corpus"
321 :> Capture "corpus_id" CorpusId
322 :> "add"
323 :> "file"
324 :> "async"
325 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
326
327 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
328 => User
329 -> CorpusId
330 -> NewWithFile
331 -> (JobLog -> m ())
332 -> m JobLog
333 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
334
335 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
336 logStatus JobLog { _scst_succeeded = Just 0
337 , _scst_failed = Just 0
338 , _scst_remaining = Just 1
339 , _scst_events = Just []
340 }
341
342 fPath <- GargDB.writeFile nwf
343 printDebug "[addToCorpusWithFile] File saved as: " fPath
344
345 uId <- getUserId user
346 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
347
348 _ <- case nIds of
349 [nId] -> do
350 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
351 let hl = node ^. node_hyperdata
352 _ <- updateHyperdata nId $ hl { _hff_name = fName
353 , _hff_path = T.pack fPath }
354
355 printDebug "[addToCorpusWithFile] Created node with id: " nId
356 _ -> pure ()
357
358 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
359
360 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
361 sendMail user
362
363 pure $ JobLog { _scst_succeeded = Just 1
364 , _scst_failed = Just 0
365 , _scst_remaining = Just 0
366 , _scst_events = Just []
367 }