]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[git] add directories to .gitignore, add .envrc for direnv
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Control.Lens hiding (elements, Empty)
22 import Data.Aeson
23 import Data.Aeson.TH (deriveJSON)
24 import Data.Either
25 import Data.Maybe (fromMaybe)
26 import Data.Swagger
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import qualified Prelude as Prelude
31 import Protolude (readFile)
32 import Servant
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
37
38 import Gargantext.Prelude
39
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
43 import Gargantext.API.Node.Corpus.New.File
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import qualified Gargantext.Core.Text.Corpus.API as API
50 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
51 import Gargantext.Core.Types.Individu (User(..))
52 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
53 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
54 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
55 import Gargantext.Database.Action.Mail (sendMail)
56 import Gargantext.Database.Action.Node (mkNodeWithParent)
57 import Gargantext.Database.Action.User (getUserId)
58 import Gargantext.Database.Admin.Types.Hyperdata
59 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
60 import Gargantext.Database.Query.Table.Node (getNodeWith)
61 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
62 import Gargantext.Database.Schema.Node (node_hyperdata)
63 import qualified Gargantext.Database.GargDB as GargDB
64
65 ------------------------------------------------------------------------
66 {-
67 data Query = Query { query_query :: Text
68 , query_node_id :: Int
69 , query_lang :: Lang
70 , query_databases :: [DataOrigin]
71 }
72 deriving (Eq, Generic)
73
74 deriveJSON (unPrefix "query_") 'Query
75
76 instance Arbitrary Query where
77 arbitrary = elements [ Query q n la fs
78 | q <- ["honeybee* AND collapse"
79 ,"covid 19"
80 ]
81 , n <- [0..10]
82 , la <- allLangs
83 , fs <- take 3 $ repeat allDataOrigins
84 ]
85
86 instance ToSchema Query where
87 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
88 -}
89
90 ------------------------------------------------------------------------
91
92 {-
93 type Api = PostApi
94 :<|> GetApi
95
96 type PostApi = Summary "New Corpus endpoint"
97 :> ReqBody '[JSON] Query
98 :> Post '[JSON] CorpusId
99 type GetApi = Get '[JSON] ApiInfo
100 -}
101
102 -- | TODO manage several apis
103 -- TODO-ACCESS
104 -- TODO this is only the POST
105 {-
106 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
107 api uid (Query q _ as) = do
108 cId <- case head as of
109 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
110 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
111 Just a -> do
112 docs <- liftBase $ API.get a q (Just 1000)
113 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
114 pure cId'
115
116 pure cId
117 -}
118
119 ------------------------------------------------
120 -- TODO use this route for Client implementation
121 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
122 deriving (Generic)
123 instance Arbitrary ApiInfo where
124 arbitrary = ApiInfo <$> arbitrary
125
126 deriveJSON (unPrefix "") 'ApiInfo
127
128 instance ToSchema ApiInfo
129
130 info :: FlowCmdM env err m => UserId -> m ApiInfo
131 info _u = pure $ ApiInfo API.externalAPIs
132
133 ------------------------------------------------------------------------
134 ------------------------------------------------------------------------
135 data WithQuery = WithQuery
136 { _wq_query :: !Text
137 , _wq_databases :: !Database
138 , _wq_datafield :: !(Maybe Datafield)
139 , _wq_lang :: !Lang
140 , _wq_node_id :: !Int
141 , _wq_flowListWith :: !FlowSocialListWith
142 }
143 deriving Generic
144
145 makeLenses ''WithQuery
146 instance FromJSON WithQuery where
147 parseJSON = genericParseJSON $ jsonOptions "_wq_"
148 instance ToJSON WithQuery where
149 toJSON = genericToJSON $ jsonOptions "_wq_"
150 instance ToSchema WithQuery where
151 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
152
153 ------------------------------------------------------------------------
154
155 type AddWithQuery = Summary "Add with Query to corpus endpoint"
156 :> "corpus"
157 :> Capture "corpus_id" CorpusId
158 :> "query"
159 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
160
161 {-
162 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
163 :> "corpus"
164 :> Capture "corpus_id" CorpusId
165 :> "add"
166 :> "file"
167 :> MultipartForm Mem (MultipartData Mem)
168 :> QueryParam "fileType" FileType
169 :> "async"
170 :> AsyncJobs JobLog '[JSON] () JobLog
171 -}
172
173
174 ------------------------------------------------------------------------
175 -- TODO WithQuery also has a corpus id
176 addToCorpusWithQuery :: FlowCmdM env err m
177 => User
178 -> CorpusId
179 -> WithQuery
180 -> Maybe Integer
181 -> (JobLog -> m ())
182 -> m JobLog
183 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
184 , _wq_databases = dbs
185 , _wq_datafield = datafield
186 , _wq_lang = l
187 , _wq_flowListWith = flw }) maybeLimit logStatus = do
188 -- TODO ...
189 logStatus JobLog { _scst_succeeded = Just 0
190 , _scst_failed = Just 0
191 , _scst_remaining = Just 3
192 , _scst_events = Just []
193 }
194 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
195 printDebug "[addToCorpusWithQuery] datafield" datafield
196 printDebug "[addToCorpusWithQuery] flowListWith" flw
197
198 case datafield of
199 Just Web -> do
200 printDebug "[addToCorpusWithQuery] processing web request" datafield
201
202 _ <- triggerSearxSearch cid q l
203
204 pure JobLog { _scst_succeeded = Just 3
205 , _scst_failed = Just 0
206 , _scst_remaining = Just 0
207 , _scst_events = Just []
208 }
209
210 _ -> do
211 -- TODO add cid
212 -- TODO if cid is folder -> create Corpus
213 -- if cid is corpus -> add to corpus
214 -- if cid is root -> create corpus in Private
215 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
216
217 logStatus JobLog { _scst_succeeded = Just 2
218 , _scst_failed = Just 0
219 , _scst_remaining = Just 1
220 , _scst_events = Just []
221 }
222
223 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing) txts
224 printDebug "corpus id" cids
225 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
226 sendMail user
227 -- TODO ...
228 pure JobLog { _scst_succeeded = Just 3
229 , _scst_failed = Just 0
230 , _scst_remaining = Just 0
231 , _scst_events = Just []
232 }
233
234
235 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
236 :> "corpus"
237 :> Capture "corpus_id" CorpusId
238 :> "add"
239 :> "form"
240 :> "async"
241 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
242
243 addToCorpusWithForm :: FlowCmdM env err m
244 => User
245 -> CorpusId
246 -> NewWithForm
247 -> (JobLog -> m ())
248 -> JobLog
249 -> m JobLog
250 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
251 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
252 printDebug "[addToCorpusWithForm] fileType" ft
253 logStatus jobLog
254 let
255 parse = case ft of
256 CSV_HAL -> Parser.parseFormat Parser.CsvHal
257 CSV -> Parser.parseFormat Parser.CsvGargV3
258 WOS -> Parser.parseFormat Parser.WOS
259 PresseRIS -> Parser.parseFormat Parser.RisPresse
260 ZIP -> Parser.parseFormat Parser.ZIP
261
262 -- TODO granularity of the logStatus
263 eDocs <- liftBase $ parse $ cs d
264 case eDocs of
265 Right docs' -> do
266 let docs = splitEvery 500 $ take 1000000 docs'
267
268 printDebug "Parsing corpus finished : " cid
269 logStatus jobLog2
270
271 printDebug "Starting extraction : " cid
272 -- TODO granularity of the logStatus
273 _cid' <- flowCorpus user
274 (Right [cid])
275 (Multi $ fromMaybe EN l)
276 Nothing
277 (map (map toHyperdataDocument) docs)
278
279 printDebug "Extraction finished : " cid
280 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
281 sendMail user
282
283 logStatus jobLog3
284 pure $ jobLog3
285 Left e -> do
286 printDebug "Error" e
287
288 logStatus jobLogE
289 pure jobLogE
290 where
291 jobLog2 = jobLogSuccess jobLog
292 jobLog3 = jobLogSuccess jobLog2
293 jobLogE = jobLogFailTotal jobLog
294
295 parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
296 parseCsvGargV3Path fp = do
297 contents <- readFile fp
298 Parser.parseFormat Parser.CsvGargV3 $ cs contents
299
300 {-
301 addToCorpusWithFile :: FlowCmdM env err m
302 => CorpusId
303 -> MultipartData Mem
304 -> Maybe FileType
305 -> (JobLog -> m ())
306 -> m JobLog
307 addToCorpusWithFile cid input filetype logStatus = do
308 logStatus JobLog { _scst_succeeded = Just 10
309 , _scst_failed = Just 2
310 , _scst_remaining = Just 138
311 , _scst_events = Just []
312 }
313 printDebug "addToCorpusWithFile" cid
314 _h <- postUpload cid filetype input
315
316 pure JobLog { _scst_succeeded = Just 137
317 , _scst_failed = Just 13
318 , _scst_remaining = Just 0
319 , _scst_events = Just []
320 }
321 -}
322
323
324
325 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
326 :> "corpus"
327 :> Capture "corpus_id" CorpusId
328 :> "add"
329 :> "file"
330 :> "async"
331 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
332
333 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
334 => User
335 -> CorpusId
336 -> NewWithFile
337 -> (JobLog -> m ())
338 -> m JobLog
339 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
340
341 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
342 logStatus JobLog { _scst_succeeded = Just 0
343 , _scst_failed = Just 0
344 , _scst_remaining = Just 1
345 , _scst_events = Just []
346 }
347
348 fPath <- GargDB.writeFile nwf
349 printDebug "[addToCorpusWithFile] File saved as: " fPath
350
351 uId <- getUserId user
352 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
353
354 _ <- case nIds of
355 [nId] -> do
356 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
357 let hl = node ^. node_hyperdata
358 _ <- updateHyperdata nId $ hl { _hff_name = fName
359 , _hff_path = T.pack fPath }
360
361 printDebug "[addToCorpusWithFile] Created node with id: " nId
362 _ -> pure ()
363
364 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
365
366 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
367 sendMail user
368
369 pure $ JobLog { _scst_succeeded = Just 1
370 , _scst_failed = Just 0
371 , _scst_remaining = Just 0
372 , _scst_events = Just []
373 }
374