]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[framewrite] better line parsing
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Control.Lens hiding (elements, Empty)
22 import Data.Aeson
23 import Data.Aeson.TH (deriveJSON)
24 import Data.Either
25 import Data.Maybe (fromMaybe)
26 import Data.Swagger
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import qualified Prelude as Prelude
31 import Protolude (readFile)
32 import Servant
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
37
38 import Gargantext.Prelude
39
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
43 import Gargantext.API.Node.Corpus.New.File
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import qualified Gargantext.Core.Text.Corpus.API as API
50 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
51 import Gargantext.Core.Types.Individu (User(..))
52 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
53 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
54 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
55 import Gargantext.Database.Action.Mail (sendMail)
56 import Gargantext.Database.Action.Node (mkNodeWithParent)
57 import Gargantext.Database.Action.User (getUserId)
58 import Gargantext.Database.Admin.Types.Hyperdata
59 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
60 import Gargantext.Database.Query.Table.Node (getNodeWith)
61 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
62 import Gargantext.Database.Schema.Node (node_hyperdata)
63 import qualified Gargantext.Database.GargDB as GargDB
64
65 ------------------------------------------------------------------------
66 {-
67 data Query = Query { query_query :: Text
68 , query_node_id :: Int
69 , query_lang :: Lang
70 , query_databases :: [DataOrigin]
71 }
72 deriving (Eq, Generic)
73
74 deriveJSON (unPrefix "query_") 'Query
75
76 instance Arbitrary Query where
77 arbitrary = elements [ Query q n la fs
78 | q <- ["honeybee* AND collapse"
79 ,"covid 19"
80 ]
81 , n <- [0..10]
82 , la <- allLangs
83 , fs <- take 3 $ repeat allDataOrigins
84 ]
85
86 instance ToSchema Query where
87 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
88 -}
89
90 ------------------------------------------------------------------------
91
92 {-
93 type Api = PostApi
94 :<|> GetApi
95
96 type PostApi = Summary "New Corpus endpoint"
97 :> ReqBody '[JSON] Query
98 :> Post '[JSON] CorpusId
99 type GetApi = Get '[JSON] ApiInfo
100 -}
101
102 -- | TODO manage several apis
103 -- TODO-ACCESS
104 -- TODO this is only the POST
105 {-
106 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
107 api uid (Query q _ as) = do
108 cId <- case head as of
109 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
110 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
111 Just a -> do
112 docs <- liftBase $ API.get a q (Just 1000)
113 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
114 pure cId'
115
116 pure cId
117 -}
118
119 ------------------------------------------------
120 -- TODO use this route for Client implementation
121 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
122 deriving (Generic)
123 instance Arbitrary ApiInfo where
124 arbitrary = ApiInfo <$> arbitrary
125
126 deriveJSON (unPrefix "") 'ApiInfo
127
128 instance ToSchema ApiInfo
129
130 info :: FlowCmdM env err m => UserId -> m ApiInfo
131 info _u = pure $ ApiInfo API.externalAPIs
132
133 ------------------------------------------------------------------------
134 ------------------------------------------------------------------------
135 data WithQuery = WithQuery
136 { _wq_query :: !Text
137 , _wq_databases :: !Database
138 , _wq_datafield :: !(Maybe Datafield)
139 , _wq_lang :: !Lang
140 , _wq_node_id :: !Int
141 , _wq_flowListWith :: !FlowSocialListWith
142 }
143 deriving Generic
144
145 makeLenses ''WithQuery
146 instance FromJSON WithQuery where
147 parseJSON = genericParseJSON $ jsonOptions "_wq_"
148 instance ToJSON WithQuery where
149 toJSON = genericToJSON $ jsonOptions "_wq_"
150 instance ToSchema WithQuery where
151 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
152
153 ------------------------------------------------------------------------
154
155 type AddWithQuery = Summary "Add with Query to corpus endpoint"
156 :> "corpus"
157 :> Capture "corpus_id" CorpusId
158 :> "query"
159 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
160
161 {-
162 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
163 :> "corpus"
164 :> Capture "corpus_id" CorpusId
165 :> "add"
166 :> "file"
167 :> MultipartForm Mem (MultipartData Mem)
168 :> QueryParam "fileType" FileType
169 :> "async"
170 :> AsyncJobs JobLog '[JSON] () JobLog
171 -}
172
173
174 ------------------------------------------------------------------------
175 -- TODO WithQuery also has a corpus id
176 addToCorpusWithQuery :: FlowCmdM env err m
177 => User
178 -> CorpusId
179 -> WithQuery
180 -> Maybe Integer
181 -> (JobLog -> m ())
182 -> m JobLog
183 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
184 , _wq_databases = dbs
185 , _wq_datafield = datafield
186 , _wq_lang = l
187 , _wq_flowListWith = flw }) maybeLimit logStatus = do
188 -- TODO ...
189 logStatus JobLog { _scst_succeeded = Just 0
190 , _scst_failed = Just 0
191 , _scst_remaining = Just 3
192 , _scst_events = Just []
193 }
194 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
195 printDebug "[addToCorpusWithQuery] datafield" datafield
196 printDebug "[addToCorpusWithQuery] flowListWith" flw
197
198 case datafield of
199 Just Web -> do
200 printDebug "[addToCorpusWithQuery] processing web request" datafield
201
202 _ <- triggerSearxSearch cid q l
203
204 pure JobLog { _scst_succeeded = Just 3
205 , _scst_failed = Just 0
206 , _scst_remaining = Just 0
207 , _scst_events = Just []
208 }
209
210 _ -> do
211 -- TODO add cid
212 -- TODO if cid is folder -> create Corpus
213 -- if cid is corpus -> add to corpus
214 -- if cid is root -> create corpus in Private
215 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
216
217 logStatus JobLog { _scst_succeeded = Just 2
218 , _scst_failed = Just 0
219 , _scst_remaining = Just 1
220 , _scst_events = Just []
221 }
222
223 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing) txts
224 printDebug "corpus id" cids
225 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
226 sendMail user
227 -- TODO ...
228 pure JobLog { _scst_succeeded = Just 3
229 , _scst_failed = Just 0
230 , _scst_remaining = Just 0
231 , _scst_events = Just []
232 }
233
234
235 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
236 :> "corpus"
237 :> Capture "corpus_id" CorpusId
238 :> "add"
239 :> "form"
240 :> "async"
241 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
242
243 addToCorpusWithForm :: FlowCmdM env err m
244 => User
245 -> CorpusId
246 -> NewWithForm
247 -> (JobLog -> m ())
248 -> JobLog
249 -> m JobLog
250 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
251 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
252 printDebug "[addToCorpusWithForm] fileType" ft
253 logStatus jobLog
254 let
255 parse = case ft of
256 CSV_HAL -> Parser.parseFormat Parser.CsvHal
257 CSV -> Parser.parseFormat Parser.CsvGargV3
258 WOS -> Parser.parseFormat Parser.WOS
259 PresseRIS -> Parser.parseFormat Parser.RisPresse
260
261 -- TODO granularity of the logStatus
262 eDocs <- liftBase $ parse $ cs d
263 case eDocs of
264 Right docs' -> do
265 let docs = splitEvery 500 $ take 1000000 docs'
266
267 printDebug "Parsing corpus finished : " cid
268 logStatus jobLog2
269
270 printDebug "Starting extraction : " cid
271 -- TODO granularity of the logStatus
272 _cid' <- flowCorpus user
273 (Right [cid])
274 (Multi $ fromMaybe EN l)
275 Nothing
276 (map (map toHyperdataDocument) docs)
277
278 printDebug "Extraction finished : " cid
279 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
280 sendMail user
281
282 logStatus jobLog3
283 pure $ jobLog3
284 Left e -> do
285 printDebug "Error" e
286
287 logStatus jobLogE
288 pure jobLogE
289 where
290 jobLog2 = jobLogSuccess jobLog
291 jobLog3 = jobLogSuccess jobLog2
292 jobLogE = jobLogFailTotal jobLog
293
294 parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
295 parseCsvGargV3Path fp = do
296 contents <- readFile fp
297 Parser.parseFormat Parser.CsvGargV3 $ cs contents
298
299 {-
300 addToCorpusWithFile :: FlowCmdM env err m
301 => CorpusId
302 -> MultipartData Mem
303 -> Maybe FileType
304 -> (JobLog -> m ())
305 -> m JobLog
306 addToCorpusWithFile cid input filetype logStatus = do
307 logStatus JobLog { _scst_succeeded = Just 10
308 , _scst_failed = Just 2
309 , _scst_remaining = Just 138
310 , _scst_events = Just []
311 }
312 printDebug "addToCorpusWithFile" cid
313 _h <- postUpload cid filetype input
314
315 pure JobLog { _scst_succeeded = Just 137
316 , _scst_failed = Just 13
317 , _scst_remaining = Just 0
318 , _scst_events = Just []
319 }
320 -}
321
322
323
324 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
325 :> "corpus"
326 :> Capture "corpus_id" CorpusId
327 :> "add"
328 :> "file"
329 :> "async"
330 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
331
332 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
333 => User
334 -> CorpusId
335 -> NewWithFile
336 -> (JobLog -> m ())
337 -> m JobLog
338 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
339
340 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
341 logStatus JobLog { _scst_succeeded = Just 0
342 , _scst_failed = Just 0
343 , _scst_remaining = Just 1
344 , _scst_events = Just []
345 }
346
347 fPath <- GargDB.writeFile nwf
348 printDebug "[addToCorpusWithFile] File saved as: " fPath
349
350 uId <- getUserId user
351 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
352
353 _ <- case nIds of
354 [nId] -> do
355 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
356 let hl = node ^. node_hyperdata
357 _ <- updateHyperdata nId $ hl { _hff_name = fName
358 , _hff_path = T.pack fPath }
359
360 printDebug "[addToCorpusWithFile] Created node with id: " nId
361 _ -> pure ()
362
363 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
364
365 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
366 sendMail user
367
368 pure $ JobLog { _scst_succeeded = Just 1
369 , _scst_failed = Just 0
370 , _scst_remaining = Just 0
371 , _scst_events = Just []
372 }