]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[searx] first draft of searx parsing, updated stack to lts 18.4
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Control.Lens hiding (elements, Empty)
22 import Data.Aeson
23 import Data.Aeson.TH (deriveJSON)
24 import Data.Either
25 import Data.Maybe (fromMaybe)
26 import Data.Swagger
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import Servant
31 import Servant.Job.Utils (jsonOptions)
32 -- import Servant.Multipart
33 -- import Test.QuickCheck (elements)
34 import Test.QuickCheck.Arbitrary
35
36 import Gargantext.Prelude
37
38 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
39 import Gargantext.API.Admin.Types (HasSettings)
40 import Gargantext.API.Node.Corpus.New.File
41 import Gargantext.API.Node.Corpus.Searx
42 import Gargantext.API.Node.Corpus.Types
43 import Gargantext.API.Node.Types
44 import Gargantext.Core (Lang(..){-, allLangs-})
45 import qualified Gargantext.Core.Text.Corpus.API as API
46 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
47 import Gargantext.Core.Types.Individu (User(..))
48 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
49 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
50 import Gargantext.Database.Action.Mail (sendMail)
51 import Gargantext.Database.Action.Node (mkNodeWithParent)
52 import Gargantext.Database.Action.User (getUserId)
53 import Gargantext.Database.Admin.Types.Hyperdata
54 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
55 import Gargantext.Database.Query.Table.Node (getNodeWith)
56 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
57 import Gargantext.Database.Schema.Node (node_hyperdata)
58 import qualified Gargantext.Database.GargDB as GargDB
59
60 ------------------------------------------------------------------------
61 {-
62 data Query = Query { query_query :: Text
63 , query_node_id :: Int
64 , query_lang :: Lang
65 , query_databases :: [DataOrigin]
66 }
67 deriving (Eq, Generic)
68
69 deriveJSON (unPrefix "query_") 'Query
70
71 instance Arbitrary Query where
72 arbitrary = elements [ Query q n la fs
73 | q <- ["honeybee* AND collapse"
74 ,"covid 19"
75 ]
76 , n <- [0..10]
77 , la <- allLangs
78 , fs <- take 3 $ repeat allDataOrigins
79 ]
80
81 instance ToSchema Query where
82 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
83 -}
84
85 ------------------------------------------------------------------------
86
87 {-
88 type Api = PostApi
89 :<|> GetApi
90
91 type PostApi = Summary "New Corpus endpoint"
92 :> ReqBody '[JSON] Query
93 :> Post '[JSON] CorpusId
94 type GetApi = Get '[JSON] ApiInfo
95 -}
96
97 -- | TODO manage several apis
98 -- TODO-ACCESS
99 -- TODO this is only the POST
100 {-
101 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
102 api uid (Query q _ as) = do
103 cId <- case head as of
104 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
105 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
106 Just a -> do
107 docs <- liftBase $ API.get a q (Just 1000)
108 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
109 pure cId'
110
111 pure cId
112 -}
113
114 ------------------------------------------------
115 -- TODO use this route for Client implementation
-- | Wrapper around the list of external APIs; 'info' fills it with
-- 'API.externalAPIs'.
data ApiInfo = ApiInfo
  { api_info :: [API.ExternalAPIs]
  }
  deriving (Generic)

-- | Generates an 'ApiInfo' from an arbitrary list of external APIs.
instance Arbitrary ApiInfo where
  arbitrary = do
    apis <- arbitrary
    pure (ApiInfo apis)

-- JSON instances are generated by Template Haskell; 'unPrefix' is called with
-- an empty prefix here.
deriveJSON (unPrefix "") 'ApiInfo

-- | No methods are overridden, so the class defaults are used.
instance ToSchema ApiInfo
124
-- | Wrap 'API.externalAPIs' in an 'ApiInfo'.
-- The 'UserId' argument is accepted but ignored.
info :: FlowCmdM env err m => UserId -> m ApiInfo
info _ = pure (ApiInfo API.externalAPIs)
127
128 ------------------------------------------------------------------------
129 ------------------------------------------------------------------------
-- | Request body of the "add with query" job (see 'addToCorpusWithQuery').
-- All fields are strict.
data WithQuery = WithQuery
  { _wq_query     :: !Text      -- ^ query text passed on to the search backend
  , _wq_databases :: !Database  -- ^ converted with 'database2origin' before fetching
  , _wq_datafield :: !Datafield -- ^ 'Web' selects the Searx path, anything else the database path
  , _wq_lang      :: !Lang      -- ^ language used for the search and for term extraction
  , _wq_node_id   :: !Int       -- ^ not read by 'addToCorpusWithQuery'
  }
  deriving (Generic)

-- Lenses must be generated right after the record and before anything that
-- could refer to them (Template Haskell declaration ordering).
makeLenses ''WithQuery

-- | Decoding uses 'jsonOptions' with the @_wq_@ field prefix.
instance FromJSON WithQuery where
  parseJSON = genericParseJSON (jsonOptions "_wq_")

-- | Encoding uses 'jsonOptions' with the @_wq_@ field prefix.
instance ToJSON WithQuery where
  toJSON = genericToJSON (jsonOptions "_wq_")

-- | Swagger schema built generically with the @_wq_@ prefix handled by
-- 'unPrefixSwagger'.
instance ToSchema WithQuery where
  declareNamedSchema = genericDeclareNamedSchema $ unPrefixSwagger "_wq_"
146
147 ------------------------------------------------------------------------
148
-- | Route @corpus/<corpus_id>/query@: asynchronous job taking a 'WithQuery'
-- as JSON input and reporting 'JobLog' both as events and as final result.
type AddWithQuery =
     Summary "Add with Query to corpus endpoint"
  :> "corpus"
  :> Capture "corpus_id" CorpusId
  :> "query"
  :> AsyncJobs JobLog '[JSON] WithQuery JobLog
154
155 {-
156 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
157 :> "corpus"
158 :> Capture "corpus_id" CorpusId
159 :> "add"
160 :> "file"
161 :> MultipartForm Mem (MultipartData Mem)
162 :> QueryParam "fileType" FileType
163 :> "async"
164 :> AsyncJobs JobLog '[JSON] () JobLog
165 -}
166
167
168 ------------------------------------------------------------------------
169 -- TODO WithQuery also has a corpus id
-- | Job body that runs a query and adds the results to a corpus.
--
-- For the 'Web' datafield the query is handed to 'triggerSearxSearch'.
-- For any other datafield the texts are fetched with 'getDataText', flowed
-- into the corpus with 'flowDataText', and 'sendMail' is called for the user.
--
-- Progress is reported through the @logStatus@ callback; the returned
-- 'JobLog' is the final status. The node id carried by the 'WithQuery' is
-- not used.
addToCorpusWithQuery :: FlowCmdM env err m
                     => User
                     -> CorpusId
                     -> WithQuery
                     -> Maybe Integer
                     -> (JobLog -> m ())
                     -> m JobLog
addToCorpusWithQuery user cid (WithQuery q dbs datafield l _nid) maybeLimit logStatus = do
  -- TODO ...
  logStatus (progress 0 3)
  printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
  printDebug "[addToCorpusWithQuery] datafield" datafield

  case datafield of
    Web -> do
      printDebug "[addToCorpusWithQuery] processing web request" datafield

      -- The result of the Searx search is discarded.
      _ <- triggerSearxSearch cid q l

      pure (progress 3 0)

    _ -> do
      -- TODO add cid
      -- TODO if cid is folder -> create Corpus
      --      if cid is corpus -> add to corpus
      --      if cid is root   -> create corpus in Private
      txt <- getDataText (database2origin dbs) (Multi l) q maybeLimit

      logStatus (progress 2 1)

      newCid <- flowDataText user txt (Multi l) cid
      -- Printed as a one-element list so the debug output keeps its shape.
      printDebug "corpus id" [newCid]
      printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
      sendMail user
      -- TODO ...
      pure (progress 3 0)
  where
    -- Status with the given succeeded / remaining counts, no failures and
    -- an empty event list.
    progress done left = JobLog { _scst_succeeded = Just done
                                , _scst_failed    = Just 0
                                , _scst_remaining = Just left
                                , _scst_events    = Just []
                                }
222
223
-- | Route @corpus/<corpus_id>/add/form/async@: asynchronous job taking a
-- form-url-encoded 'NewWithForm' and reporting 'JobLog'.
type AddWithForm =
     Summary "Add with FormUrlEncoded to corpus endpoint"
  :> "corpus"
  :> Capture "corpus_id" CorpusId
  :> "add"
  :> "form"
  :> "async"
  :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
231
-- | Job body that parses an uploaded document set and flows it into a corpus.
--
-- The parser format is selected from the form's file type. The parsed
-- documents are capped at 1000000 and split into chunks of 500 before being
-- passed to 'flowCorpus'. When the form carries no language, 'EN' is used.
-- 'sendMail' is called for the user once extraction has finished.
--
-- Progress is reported through @logStatus@; the returned 'JobLog' is the
-- final status.
addToCorpusWithForm :: FlowCmdM env err m
                    => User
                    -> CorpusId
                    -> NewWithForm
                    -> (JobLog -> m ())
                    -> m JobLog
addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
  printDebug "[addToCorpusWithForm] Parsing corpus: " cid
  printDebug "[addToCorpusWithForm] fileType" ft
  logStatus (progress 0 2)

  let parse = Parser.parseFormat $ case ft of
        CSV_HAL   -> Parser.CsvHal
        CSV       -> Parser.CsvGargV3
        WOS       -> Parser.WOS
        PresseRIS -> Parser.RisPresse

  -- TODO granularity of the logStatus
  parsed <- liftBase $ parse (cs d)
  let docs = splitEvery 500 (take 1000000 parsed)

  printDebug "Parsing corpus finished : " cid
  logStatus (progress 1 1)

  printDebug "Starting extraction : " cid
  -- TODO granularity of the logStatus
  _ <- flowCorpus user
                  (Right [cid])
                  (Multi (fromMaybe EN l))
                  (map (map toHyperdataDocument) docs)

  printDebug "Extraction finished : " cid
  printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
  sendMail user

  pure (progress 2 0)
  where
    -- Status with the given succeeded / remaining counts, no failures and
    -- an empty event list.
    progress done left = JobLog { _scst_succeeded = Just done
                                , _scst_failed    = Just 0
                                , _scst_remaining = Just left
                                , _scst_events    = Just []
                                }
283
284 {-
285 addToCorpusWithFile :: FlowCmdM env err m
286 => CorpusId
287 -> MultipartData Mem
288 -> Maybe FileType
289 -> (JobLog -> m ())
290 -> m JobLog
291 addToCorpusWithFile cid input filetype logStatus = do
292 logStatus JobLog { _scst_succeeded = Just 10
293 , _scst_failed = Just 2
294 , _scst_remaining = Just 138
295 , _scst_events = Just []
296 }
297 printDebug "addToCorpusWithFile" cid
298 _h <- postUpload cid filetype input
299
300 pure JobLog { _scst_succeeded = Just 137
301 , _scst_failed = Just 13
302 , _scst_remaining = Just 0
303 , _scst_events = Just []
304 }
305 -}
306
307
308
-- | Route @corpus/<corpus_id>/add/file/async@: asynchronous job taking a
-- form-url-encoded 'NewWithFile' and reporting 'JobLog'.
type AddWithFile =
     Summary "Add with FileUrlEncoded to corpus endpoint"
  :> "corpus"
  :> Capture "corpus_id" CorpusId
  :> "add"
  :> "file"
  :> "async"
  :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
316
-- | Job body that stores an uploaded file and attaches it to a corpus as a
-- 'NodeFile' node.
--
-- Steps:
--
-- 1. the file is written with 'GargDB.writeFile';
-- 2. a 'NodeFile' child of the corpus is created with 'mkNodeWithParent';
-- 3. when exactly one node id comes back, its 'HyperdataFile' is updated
--    with the file name and the saved path;
-- 4. 'sendMail' is called for the user.
--
-- Progress is reported through @logStatus@. The returned 'JobLog' reports
-- one success when the node was created and updated, and one failure when
-- 'mkNodeWithParent' did not return exactly one id (previously that case was
-- silently ignored and still reported as a success).
addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
                    => User
                    -> CorpusId
                    -> NewWithFile
                    -> (JobLog -> m ())
                    -> m JobLog
addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do

  printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
  logStatus JobLog { _scst_succeeded = Just 0
                   , _scst_failed    = Just 0
                   , _scst_remaining = Just 1
                   , _scst_events    = Just []
                   }

  fPath <- GargDB.writeFile nwf
  printDebug "[addToCorpusWithFile] File saved as: " fPath

  uId  <- getUserId user
  nIds <- mkNodeWithParent NodeFile (Just cid) uId fName

  -- True only when the file node exists and points at the saved file.
  attached <- case nIds of
    [nId] -> do
      node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
      let hl = node ^. node_hyperdata
      _ <- updateHyperdata nId $ hl { _hff_name = fName
                                    , _hff_path = T.pack fPath }

      printDebug "[addToCorpusWithFile] Created node with id: " nId
      pure True
    _ -> do
      -- The file is on disk but no node references it: make that visible
      -- instead of pretending the upload succeeded.
      printDebug "[addToCorpusWithFile] Unexpected node ids, file not attached: " nIds
      pure False

  printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid

  printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
  sendMail user

  let (succeeded, failed) = if attached then (1, 0) else (0, 1)

  pure $ JobLog { _scst_succeeded = Just succeeded
                , _scst_failed    = Just failed
                , _scst_remaining = Just 0
                , _scst_events    = Just []
                }