]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Corpus/New.hs
[List] filename for content-disposition
[gargantext.git] / src / Gargantext / API / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23
24 module Gargantext.API.Corpus.New
25 where
26
27 --import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat(..))
28 import Control.Lens hiding (elements)
29 import Control.Monad.IO.Class (liftIO)
30 import Control.Concurrent
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Maybe (fromMaybe)
34 import Data.Either
35 import Data.Swagger
36 import Data.Text (Text)
37 import GHC.Generics (Generic)
38 import Gargantext.API.Corpus.New.File
39 import Gargantext.API.Orchestrator.Types
40 import Gargantext.Core (Lang(..))
41 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
42 import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
43 import Gargantext.Database.Flow (flowCorpusSearchInDatabase)
44 import Gargantext.Database.Types.Node (CorpusId)
45 import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
46 import Gargantext.Database.Types.Node (UserId)
47 import Gargantext.Prelude
48 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
49 import Gargantext.Text.Terms (TermType(..))
50 import Servant
51 import Servant.API.Flatten (Flat)
52 import Servant.Job.Core
53 import Servant.Job.Types
54 import Servant.Job.Utils (jsonOptions)
55 import Servant.Multipart
56 import Test.QuickCheck (elements)
57 import Test.QuickCheck.Arbitrary
58 import Web.FormUrlEncoded (FromForm)
59 import qualified Gargantext.Text.Corpus.API as API
60
61 data Query = Query { query_query :: Text
62 , query_corpus_id :: Int
63 , query_databases :: [API.ExternalAPIs]
64 }
65 deriving (Eq, Show, Generic)
66
67 deriveJSON (unPrefix "query_") 'Query
68
69
70 instance Arbitrary Query where
71 arbitrary = elements [ Query q n fs
72 | q <- ["a","b"]
73 , n <- [0..10]
74 , fs <- take 3 $ repeat API.externalAPIs
75 ]
76
77 instance ToSchema Query where
78 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
79
80 type Api = Summary "New Corpus endpoint"
81 :> ReqBody '[JSON] Query
82 :> Post '[JSON] CorpusId
83 :<|> Get '[JSON] ApiInfo
84
85 -- | TODO manage several apis
86 -- TODO-ACCESS
87 -- TODO this is only the POST
88 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
89 api _uId (Query q _ as) = do
90 cId <- case head as of
91 Nothing -> flowCorpusSearchInDatabase "user1" EN q
92 Just API.All -> flowCorpusSearchInDatabase "user1" EN q
93 Just a -> do
94 docs <- liftIO $ API.get a q (Just 1000)
95 cId' <- flowCorpus "user1" (Left q) (Multi EN) [docs]
96 pure cId'
97
98 pure cId
99
100 ------------------------------------------------
101 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
102 deriving (Generic)
103 instance Arbitrary ApiInfo where
104 arbitrary = ApiInfo <$> arbitrary
105
106 deriveJSON (unPrefix "") 'ApiInfo
107
108 instance ToSchema ApiInfo
109
110 info :: FlowCmdM env err m => UserId -> m ApiInfo
111 info _u = pure $ ApiInfo API.externalAPIs
112
113 ------------------------------------------------------------------------
114 ------------------------------------------------------------------------
115 data WithQuery = WithQuery
116 { _wq_query :: !Text
117 , _wq_databases :: ![ExternalAPIs]
118 , _wq_lang :: !(Maybe Lang)
119 }
120 deriving Generic
121
122 makeLenses ''WithQuery
123 instance FromJSON WithQuery where
124 parseJSON = genericParseJSON $ jsonOptions "_wq_"
125 instance ToSchema WithQuery where
126 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
127
128 -------------------------------------------------------
129 data WithForm = WithForm
130 { _wf_filetype :: !FileType
131 , _wf_data :: !Text
132 , _wf_lang :: !(Maybe Lang)
133 } deriving (Eq, Show, Generic)
134
135 makeLenses ''WithForm
136 instance FromForm WithForm
137 instance FromJSON WithForm where
138 parseJSON = genericParseJSON $ jsonOptions "_wf_"
139 instance ToSchema WithForm where
140 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
141
142 ------------------------------------------------------------------------
143 type AsyncJobs event ctI input output =
144 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
145 ------------------------------------------------------------------------
146
147 type Upload = Summary "Corpus Upload endpoint"
148 :> "corpus"
149 :> Capture "corpus_id" CorpusId
150 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
151 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
152
153
154 type AddWithQuery = Summary "Add with Query to corpus endpoint"
155 :> "corpus"
156 :> Capture "corpus_id" CorpusId
157 :> "add"
158 :> "query"
159 :> "async"
160 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
161
162 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
163 :> "corpus"
164 :> Capture "corpus_id" CorpusId
165 :> "add"
166 :> "file"
167 :> MultipartForm Mem (MultipartData Mem)
168 :> QueryParam "fileType" FileType
169 :> "async"
170 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
171
172 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
173 :> "corpus"
174 :> Capture "corpus_id" CorpusId
175 :> "add"
176 :> "form"
177 :> "async"
178 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
179
180 ------------------------------------------------------------------------
181 -- TODO WithQuery also has a corpus id
182 addToCorpusJobFunction :: FlowCmdM env err m
183 => CorpusId
184 -> WithQuery
185 -> (ScraperStatus -> m ())
186 -> m ScraperStatus
187 addToCorpusJobFunction _cid (WithQuery _q _dbs _l) logStatus = do
188 -- TODO ...
189 logStatus ScraperStatus { _scst_succeeded = Just 10
190 , _scst_failed = Just 2
191 , _scst_remaining = Just 138
192 , _scst_events = Just []
193 }
194 -- TODO ...
195 pure ScraperStatus { _scst_succeeded = Just 137
196 , _scst_failed = Just 13
197 , _scst_remaining = Just 0
198 , _scst_events = Just []
199 }
200
201
202 addToCorpusWithFile :: FlowCmdM env err m
203 => CorpusId
204 -> MultipartData Mem
205 -> Maybe FileType
206 -> (ScraperStatus -> m ())
207 -> m ScraperStatus
208 addToCorpusWithFile cid input filetype logStatus = do
209 logStatus ScraperStatus { _scst_succeeded = Just 10
210 , _scst_failed = Just 2
211 , _scst_remaining = Just 138
212 , _scst_events = Just []
213 }
214 _h <- postUpload cid filetype input
215
216 pure ScraperStatus { _scst_succeeded = Just 137
217 , _scst_failed = Just 13
218 , _scst_remaining = Just 0
219 , _scst_events = Just []
220 }
221
222 {- | Model to fork the flow
223 -- This is not really optimized since it increases the need RAM
224 -- and freezes the whole system
225 -- This is mainly for documentation (see a better solution in the function below)
226 -- Each process has to be tailored
227 addToCorpusWithForm' :: FlowCmdM env err m
228 => CorpusId
229 -> WithForm
230 -> (ScraperStatus -> m ())
231 -> m ScraperStatus
232 addToCorpusWithForm' cid (WithForm ft d l) logStatus = do
233 newStatus <- liftIO newEmptyMVar
234 s <- addToCorpusWithForm cid (WithForm ft d l) logStatus
235 _ <- liftIO $ forkIO $ putMVar newStatus s
236 s' <- liftIO $ takeMVar newStatus
237 pure s'
238 -}
239 addToCorpusWithForm :: FlowCmdM env err m
240 => CorpusId
241 -> WithForm
242 -> (ScraperStatus -> m ())
243 -> m ScraperStatus
244 addToCorpusWithForm cid (WithForm ft d l) logStatus = do
245
246 printDebug "ft" ft
247
248 let
249 parse = case ft of
250 CSV_HAL -> Parser.parseFormat Parser.CsvHal
251 CSV -> Parser.parseFormat Parser.CsvGargV3
252 WOS -> Parser.parseFormat Parser.WOS
253 PresseRIS -> Parser.parseFormat Parser.RisPresse
254
255 newDocs <- liftIO newEmptyMVar
256 docs <- liftIO
257 $ splitEvery 500
258 <$> take 1000000
259 <$> parse (cs d)
260 _ <- liftIO $ forkIO $ putMVar newDocs docs
261
262 logStatus ScraperStatus { _scst_succeeded = Just 1
263 , _scst_failed = Just 0
264 , _scst_remaining = Just 1
265 , _scst_events = Just []
266 }
267
268 docs' <- liftIO $ takeMVar newDocs
269 newCid <- liftIO newEmptyMVar
270 cid' <- flowCorpus "user1"
271 (Right [cid])
272 (Multi $ fromMaybe EN l)
273 (map (map toHyperdataDocument) docs')
274 _ <- liftIO $ forkIO $ putMVar newCid cid'
275
276 cid'' <- liftIO $ takeMVar newCid
277 printDebug "cid'" cid''
278
279 pure ScraperStatus { _scst_succeeded = Just 2
280 , _scst_failed = Just 0
281 , _scst_remaining = Just 0
282 , _scst_events = Just []
283 }
284