]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Corpus/New.hs
[API][FLOW] Upload file for CsvHalFormat (WIP)
[gargantext.git] / src / Gargantext / API / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23
24 module Gargantext.API.Corpus.New
25 where
26
27 --import Debug.Trace (trace)
28 import Web.FormUrlEncoded (FromForm)
29 import Data.Either
30 import Control.Monad.IO.Class (liftIO)
31 import Data.Aeson.TH (deriveJSON)
32 import Data.Aeson
33 import Servant.Job.Utils (jsonOptions)
34 import Control.Lens hiding (elements)
35 import Servant.Multipart
36 import Data.Swagger
37 import Data.Text (Text)
38 import GHC.Generics (Generic)
39 import Servant.Job.Types
40 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
41 import Gargantext.Database.Flow (flowCorpusSearchInDatabase)
42 import Gargantext.Database.Types.Node (CorpusId)
43 import Gargantext.Text.Terms (TermType(..))
44 import Gargantext.Prelude
45 import Gargantext.API.Orchestrator.Types
46 import Servant
47 -- import Servant.Job.Server
48 import Test.QuickCheck (elements)
49 import Test.QuickCheck.Arbitrary
50 import Gargantext.Core (Lang(..))
51 import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
52 import qualified Gargantext.Text.Corpus.API as API
53 --import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat(..))
54 import Gargantext.Text.Corpus.Parsers.CSV (parseHal')
55 import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
56 import Gargantext.Database.Types.Node (UserId)
57 import Gargantext.API.Corpus.New.File
58
59 data Query = Query { query_query :: Text
60 , query_corpus_id :: Int
61 , query_databases :: [API.ExternalAPIs]
62 }
63 deriving (Eq, Show, Generic)
64
65 deriveJSON (unPrefix "query_") 'Query
66
67
68 instance Arbitrary Query where
69 arbitrary = elements [ Query q n fs
70 | q <- ["a","b"]
71 , n <- [0..10]
72 , fs <- take 3 $ repeat API.externalAPIs
73 ]
74
75 instance ToSchema Query where
76 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
77
78 type Api = Summary "New Corpus endpoint"
79 :> ReqBody '[JSON] Query
80 :> Post '[JSON] CorpusId
81 :<|> Get '[JSON] ApiInfo
82
83 -- | TODO manage several apis
84 -- TODO-ACCESS
85 -- TODO this is only the POST
86 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
87 api _uId (Query q _ as) = do
88 cId <- case head as of
89 Nothing -> flowCorpusSearchInDatabase "user1" EN q
90 Just API.All -> flowCorpusSearchInDatabase "user1" EN q
91 Just a -> do
92 docs <- liftIO $ API.get a q (Just 1000)
93 cId' <- flowCorpus "user1" (Left q) (Multi EN) [docs]
94 pure cId'
95
96 pure cId
97
98 ------------------------------------------------
99 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
100 deriving (Generic)
101 instance Arbitrary ApiInfo where
102 arbitrary = ApiInfo <$> arbitrary
103
104 deriveJSON (unPrefix "") 'ApiInfo
105
106 instance ToSchema ApiInfo
107
108 info :: FlowCmdM env err m => UserId -> m ApiInfo
109 info _u = pure $ ApiInfo API.externalAPIs
110
111 {-
112 -- Proposal to replace the Query type which seems to generically named.
113 data ScraperInput = ScraperInput
114 { _scin_query :: !Text
115 , _scin_corpus_id :: !Int
116 , _scin_databases :: [API.ExternalAPIs]
117 }
118 deriving (Eq, Show, Generic)
119
120 makeLenses ''ScraperInput
121
122 deriveJSON (unPrefix "_scin_") 'ScraperInput
123
124 data ScraperEvent = ScraperEvent
125 { _scev_message :: !(Maybe Text)
126 , _scev_level :: !(Maybe Text)
127 , _scev_date :: !(Maybe Text)
128 }
129 deriving Generic
130
131 deriveJSON (unPrefix "_scev_") 'ScraperEvent
132
133 data ScraperStatus = ScraperStatus
134 { _scst_succeeded :: !(Maybe Int)
135 , _scst_failed :: !(Maybe Int)
136 , _scst_remaining :: !(Maybe Int)
137 , _scst_events :: !(Maybe [ScraperEvent])
138 }
139 deriving Generic
140
141 deriveJSON (unPrefix "_scst_") 'ScraperStatus
142 -}
143
144
145
146 ------------------------------------------------------------------------
147 ------------------------------------------------------------------------
148 data WithQuery = WithQuery
149 { _wq_query :: !Text
150 , _wq_databases :: ![ExternalAPIs]
151 }
152 deriving Generic
153
154 makeLenses ''WithQuery
155
156 instance FromJSON WithQuery where
157 parseJSON = genericParseJSON $ jsonOptions "_wq_"
158
159 instance ToSchema WithQuery
160 -------------------------------------------------------
161 data WithForm = WithForm
162 { _wf_filetype :: !FileType
163 , _wf_data :: !Text
164 } deriving (Eq, Show, Generic)
165
166 makeLenses ''WithForm
167
168 instance FromJSON WithForm where
169 parseJSON = genericParseJSON $ jsonOptions "_wf_"
170 instance ToSchema WithForm
171 instance FromForm WithForm
172
173
174 ------------------------------------------------------------------------
175 type
176 AddAPI withInput = AsyncJobsAPI ScraperStatus withInput ScraperStatus
177 ------------------------------------------------------------------------
178
179 type AddWithQuery = Summary "Add with Query to corpus endpoint"
180 :> "corpus"
181 :> Capture "corpus_id" CorpusId
182 :> "add"
183 :> "query"
184 :> "async"
185 :> AddAPI WithQuery
186
187 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
188 :> "corpus"
189 :> Capture "corpus_id" CorpusId
190 :> "add"
191 :> "file"
192 :> MultipartForm Mem (MultipartData Mem)
193 :> QueryParam "fileType" FileType
194 :> "async"
195 :> AddAPI ()
196
197 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
198 :> "corpus"
199 :> Capture "corpus_id" CorpusId
200 :> "add"
201 :> "form"
202 :> ReqBody '[FormUrlEncoded] WithForm
203 :> "async"
204 :> AddAPI ()
205
206 ------------------------------------------------------------------------
207 -- TODO WithQuery also has a corpus id
208 addToCorpusJobFunction :: FlowCmdM env err m
209 => CorpusId
210 -> WithQuery
211 -> (ScraperStatus -> m ())
212 -> m ScraperStatus
213 addToCorpusJobFunction _cid _input logStatus = do
214 -- TODO ...
215 logStatus ScraperStatus { _scst_succeeded = Just 10
216 , _scst_failed = Just 2
217 , _scst_remaining = Just 138
218 , _scst_events = Just []
219 }
220 -- TODO ...
221 pure ScraperStatus { _scst_succeeded = Just 137
222 , _scst_failed = Just 13
223 , _scst_remaining = Just 0
224 , _scst_events = Just []
225 }
226
227
228 addToCorpusWithFile :: FlowCmdM env err m
229 => CorpusId
230 -> MultipartData Mem
231 -> Maybe FileType
232 -> (ScraperStatus -> m ())
233 -> m ScraperStatus
234 addToCorpusWithFile cid input filetype logStatus = do
235 logStatus ScraperStatus { _scst_succeeded = Just 10
236 , _scst_failed = Just 2
237 , _scst_remaining = Just 138
238 , _scst_events = Just []
239 }
240 _h <- postUpload cid filetype input
241
242 pure ScraperStatus { _scst_succeeded = Just 137
243 , _scst_failed = Just 13
244 , _scst_remaining = Just 0
245 , _scst_events = Just []
246 }
247
248 addToCorpusWithForm :: FlowCmdM env err m
249 => CorpusId
250 -> WithForm
251 -> (ScraperStatus -> m ())
252 -> m ScraperStatus
253 addToCorpusWithForm cid (WithForm _ft d) logStatus = do
254
255 let docs = splitEvery 500
256 $ take 10000
257 $ parseHal' (cs d)
258
259 cid' <- flowCorpus "user1" (Right [cid]) (Multi EN) (map (map toHyperdataDocument) docs)
260 printDebug "cid'" cid'
261
262 logStatus ScraperStatus { _scst_succeeded = Just 10
263 , _scst_failed = Just 2
264 , _scst_remaining = Just 138
265 , _scst_events = Just []
266 }
267
268 pure ScraperStatus { _scst_succeeded = Just 137
269 , _scst_failed = Just 13
270 , _scst_remaining = Just 0
271 , _scst_events = Just []
272 }
273