]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Corpus/New.hs
[FIX] Upload CSV + new Lang all
[gargantext.git] / src / Gargantext / API / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23
24 module Gargantext.API.Corpus.New
25 where
26
27 --import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat(..))
28 import Control.Lens hiding (elements)
29 import Control.Monad.IO.Class (liftIO)
30 import Data.Aeson
31 import Data.Aeson.TH (deriveJSON)
32 import Data.Either
33 import Data.Swagger
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Corpus.New.File
37 import Gargantext.API.Orchestrator.Types
38 import Gargantext.Core (Lang(..))
39 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
40 import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
41 import Gargantext.Database.Flow (flowCorpusSearchInDatabase)
42 import Gargantext.Database.Types.Node (CorpusId)
43 import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
44 import Gargantext.Database.Types.Node (UserId)
45 import Gargantext.Prelude
46 import Gargantext.Text.Corpus.Parsers.CSV (parseCsv', parseHal')
47 import Gargantext.Text.Terms (TermType(..))
48 import Servant
49 import Servant.API.Flatten (Flat)
50 import Servant.Job.Core
51 import Servant.Job.Types
52 import Servant.Job.Utils (jsonOptions)
53 import Servant.Multipart
54 import Test.QuickCheck (elements)
55 import Test.QuickCheck.Arbitrary
56 import Web.FormUrlEncoded (FromForm)
57 import qualified Gargantext.Text.Corpus.API as API
58
59 data Query = Query { query_query :: Text
60 , query_corpus_id :: Int
61 , query_databases :: [API.ExternalAPIs]
62 }
63 deriving (Eq, Show, Generic)
64
65 deriveJSON (unPrefix "query_") 'Query
66
67
68 instance Arbitrary Query where
69 arbitrary = elements [ Query q n fs
70 | q <- ["a","b"]
71 , n <- [0..10]
72 , fs <- take 3 $ repeat API.externalAPIs
73 ]
74
75 instance ToSchema Query where
76 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
77
78 type Api = Summary "New Corpus endpoint"
79 :> ReqBody '[JSON] Query
80 :> Post '[JSON] CorpusId
81 :<|> Get '[JSON] ApiInfo
82
83 -- | TODO manage several apis
84 -- TODO-ACCESS
85 -- TODO this is only the POST
86 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
87 api _uId (Query q _ as) = do
88 cId <- case head as of
89 Nothing -> flowCorpusSearchInDatabase "user1" EN q
90 Just API.All -> flowCorpusSearchInDatabase "user1" EN q
91 Just a -> do
92 docs <- liftIO $ API.get a q (Just 1000)
93 cId' <- flowCorpus "user1" (Left q) (Multi EN) [docs]
94 pure cId'
95
96 pure cId
97
98 ------------------------------------------------
99 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
100 deriving (Generic)
101 instance Arbitrary ApiInfo where
102 arbitrary = ApiInfo <$> arbitrary
103
104 deriveJSON (unPrefix "") 'ApiInfo
105
106 instance ToSchema ApiInfo
107
108 info :: FlowCmdM env err m => UserId -> m ApiInfo
109 info _u = pure $ ApiInfo API.externalAPIs
110
111 ------------------------------------------------------------------------
112 ------------------------------------------------------------------------
113 data WithQuery = WithQuery
114 { _wq_query :: !Text
115 , _wq_databases :: ![ExternalAPIs]
116 , _wq_lang :: !(Maybe Lang)
117 }
118 deriving Generic
119
120 makeLenses ''WithQuery
121 instance FromJSON WithQuery where
122 parseJSON = genericParseJSON $ jsonOptions "_wq_"
123 instance ToSchema WithQuery where
124 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
125
126 -------------------------------------------------------
127 data WithForm = WithForm
128 { _wf_filetype :: !FileType
129 , _wf_data :: !Text
130 } deriving (Eq, Show, Generic)
131
132 makeLenses ''WithForm
133 instance FromForm WithForm
134 instance FromJSON WithForm where
135 parseJSON = genericParseJSON $ jsonOptions "_wf_"
136 instance ToSchema WithForm where
137 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
138
139 ------------------------------------------------------------------------
140 type AsyncJobs event ctI input output =
141 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
142 ------------------------------------------------------------------------
143
144 type Upload = Summary "Corpus Upload endpoint"
145 :> "corpus"
146 :> Capture "corpus_id" CorpusId
147 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
148 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
149
150
151 type AddWithQuery = Summary "Add with Query to corpus endpoint"
152 :> "corpus"
153 :> Capture "corpus_id" CorpusId
154 :> "add"
155 :> "query"
156 :> "async"
157 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
158
159 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
160 :> "corpus"
161 :> Capture "corpus_id" CorpusId
162 :> "add"
163 :> "file"
164 :> MultipartForm Mem (MultipartData Mem)
165 :> QueryParam "fileType" FileType
166 :> "async"
167 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
168
169 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
170 :> "corpus"
171 :> Capture "corpus_id" CorpusId
172 :> "add"
173 :> "form"
174 :> "async"
175 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
176
177 ------------------------------------------------------------------------
178 -- TODO WithQuery also has a corpus id
179 addToCorpusJobFunction :: FlowCmdM env err m
180 => CorpusId
181 -> WithQuery
182 -> (ScraperStatus -> m ())
183 -> m ScraperStatus
184 addToCorpusJobFunction _cid (WithQuery _q _dbs _l) logStatus = do
185 -- TODO ...
186 logStatus ScraperStatus { _scst_succeeded = Just 10
187 , _scst_failed = Just 2
188 , _scst_remaining = Just 138
189 , _scst_events = Just []
190 }
191 -- TODO ...
192 pure ScraperStatus { _scst_succeeded = Just 137
193 , _scst_failed = Just 13
194 , _scst_remaining = Just 0
195 , _scst_events = Just []
196 }
197
198
199 addToCorpusWithFile :: FlowCmdM env err m
200 => CorpusId
201 -> MultipartData Mem
202 -> Maybe FileType
203 -> (ScraperStatus -> m ())
204 -> m ScraperStatus
205 addToCorpusWithFile cid input filetype logStatus = do
206 logStatus ScraperStatus { _scst_succeeded = Just 10
207 , _scst_failed = Just 2
208 , _scst_remaining = Just 138
209 , _scst_events = Just []
210 }
211 _h <- postUpload cid filetype input
212
213 pure ScraperStatus { _scst_succeeded = Just 137
214 , _scst_failed = Just 13
215 , _scst_remaining = Just 0
216 , _scst_events = Just []
217 }
218
219 addToCorpusWithForm :: FlowCmdM env err m
220 => CorpusId
221 -> WithForm
222 -> (ScraperStatus -> m ())
223 -> m ScraperStatus
224 addToCorpusWithForm cid (WithForm ft d) logStatus = do
225
226 printDebug "ft" ft
227
228 let
229 parse = case ft of
230 CSV_HAL -> parseHal'
231 CSV -> parseCsv'
232 _ -> parseHal'
233
234 docs = splitEvery 500
235 $ take 1000000
236 $ parse (cs d)
237
238 logStatus ScraperStatus { _scst_succeeded = Just 1
239 , _scst_failed = Just 0
240 , _scst_remaining = Just 1
241 , _scst_events = Just []
242 }
243
244 cid' <- flowCorpus "user1" (Right [cid]) (Multi EN) (map (map toHyperdataDocument) docs)
245 printDebug "cid'" cid'
246
247 pure ScraperStatus { _scst_succeeded = Just 2
248 , _scst_failed = Just 0
249 , _scst_remaining = Just 0
250 , _scst_events = Just []
251 }
252