]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Corpus/New.hs
[API Query] api connection ok, needs RootId fix
[gargantext.git] / src / Gargantext / API / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
24
25 module Gargantext.API.Corpus.New
26 where
27
28 import Control.Lens hiding (elements)
29 import Data.Aeson
30 import Data.Aeson.TH (deriveJSON)
31 import Data.Either
32 import Data.Maybe (fromMaybe)
33 import Data.Swagger
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Admin.Orchestrator.Types
37 import Gargantext.API.Corpus.New.File
38 import Gargantext.Core (Lang(..), allLangs)
39 import Gargantext.Core.Types.Individu (UserId, User(..))
40 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
41 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..), allDataOrigins)
42 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..))
43 import Gargantext.Prelude
44 import Servant
45 import Servant.API.Flatten (Flat)
46 import Servant.Job.Core
47 import Servant.Job.Types
48 import Servant.Job.Utils (jsonOptions)
49 import Servant.Multipart
50 import Test.QuickCheck (elements)
51 import Test.QuickCheck.Arbitrary
52 import Web.FormUrlEncoded (FromForm)
53 import qualified Gargantext.Text.Corpus.API as API
54 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
55
56 ------------------------------------------------------------------------
57 data Query = Query { query_query :: Text
58 , query_node_id :: Int
59 , query_lang :: Lang
60 , query_databases :: [DataOrigin]
61 }
62 deriving (Eq, Generic)
63
64 deriveJSON (unPrefix "query_") 'Query
65
66 instance Arbitrary Query where
67 arbitrary = elements [ Query q n la fs
68 | q <- ["honeybee* AND collapse"
69 ,"covid 19"
70 ]
71 , n <- [0..10]
72 , la <- allLangs
73 , fs <- take 3 $ repeat allDataOrigins
74 ]
75
76 instance ToSchema Query where
77 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
78
79 ------------------------------------------------------------------------
80
81 type Api = PostApi
82 :<|> GetApi
83
84 type PostApi = Summary "New Corpus endpoint"
85 :> ReqBody '[JSON] Query
86 :> Post '[JSON] CorpusId
87 type GetApi = Get '[JSON] ApiInfo
88
89 -- | TODO manage several apis
90 -- TODO-ACCESS
91 -- TODO this is only the POST
92 {-
93 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
94 api uid (Query q _ as) = do
95 cId <- case head as of
96 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
97 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
98 Just a -> do
99 docs <- liftBase $ API.get a q (Just 1000)
100 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
101 pure cId'
102
103 pure cId
104 -}
105
106 ------------------------------------------------
107 -- TODO use this route for Client implementation
108 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
109 deriving (Generic)
110 instance Arbitrary ApiInfo where
111 arbitrary = ApiInfo <$> arbitrary
112
113 deriveJSON (unPrefix "") 'ApiInfo
114
115 instance ToSchema ApiInfo
116
117 info :: FlowCmdM env err m => UserId -> m ApiInfo
118 info _u = pure $ ApiInfo API.externalAPIs
119
120 ------------------------------------------------------------------------
121 ------------------------------------------------------------------------
122 data WithQuery = WithQuery
123 { _wq_query :: !Text
124 , _wq_databases :: ![DataOrigin]
125 , _wq_lang :: !(Maybe (TermType Lang))
126 }
127 deriving Generic
128
129 makeLenses ''WithQuery
130 instance FromJSON WithQuery where
131 parseJSON = genericParseJSON $ jsonOptions "_wq_"
132 instance ToSchema WithQuery where
133 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
134
135 -------------------------------------------------------
136 data WithForm = WithForm
137 { _wf_filetype :: !FileType
138 , _wf_data :: !Text
139 , _wf_lang :: !(Maybe Lang)
140 , _wf_name :: !Text
141 } deriving (Eq, Show, Generic)
142
143 makeLenses ''WithForm
144 instance FromForm WithForm
145 instance FromJSON WithForm where
146 parseJSON = genericParseJSON $ jsonOptions "_wf_"
147 instance ToSchema WithForm where
148 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
149
150 ------------------------------------------------------------------------
151 type AsyncJobs event ctI input output =
152 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
153 ------------------------------------------------------------------------
154
155 type Upload = Summary "Corpus Upload endpoint"
156 :> "corpus"
157 :> Capture "corpus_id" CorpusId
158 :<|> "addWithquery"
159 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
160 :<|> "addWithfile"
161 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
162
163 type AddWithQuery = Summary "Add with Query to corpus endpoint"
164 :> "corpus"
165 :> Capture "corpus_id" CorpusId
166 :> "add"
167 :> "query"
168 :> "async"
169 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
170
171 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
172 :> "corpus"
173 :> Capture "corpus_id" CorpusId
174 :> "add"
175 :> "file"
176 :> MultipartForm Mem (MultipartData Mem)
177 :> QueryParam "fileType" FileType
178 :> "async"
179 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
180
181 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
182 :> "corpus"
183 :> Capture "corpus_id" CorpusId
184 :> "add"
185 :> "form"
186 :> "async"
187 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
188
189 ------------------------------------------------------------------------
190 -- TODO WithQuery also has a corpus id
191 addToCorpusWithQuery :: FlowCmdM env err m
192 => User
193 -> CorpusId
194 -> WithQuery
195 -> (ScraperStatus -> m ())
196 -> m ScraperStatus
197 addToCorpusWithQuery u cid (WithQuery q dbs l) logStatus = do
198 -- TODO ...
199 logStatus ScraperStatus { _scst_succeeded = Just 10
200 , _scst_failed = Just 2
201 , _scst_remaining = Just 138
202 , _scst_events = Just []
203 }
204 printDebug "addToCorpusWithQuery" cid
205 -- TODO add cid
206 -- TODO if cid is folder -> create Corpus
207 -- if cid is corpus -> add to corpus
208 -- if cid is root -> create corpus in Private
209 txts <- mapM (\db -> getDataText db (fromMaybe (Multi EN) l) q (Just 10000)) dbs
210 cids <- mapM (\txt -> flowDataText u txt (fromMaybe (Multi EN) l) cid) txts
211 printDebug "corpus id" cids
212 -- TODO ...
213 pure ScraperStatus { _scst_succeeded = Just 137
214 , _scst_failed = Just 13
215 , _scst_remaining = Just 0
216 , _scst_events = Just []
217 }
218
219 addToCorpusWithFile :: FlowCmdM env err m
220 => CorpusId
221 -> MultipartData Mem
222 -> Maybe FileType
223 -> (ScraperStatus -> m ())
224 -> m ScraperStatus
225 addToCorpusWithFile cid input filetype logStatus = do
226 logStatus ScraperStatus { _scst_succeeded = Just 10
227 , _scst_failed = Just 2
228 , _scst_remaining = Just 138
229 , _scst_events = Just []
230 }
231 printDebug "addToCorpusWithFile" cid
232 _h <- postUpload cid filetype input
233
234 pure ScraperStatus { _scst_succeeded = Just 137
235 , _scst_failed = Just 13
236 , _scst_remaining = Just 0
237 , _scst_events = Just []
238 }
239
240 addToCorpusWithForm :: FlowCmdM env err m
241 => User
242 -> CorpusId
243 -> WithForm
244 -> (ScraperStatus -> m ())
245 -> m ScraperStatus
246 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
247
248 let
249 parse = case ft of
250 CSV_HAL -> Parser.parseFormat Parser.CsvHal
251 CSV -> Parser.parseFormat Parser.CsvGargV3
252 WOS -> Parser.parseFormat Parser.WOS
253 PresseRIS -> Parser.parseFormat Parser.RisPresse
254
255 logStatus ScraperStatus { _scst_succeeded = Just 1
256 , _scst_failed = Just 0
257 , _scst_remaining = Just 1
258 , _scst_events = Just []
259 }
260
261 printDebug "Parsing corpus: " cid
262
263 -- TODO granularity of the logStatus
264 docs <- liftBase $ splitEvery 500
265 <$> take 1000000
266 <$> parse (cs d)
267
268 printDebug "Parsing corpus finished : " cid
269 printDebug "Starting extraction : " cid
270
271 -- TODO granularity of the logStatus
272 _cid' <- flowCorpus user
273 (Right [cid])
274 (Multi $ fromMaybe EN l)
275 (map (map toHyperdataDocument) docs)
276
277 printDebug "Extraction finished : " cid
278
279 pure ScraperStatus { _scst_succeeded = Just 2
280 , _scst_failed = Just 0
281 , _scst_remaining = Just 0
282 , _scst_events = Just []
283 }
284