]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Corpus/New.hs
[Clean] before factoring
[gargantext.git] / src / Gargantext / API / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
24
25 module Gargantext.API.Corpus.New
26 where
27
28 import Control.Lens hiding (elements)
29 import Data.Aeson
30 import Data.Aeson.TH (deriveJSON)
31 import Data.Maybe (fromMaybe)
32 import Data.Either
33 import Data.Swagger
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Corpus.New.File
37 import Gargantext.API.Orchestrator.Types
38 import Gargantext.Core (Lang(..))
39 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
40 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, flowCorpusSearchInDatabase)
41 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..))
42 import Gargantext.Core.Types.Individu (UserId, User(..))
43 import Gargantext.Prelude
44 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
45 import Gargantext.Text.Terms (TermType(..))
46 import Servant
47 import Servant.API.Flatten (Flat)
48 import Servant.Job.Core
49 import Servant.Job.Types
50 import Servant.Job.Utils (jsonOptions)
51 import Servant.Multipart
52 import Test.QuickCheck (elements)
53 import Test.QuickCheck.Arbitrary
54 import Web.FormUrlEncoded (FromForm)
55 import qualified Gargantext.Text.Corpus.API as API
56
57 ------------------------------------------------------------------------
58 data Query = Query { query_query :: Text
59 , query_corpus_id :: Int
60 , query_databases :: [API.ExternalAPIs]
61 }
62 deriving (Eq, Show, Generic)
63
64 deriveJSON (unPrefix "query_") 'Query
65
66 instance Arbitrary Query where
67 arbitrary = elements [ Query q n fs
68 | q <- ["honeybee* AND collopase"
69 ,"covid 19"]
70 , n <- [0..10]
71 , fs <- take 3 $ repeat API.externalAPIs
72 ]
73
74 instance ToSchema Query where
75 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
76
77 ------------------------------------------------------------------------
78
79 type Api = PostApi
80 :<|> GetApi
81
82 type PostApi = Summary "New Corpus endpoint"
83 :> ReqBody '[JSON] Query
84 :> Post '[JSON] CorpusId
85 type GetApi = Get '[JSON] ApiInfo
86
87 -- | TODO manage several apis
88 -- TODO-ACCESS
89 -- TODO this is only the POST
90 {-
91 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
92 api uid (Query q _ as) = do
93 cId <- case head as of
94 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
95 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
96 Just a -> do
97 docs <- liftBase $ API.get a q (Just 1000)
98 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
99 pure cId'
100
101 pure cId
102 -}
103
104 ------------------------------------------------
105 -- TODO use this route for Client implementation
106 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
107 deriving (Generic)
108 instance Arbitrary ApiInfo where
109 arbitrary = ApiInfo <$> arbitrary
110
111 deriveJSON (unPrefix "") 'ApiInfo
112
113 instance ToSchema ApiInfo
114
115 info :: FlowCmdM env err m => UserId -> m ApiInfo
116 info _u = pure $ ApiInfo API.externalAPIs
117
118 ------------------------------------------------------------------------
119 ------------------------------------------------------------------------
120 data WithQuery = WithQuery
121 { _wq_query :: !Text
122 , _wq_databases :: ![ExternalAPIs]
123 , _wq_lang :: !(Maybe Lang)
124 }
125 deriving Generic
126
127 makeLenses ''WithQuery
128 instance FromJSON WithQuery where
129 parseJSON = genericParseJSON $ jsonOptions "_wq_"
130 instance ToSchema WithQuery where
131 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
132
133 -------------------------------------------------------
134 data WithForm = WithForm
135 { _wf_filetype :: !FileType
136 , _wf_data :: !Text
137 , _wf_lang :: !(Maybe Lang)
138 , _wf_name :: !Text
139 } deriving (Eq, Show, Generic)
140
141 makeLenses ''WithForm
142 instance FromForm WithForm
143 instance FromJSON WithForm where
144 parseJSON = genericParseJSON $ jsonOptions "_wf_"
145 instance ToSchema WithForm where
146 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
147
148 ------------------------------------------------------------------------
149 type AsyncJobs event ctI input output =
150 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
151 ------------------------------------------------------------------------
152
153 type Upload = Summary "Corpus Upload endpoint"
154 :> "corpus"
155 :> Capture "corpus_id" CorpusId
156 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
157 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
158
159 type AddWithQuery = Summary "Add with Query to corpus endpoint"
160 :> "corpus"
161 :> Capture "corpus_id" CorpusId
162 :> "add"
163 :> "query"
164 :> "async"
165 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
166
167 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
168 :> "corpus"
169 :> Capture "corpus_id" CorpusId
170 :> "add"
171 :> "file"
172 :> MultipartForm Mem (MultipartData Mem)
173 :> QueryParam "fileType" FileType
174 :> "async"
175 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
176
177 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
178 :> "corpus"
179 :> Capture "corpus_id" CorpusId
180 :> "add"
181 :> "form"
182 :> "async"
183 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
184
185 ------------------------------------------------------------------------
186 -- TODO WithQuery also has a corpus id
187 addToCorpusWithQuery :: FlowCmdM env err m
188 => User
189 -> CorpusId
190 -> WithQuery
191 -> (ScraperStatus -> m ())
192 -> m ScraperStatus
193 addToCorpusWithQuery u cid (WithQuery q _dbs l) logStatus = do
194 -- TODO ...
195 logStatus ScraperStatus { _scst_succeeded = Just 10
196 , _scst_failed = Just 2
197 , _scst_remaining = Just 138
198 , _scst_events = Just []
199 }
200 printDebug "addToCorpusWithQuery" cid
201 -- TODO add cid
202 -- TODO if cid is folder -> create Corpus
203 -- if cid is corpus -> add to corpus
204 -- if cid is root -> create corpus in Private
205 cids <- flowCorpusSearchInDatabase u (maybe EN identity l) q
206 printDebug "corpus id" cids
207 -- TODO ...
208 pure ScraperStatus { _scst_succeeded = Just 137
209 , _scst_failed = Just 13
210 , _scst_remaining = Just 0
211 , _scst_events = Just []
212 }
213
214 addToCorpusWithFile :: FlowCmdM env err m
215 => CorpusId
216 -> MultipartData Mem
217 -> Maybe FileType
218 -> (ScraperStatus -> m ())
219 -> m ScraperStatus
220 addToCorpusWithFile cid input filetype logStatus = do
221 logStatus ScraperStatus { _scst_succeeded = Just 10
222 , _scst_failed = Just 2
223 , _scst_remaining = Just 138
224 , _scst_events = Just []
225 }
226 printDebug "addToCorpusWithFile" cid
227 _h <- postUpload cid filetype input
228
229 pure ScraperStatus { _scst_succeeded = Just 137
230 , _scst_failed = Just 13
231 , _scst_remaining = Just 0
232 , _scst_events = Just []
233 }
234
235 addToCorpusWithForm :: FlowCmdM env err m
236 => User
237 -> CorpusId
238 -> WithForm
239 -> (ScraperStatus -> m ())
240 -> m ScraperStatus
241 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
242
243 let
244 parse = case ft of
245 CSV_HAL -> Parser.parseFormat Parser.CsvHal
246 CSV -> Parser.parseFormat Parser.CsvGargV3
247 WOS -> Parser.parseFormat Parser.WOS
248 PresseRIS -> Parser.parseFormat Parser.RisPresse
249
250 logStatus ScraperStatus { _scst_succeeded = Just 1
251 , _scst_failed = Just 0
252 , _scst_remaining = Just 1
253 , _scst_events = Just []
254 }
255
256 printDebug "Parsing corpus: " cid
257
258 -- TODO granularity of the logStatus
259 docs <- liftBase $ splitEvery 500
260 <$> take 1000000
261 <$> parse (cs d)
262
263 printDebug "Parsing corpus finished : " cid
264 printDebug "Starting extraction : " cid
265
266 -- TODO granularity of the logStatus
267 _cid' <- flowCorpus user
268 (Right [cid])
269 (Multi $ fromMaybe EN l)
270 (map (map toHyperdataDocument) docs)
271
272 printDebug "Extraction finished : " cid
273
274 pure ScraperStatus { _scst_succeeded = Just 2
275 , _scst_failed = Just 0
276 , _scst_remaining = Just 0
277 , _scst_events = Just []
278 }
279