{-|
Module      : Gargantext.API.Node.Corpus.New
Description : New corpus API
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

New corpus means either:
- a new corpus
- new data in an existing corpus
-}

{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeOperators     #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}

module Gargantext.API.Node.Corpus.New
      where

import Control.Lens hiding (elements, Empty)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.Maybe (fromMaybe)
import Data.Swagger
import Data.Text (Text)
import GHC.Generics (Generic)
import Gargantext.API.Admin.Orchestrator.Types (ScraperStatus(..))
import qualified Gargantext.API.Admin.Orchestrator.Types as T
import Gargantext.API.Node.Corpus.New.File
import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..), UserId)
import Gargantext.Prelude
import Servant
import Servant.Job.Core
import Servant.Job.Types
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary
import Web.FormUrlEncoded (FromForm)
import qualified Gargantext.Text.Corpus.API as API
import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)

------------------------------------------------------------------------
{-
data Query = Query { query_query     :: Text
                   , query_node_id   :: Int
                   , query_lang      :: Lang
                   , query_databases :: [DataOrigin]
                   }
                   deriving (Eq, Generic)

deriveJSON (unPrefix "query_") 'Query

instance Arbitrary Query where
  arbitrary = elements [ Query q n la fs
                       | q  <- ["honeybee* AND collapse"
                               ,"covid 19"
                               ]
                       , n  <- [0..10]
                       , la <- allLangs
                       , fs <- take 3 $ repeat allDataOrigins
                       ]

instance ToSchema Query where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
-}

------------------------------------------------------------------------

{-
type Api = PostApi
      :<|> GetApi

type PostApi = Summary "New Corpus endpoint"
             :> ReqBody '[JSON] Query
             :> Post '[JSON] CorpusId
type GetApi = Get '[JSON] ApiInfo
-}

-- | TODO manage several apis
-- TODO-ACCESS
-- TODO this is only the POST
{-
api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
api uid (Query q _ as) = do
  cId <- case head as of
    Nothing      -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just a       -> do
      docs <- liftBase $ API.get a q (Just 1000)
      cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
      pure cId'

  pure cId
-}

------------------------------------------------
-- TODO use this route for Client implementation
data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
  deriving (Generic)
instance Arbitrary ApiInfo where
  arbitrary = ApiInfo <$> arbitrary

deriveJSON (unPrefix "") 'ApiInfo

instance ToSchema ApiInfo

info :: FlowCmdM env err m => UserId -> m ApiInfo
info _u = pure $ ApiInfo API.externalAPIs

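-- 'info' ignores its UserId argument and simply returns the list of external
-- APIs available in 'API.externalAPIs', so a client can populate its
-- data-source selector. With 'deriveJSON (unPrefix "")' the field keeps its
-- plain name; assuming the default constructor encoding of
-- 'API.ExternalAPIs', the response looks roughly like
-- {"api_info": ["PubMed", "HAL", ...]} (values illustrative only).
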
------------------------------------------------------------------------

data Database = Empty
              | PubMed
              | HAL
              | IsTex
              | Isidore
  deriving (Eq, Show, Generic)

deriveJSON (unPrefix "") ''Database
instance ToSchema Database

database2origin :: Database -> DataOrigin
database2origin Empty   = InternalOrigin T.IsTex
database2origin PubMed  = ExternalOrigin T.PubMed
database2origin HAL     = ExternalOrigin T.HAL
database2origin IsTex   = ExternalOrigin T.IsTex
database2origin Isidore = ExternalOrigin T.Isidore

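-- Note that 'Empty' does not select an external scraper: it falls back to an
-- internal origin ('InternalOrigin T.IsTex'), presumably the data already
-- indexed locally, while every other constructor maps to the corresponding
-- external origin. 'Database' is the value carried by '_wq_databases' below.
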
137 ------------------------------------------------------------------------
138 data WithQuery = WithQuery
139 { _wq_query :: !Text
140 , _wq_databases :: !Database
141 , _wq_lang :: !Lang
142 , _wq_node_id :: !Int
143 }
144 deriving Generic
145
146 makeLenses ''WithQuery
147 instance FromJSON WithQuery where
148 parseJSON = genericParseJSON $ jsonOptions "_wq_"
149 instance ToSchema WithQuery where
150 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
151
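-- Assuming 'jsonOptions "_wq_"' (Servant.Job.Utils) strips the record-field
-- prefix, mirroring what 'unPrefixSwagger "_wq_"' does for the Swagger schema,
-- the expected JSON payload looks roughly like:
--
-- > { "query": "honeybee*", "databases": "PubMed", "lang": "EN", "node_id": 42 }
--
-- (field values here are illustrative only).
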
-------------------------------------------------------
data WithForm = WithForm
  { _wf_filetype :: !FileType
  , _wf_data     :: !Text
  , _wf_lang     :: !(Maybe Lang)
  , _wf_name     :: !Text
  } deriving (Eq, Show, Generic)

makeLenses ''WithForm
instance FromForm WithForm
instance FromJSON WithForm where
  parseJSON = genericParseJSON $ jsonOptions "_wf_"
instance ToSchema WithForm where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")

------------------------------------------------------------------------
type AsyncJobs event ctI input output =
  AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
------------------------------------------------------------------------

type AddWithQuery = Summary "Add with Query to corpus endpoint"
                 :> "corpus"
                 :> Capture "corpus_id" CorpusId
                 :> "query"
                 :> AsyncJobs ScraperStatus '[JSON] WithQuery ScraperStatus

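-- 'AddWithQuery' is mounted under /corpus/<corpus_id>/query. Being an
-- 'AsyncJobs' endpoint, servant-job exposes it as an asynchronous job (a POST
-- that starts the job plus the sub-routes servant-job adds for following it),
-- with 'WithQuery' as the JSON input and 'ScraperStatus' as both the event
-- and output type.
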
{-
type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
                :> "corpus"
                :> Capture "corpus_id" CorpusId
                :> "add"
                :> "file"
                :> MultipartForm Mem (MultipartData Mem)
                :> QueryParam "fileType" FileType
                :> "async"
                :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
-}

type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
                :> "corpus"
                :> Capture "corpus_id" CorpusId
                :> "add"
                :> "form"
                :> "async"
                :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus

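-- 'AddWithForm' is the upload counterpart: it is mounted under
-- /corpus/<corpus_id>/add/form/async and takes the 'WithForm' payload as
-- form-url-encoded data, again reporting progress through 'ScraperStatus'.
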
------------------------------------------------------------------------
-- TODO WithQuery also has a corpus id
addToCorpusWithQuery :: FlowCmdM env err m
                     => User
                     -> CorpusId
                     -> WithQuery
                     -> (ScraperStatus -> m ())
                     -> m ScraperStatus
addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
  -- TODO ...
  -- Report an initial progress status to the job log (placeholder figures).
  logStatus ScraperStatus { _scst_succeeded = Just 10
                          , _scst_failed    = Just 2
                          , _scst_remaining = Just 138
                          , _scst_events    = Just []
                          }
  printDebug "addToCorpusWithQuery" cid
  -- TODO add cid
  -- TODO if cid is folder -> create Corpus
  --      if cid is corpus -> add to corpus
  --      if cid is root   -> create corpus in Private

  -- Fetch the documents for the selected origin (at most 10000), then feed
  -- them to the flow that builds or updates the corpus.
  txts <- mapM (\db -> getDataText db (Multi l) q (Just 10000)) [database2origin dbs]
  cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
  printDebug "corpus id" cids
  -- TODO ...
  pure ScraperStatus { _scst_succeeded = Just 137
                     , _scst_failed    = Just 13
                     , _scst_remaining = Just 0
                     , _scst_events    = Just []
                     }

addToCorpusWithForm :: FlowCmdM env err m
                    => User
                    -> CorpusId
                    -> WithForm
                    -> (ScraperStatus -> m ())
                    -> m ScraperStatus
addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do

  -- Pick the parser matching the declared file type.
  let
    parse = case ft of
      CSV_HAL   -> Parser.parseFormat Parser.CsvHal
      CSV       -> Parser.parseFormat Parser.CsvGargV3
      WOS       -> Parser.parseFormat Parser.WOS
      PresseRIS -> Parser.parseFormat Parser.RisPresse

  logStatus ScraperStatus { _scst_succeeded = Just 1
                          , _scst_failed    = Just 0
                          , _scst_remaining = Just 1
                          , _scst_events    = Just []
                          }

  printDebug "Parsing corpus: " cid

  -- TODO granularity of the logStatus
  -- Parse the uploaded data, keep at most 1,000,000 documents and split them
  -- into batches of 500 for the flow below.
  docs <- liftBase $ splitEvery 500
                 <$> take 1000000
                 <$> parse (cs d)

  printDebug "Parsing corpus finished : " cid
  printDebug "Starting extraction : " cid

  -- TODO granularity of the logStatus
  _cid' <- flowCorpus user
                      (Right [cid])
                      (Multi $ fromMaybe EN l)
                      (map (map toHyperdataDocument) docs)

  printDebug "Extraction finished : " cid

  pure ScraperStatus { _scst_succeeded = Just 2
                     , _scst_failed    = Just 0
                     , _scst_remaining = Just 0
                     , _scst_events    = Just []
                     }

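-- A direct multipart upload variant, 'addToCorpusWithFile', is sketched below
-- but currently disabled (note that its Servant.Multipart import above is
-- commented out as well).
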
{-
addToCorpusWithFile :: FlowCmdM env err m
                    => CorpusId
                    -> MultipartData Mem
                    -> Maybe FileType
                    -> (ScraperStatus -> m ())
                    -> m ScraperStatus
addToCorpusWithFile cid input filetype logStatus = do
  logStatus ScraperStatus { _scst_succeeded = Just 10
                          , _scst_failed    = Just 2
                          , _scst_remaining = Just 138
                          , _scst_events    = Just []
                          }
  printDebug "addToCorpusWithFile" cid
  _h <- postUpload cid filetype input

  pure ScraperStatus { _scst_succeeded = Just 137
                     , _scst_failed    = Just 13
                     , _scst_remaining = Just 0
                     , _scst_events    = Just []
                     }
-}