{-|
Module      : Gargantext.API.Node.Corpus.New
Description : New corpus API
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

New corpus means either:
- a new corpus
- new data in an existing corpus
-}

{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RankNTypes #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}

module Gargantext.API.Node.Corpus.New
      where

import Control.Lens hiding (elements, Empty)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.Maybe (fromMaybe)
import Data.Swagger
import Data.Text (Text)
import GHC.Generics (Generic)
import Servant
import Servant.Job.Core
import Servant.Job.Types
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary
import Web.FormUrlEncoded (FromForm)

import Gargantext.API.Admin.Orchestrator.Types (ScraperStatus(..))
import qualified Gargantext.API.Admin.Orchestrator.Types as T
import Gargantext.API.Node.Corpus.New.File
import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, UserId)
import Gargantext.Prelude
import qualified Gargantext.Text.Corpus.API as API
import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)

------------------------------------------------------------------------
{-
data Query = Query { query_query      :: Text
                   , query_node_id    :: Int
                   , query_lang       :: Lang
                   , query_databases  :: [DataOrigin]
                   }
  deriving (Eq, Generic)

deriveJSON (unPrefix "query_") 'Query

instance Arbitrary Query where
  arbitrary = elements [ Query q n la fs
                       | q  <- ["honeybee* AND collapse"
                               ,"covid 19"
                               ]
                       , n  <- [0..10]
                       , la <- allLangs
                       , fs <- take 3 $ repeat allDataOrigins
                       ]

instance ToSchema Query where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
-}

------------------------------------------------------------------------

{-
type Api = PostApi
      :<|> GetApi

type PostApi = Summary "New Corpus endpoint"
             :> ReqBody '[JSON] Query
             :> Post '[JSON] CorpusId
type GetApi = Get '[JSON] ApiInfo
-}

-- | TODO manage several apis
-- TODO-ACCESS
-- TODO this is only the POST
{-
api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
api uid (Query q _ as) = do
  cId <- case head as of
    Nothing      -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just a       -> do
      docs <- liftBase $ API.get a q (Just 1000)
      cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
      pure cId'

  pure cId
-}

------------------------------------------------
-- TODO use this route for Client implementation
data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
  deriving (Generic)
instance Arbitrary ApiInfo where
  arbitrary = ApiInfo <$> arbitrary

deriveJSON (unPrefix "") 'ApiInfo

instance ToSchema ApiInfo

info :: FlowCmdM env err m => UserId -> m ApiInfo
info _u = pure $ ApiInfo API.externalAPIs

------------------------------------------------------------------------

data Database = Empty
              | PubMed
              | HAL
              | IsTex
              | Isidore
  deriving (Eq, Show, Generic)

deriveJSON (unPrefix "") ''Database
instance ToSchema Database

database2origin :: Database -> DataOrigin
database2origin Empty   = InternalOrigin T.IsTex
database2origin PubMed  = ExternalOrigin T.PubMed
database2origin HAL     = ExternalOrigin T.HAL
database2origin IsTex   = ExternalOrigin T.IsTex
database2origin Isidore = ExternalOrigin T.Isidore

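-- Note on the mapping above: the external constructors map one-to-one onto
-- their APIs (e.g. @database2origin PubMed@ gives @ExternalOrigin T.PubMed@),
-- while 'Empty' falls back to 'InternalOrigin T.IsTex', i.e. (presumably)
-- data already inside Gargantext rather than an external source.
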
------------------------------------------------------------------------
data WithQuery = WithQuery
  { _wq_query     :: !Text
  , _wq_databases :: !Database
  , _wq_lang      :: !Lang
  , _wq_node_id   :: !Int
  }
  deriving Generic

makeLenses ''WithQuery
instance FromJSON WithQuery where
  parseJSON = genericParseJSON $ jsonOptions "_wq_"
instance ToSchema WithQuery where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")

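-- Illustrative JSON body for 'WithQuery' (an assumption, not a recorded
-- request: it supposes that 'jsonOptions "_wq_"' strips the field prefix and
-- that 'Database' and 'Lang' serialise to their constructor names):
--
-- > { "query"     : "honeybee* AND collapse"
-- > , "databases" : "PubMed"
-- > , "lang"      : "EN"
-- > , "node_id"   : 42
-- > }
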
-------------------------------------------------------
data WithForm = WithForm
  { _wf_filetype :: !FileType
  , _wf_data     :: !Text
  , _wf_lang     :: !(Maybe Lang)
  , _wf_name     :: !Text
  } deriving (Eq, Show, Generic)

makeLenses ''WithForm
instance FromForm WithForm
instance FromJSON WithForm where
  parseJSON = genericParseJSON $ jsonOptions "_wf_"
instance ToSchema WithForm where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")

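-- Illustrative JSON body for 'WithForm', under the same assumption that
-- 'jsonOptions "_wf_"' strips the field prefix (the form-url-encoded variant
-- goes through the generic 'FromForm' instance instead; the exact encodings
-- of 'FileType' and 'Lang' are assumptions here):
--
-- > { "filetype" : "CSV"
-- > , "data"     : "title,abstract,...\n..."
-- > , "lang"     : "EN"
-- > , "name"     : "my corpus"
-- > }
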
------------------------------------------------------------------------
type AsyncJobs event ctI input output =
  AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
------------------------------------------------------------------------

type AddWithQuery = Summary "Add with Query to corpus endpoint"
   :> "corpus"
   :> Capture "corpus_id" CorpusId
   :> "query"
   :> AsyncJobs ScraperStatus '[JSON] WithQuery ScraperStatus

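-- Reading the combinators above, this route is roughly
--
-- >   POST /corpus/<corpus_id>/query/...
--
-- where the trailing sub-paths are whatever servant-job's 'AsyncJobsAPI''
-- adds for starting and polling the asynchronous job; the job input is a
-- 'WithQuery' and progress is reported as 'ScraperStatus'.
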
{-
type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
   :> "corpus"
   :> Capture "corpus_id" CorpusId
   :> "add"
   :> "file"
   :> MultipartForm Mem (MultipartData Mem)
   :> QueryParam "fileType" FileType
   :> "async"
   :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
-}

type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
   :> "corpus"
   :> Capture "corpus_id" CorpusId
   :> "add"
   :> "form"
   :> "async"
   :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus

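-- Similarly, 'AddWithForm' resolves to roughly
--
-- >   POST /corpus/<corpus_id>/add/form/async/...
--
-- (servant-job again supplying the concrete job sub-routes), taking a
-- form-url-encoded 'WithForm' body and reporting progress as 'ScraperStatus'.
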
------------------------------------------------------------------------
-- TODO WithQuery also has a corpus id
addToCorpusWithQuery :: FlowCmdM env err m
                     => User
                     -> CorpusId
                     -> WithQuery
                     -> (ScraperStatus -> m ())
                     -> m ScraperStatus
addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
  -- TODO ...
  logStatus ScraperStatus { _scst_succeeded = Just 10
                          , _scst_failed    = Just 2
                          , _scst_remaining = Just 138
                          , _scst_events    = Just []
                          }
  printDebug "addToCorpusWithQuery" cid
  -- TODO add cid
  -- TODO if cid is folder -> create Corpus
  --      if cid is corpus -> add to corpus
  --      if cid is root   -> create corpus in Private
  txts <- mapM (\db -> getDataText db (Multi l) q (Just 10000)) [database2origin dbs]
  cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
  printDebug "corpus id" cids
  -- TODO ...
  pure ScraperStatus { _scst_succeeded = Just 137
                     , _scst_failed    = Just 13
                     , _scst_remaining = Just 0
                     , _scst_events    = Just []
                     }

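-- Hypothetical call, for illustration only (the user value, query, language
-- and node id are made up, and 'UserName' is assumed to be one of the 'User'
-- constructors; in practice the servant-job handler calls this and supplies
-- the progress callback):
--
-- >   addToCorpusWithQuery (UserName "alice") corpusId
-- >                        (WithQuery "honeybee* AND collapse" PubMed EN 0)
-- >                        logStatus
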
addToCorpusWithForm :: FlowCmdM env err m
                    => User
                    -> CorpusId
                    -> WithForm
                    -> (ScraperStatus -> m ())
                    -> m ScraperStatus
addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do

  let
    parse = case ft of
      CSV_HAL   -> Parser.parseFormat Parser.CsvHal
      CSV       -> Parser.parseFormat Parser.CsvGargV3
      WOS       -> Parser.parseFormat Parser.WOS
      PresseRIS -> Parser.parseFormat Parser.RisPresse

  logStatus ScraperStatus { _scst_succeeded = Just 1
                          , _scst_failed    = Just 0
                          , _scst_remaining = Just 1
                          , _scst_events    = Just []
                          }

  printDebug "Parsing corpus: " cid

  -- TODO granularity of the logStatus
  docs <- liftBase $ splitEvery 500
                 <$> take 1000000
                 <$> parse (cs d)

  printDebug "Parsing corpus finished : " cid
  printDebug "Starting extraction : " cid

  -- TODO granularity of the logStatus
  _cid' <- flowCorpus user
                      (Right [cid])
                      (Multi $ fromMaybe EN l)
                      (map (map toHyperdataDocument) docs)

  printDebug "Extraction finished : " cid

  pure ScraperStatus { _scst_succeeded = Just 2
                     , _scst_failed    = Just 0
                     , _scst_remaining = Just 0
                     , _scst_events    = Just []
                     }

{-
addToCorpusWithFile :: FlowCmdM env err m
                    => CorpusId
                    -> MultipartData Mem
                    -> Maybe FileType
                    -> (ScraperStatus -> m ())
                    -> m ScraperStatus
addToCorpusWithFile cid input filetype logStatus = do
  logStatus ScraperStatus { _scst_succeeded = Just 10
                          , _scst_failed    = Just 2
                          , _scst_remaining = Just 138
                          , _scst_events    = Just []
                          }
  printDebug "addToCorpusWithFile" cid
  _h <- postUpload cid filetype input

  pure ScraperStatus { _scst_succeeded = Just 137
                     , _scst_failed    = Just 13
                     , _scst_remaining = Just 0
                     , _scst_events    = Just []
                     }
-}