]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[REFACT] hash functions
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17 {-# OPTIONS_GHC -fno-warn-orphans #-}
18
19 module Gargantext.API.Node.Corpus.New
20 where
21
22 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson
24 import Data.Aeson.TH (deriveJSON)
25 import Data.Either
26 import Data.Maybe (fromMaybe)
27 import Data.Swagger
28 import Data.Text (Text)
29 import GHC.Generics (Generic)
30 import Servant
31 import Servant.Job.Core
32 import Servant.Job.Types
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
37 import Web.FormUrlEncoded (FromForm)
38
39 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
40 import qualified Gargantext.API.Admin.Orchestrator.Types as T
41 import Gargantext.API.Node.Corpus.New.File
42 import Gargantext.Core (Lang(..){-, allLangs-})
43 import Gargantext.Core.Types.Individu (User(..))
44 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
45 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
46 import Gargantext.Database.Admin.Types.Hyperdata
47 import Gargantext.Database.Admin.Types.Node (CorpusId, UserId)
48 import Gargantext.Prelude
49 import qualified Gargantext.Text.Corpus.API as API
50 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
51
52 ------------------------------------------------------------------------
53 {-
54 data Query = Query { query_query :: Text
55 , query_node_id :: Int
56 , query_lang :: Lang
57 , query_databases :: [DataOrigin]
58 }
59 deriving (Eq, Generic)
60
61 deriveJSON (unPrefix "query_") 'Query
62
63 instance Arbitrary Query where
64 arbitrary = elements [ Query q n la fs
65 | q <- ["honeybee* AND collapse"
66 ,"covid 19"
67 ]
68 , n <- [0..10]
69 , la <- allLangs
70 , fs <- take 3 $ repeat allDataOrigins
71 ]
72
73 instance ToSchema Query where
74 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
75 -}
76
77 ------------------------------------------------------------------------
78
79 {-
80 type Api = PostApi
81 :<|> GetApi
82
83 type PostApi = Summary "New Corpus endpoint"
84 :> ReqBody '[JSON] Query
85 :> Post '[JSON] CorpusId
86 type GetApi = Get '[JSON] ApiInfo
87 -}
88
89 -- | TODO manage several apis
90 -- TODO-ACCESS
91 -- TODO this is only the POST
92 {-
93 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
94 api uid (Query q _ as) = do
95 cId <- case head as of
96 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
97 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
98 Just a -> do
99 docs <- liftBase $ API.get a q (Just 1000)
100 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
101 pure cId'
102
103 pure cId
104 -}
105
106 ------------------------------------------------
107 -- TODO use this route for Client implementation
-- | List of the external document-search APIs this server exposes.
-- Serialized to JSON with the field name kept as-is (empty prefix below).
-- TODO use this route for Client implementation
data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
  deriving (Generic)

-- Arbitrary instance piggybacks on the Arbitrary of [API.ExternalAPIs].
instance Arbitrary ApiInfo where
  arbitrary = ApiInfo <$> arbitrary

-- Derive To/FromJSON; "" prefix means JSON keys match record fields exactly.
deriveJSON (unPrefix "") 'ApiInfo

-- Swagger schema derived generically.
instance ToSchema ApiInfo
116
-- | Report which external APIs are available.
-- The requesting user is currently ignored: every user sees the same list.
info :: FlowCmdM env err m => UserId -> m ApiInfo
info _ = pure (ApiInfo API.externalAPIs)
119
120 ------------------------------------------------------------------------
121
-- | User-facing choice of document source for a corpus query.
-- 'Empty' stands for "no external database selected".
data Database = Empty
              | PubMed
              | HAL
              | IsTex
              | Isidore
  deriving (Eq, Show, Generic)

-- Derive To/FromJSON; "" prefix keeps constructor names as JSON values.
deriveJSON (unPrefix "") ''Database

-- Swagger schema derived generically.
instance ToSchema Database
131
-- | Translate the user-facing 'Database' selector into the internal
-- 'DataOrigin' used by the flow machinery.
-- NOTE(review): 'Empty' maps to @InternalOrigin T.IsTex@ while 'IsTex'
-- maps to @ExternalOrigin T.IsTex@ — looks intentional (internal search
-- fallback) but worth confirming.
database2origin :: Database -> DataOrigin
database2origin db = case db of
  Empty   -> InternalOrigin T.IsTex
  PubMed  -> ExternalOrigin T.PubMed
  HAL     -> ExternalOrigin T.HAL
  IsTex   -> ExternalOrigin T.IsTex
  Isidore -> ExternalOrigin T.Isidore
138
139 ------------------------------------------------------------------------
-- | Payload of the "add to corpus with query" endpoint: the search
-- query, the database to search, the query language, and a node id.
-- JSON keys are the field names with the "_wq_" prefix stripped.
data WithQuery = WithQuery
  { _wq_query     :: !Text
  , _wq_databases :: !Database
  , _wq_lang      :: !Lang
  , _wq_node_id   :: !Int
  }
  deriving Generic

makeLenses ''WithQuery

instance FromJSON WithQuery where
  -- Strip the "_wq_" prefix when reading JSON keys.
  parseJSON = genericParseJSON $ jsonOptions "_wq_"

instance ToSchema WithQuery where
  -- Keep the Swagger schema's field names consistent with the JSON form.
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
153
154 -------------------------------------------------------
-- | Payload of the form-upload endpoint: the declared file type, the
-- raw file contents as text, an optional language, and a display name.
-- Accepted both as form-url-encoded data and as JSON ("_wf_" prefix
-- stripped from JSON keys).
data NewWithForm = NewWithForm
  { _wf_filetype :: !FileType
  , _wf_data     :: !Text
  , _wf_lang     :: !(Maybe Lang)
  , _wf_name     :: !Text
  } deriving (Eq, Show, Generic)

makeLenses ''NewWithForm

-- Generic form decoding: field names map to form keys.
instance FromForm NewWithForm

instance FromJSON NewWithForm where
  -- Strip the "_wf_" prefix when reading JSON keys.
  parseJSON = genericParseJSON $ jsonOptions "_wf_"

instance ToSchema NewWithForm where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
168
169 ------------------------------------------------------------------------
-- | Shorthand for an async-job sub-API: unsafe job creation, safe
-- callbacks, request content-type @ctI@, JSON responses.
type AsyncJobs event ctI input output =
  AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
------------------------------------------------------------------------

-- | POST @\/corpus\/:corpus_id\/query@ — launch an async job that adds
-- the results of a 'WithQuery' search to the given corpus; progress and
-- final result are both reported as 'JobLog'.
type AddWithQuery = Summary "Add with Query to corpus endpoint"
                  :> "corpus"
                  :> Capture "corpus_id" CorpusId
                  :> "query"
                  :> AsyncJobs JobLog '[JSON] WithQuery JobLog
179
180 {-
181 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
182 :> "corpus"
183 :> Capture "corpus_id" CorpusId
184 :> "add"
185 :> "file"
186 :> MultipartForm Mem (MultipartData Mem)
187 :> QueryParam "fileType" FileType
188 :> "async"
189 :> AsyncJobs JobLog '[JSON] () JobLog
190 -}
191
-- | POST @\/corpus\/:corpus_id\/add\/form\/async@ — launch an async job
-- that parses an uploaded 'NewWithForm' payload and adds the documents
-- to the given corpus; progress and final result are 'JobLog's.
type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
                 :> "corpus"
                 :> Capture "corpus_id" CorpusId
                 :> "add"
                 :> "form"
                 :> "async"
                 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
199
200
201 ------------------------------------------------------------------------
202 -- TODO WithQuery also has a corpus id
-- | Run a search query against the selected database and flow the
-- resulting documents into corpus @cid@, reporting progress through
-- 'logStatus'.  Progress step counts are coarse placeholders (see TODOs).
-- TODO WithQuery also has a corpus id
addToCorpusWithQuery :: FlowCmdM env err m
                     => User
                     -> CorpusId
                     -> WithQuery
                     -> (JobLog -> m ())
                     -> m JobLog
addToCorpusWithQuery user cid (WithQuery q dbs l _nid) logStatus = do
  -- TODO ...
  logStatus (jobLog 0 5)
  printDebug "addToCorpusWithQuery" cid
  -- TODO add cid
  -- TODO if cid is folder -> create Corpus
  --      if cid is corpus -> add to corpus
  --      if cid is root   -> create corpus in Private
  dataTexts <- mapM (\origin -> getDataText origin (Multi l) q (Just 10000))
                    [database2origin dbs]

  logStatus (jobLog 2 1)

  corpusIds <- mapM (\txt -> flowDataText user txt (Multi l) cid) dataTexts
  printDebug "corpus id" corpusIds
  -- TODO ...
  pure (jobLog 3 0)
  where
    -- Progress report with the given succeeded/remaining counts; this
    -- endpoint never reports failures.
    jobLog s r = JobLog { _scst_succeeded = Just s
                        , _scst_failed    = Just 0
                        , _scst_remaining = Just r
                        , _scst_events    = Just []
                        }
237
-- | Parse an uploaded text payload according to its declared 'FileType'
-- and flow the resulting documents into corpus @cid@, reporting
-- progress through 'logStatus'.  Language defaults to EN when the form
-- did not specify one.
addToCorpusWithForm :: FlowCmdM env err m
                    => User
                    -> CorpusId
                    -> NewWithForm
                    -> (JobLog -> m ())
                    -> m JobLog
addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do

  printDebug "Parsing corpus: " cid
  logStatus (progress 0 2)

  -- Select the parser matching the declared file type.
  let parse = Parser.parseFormat $ case ft of
        CSV_HAL   -> Parser.CsvHal
        CSV       -> Parser.CsvGargV3
        WOS       -> Parser.WOS
        PresseRIS -> Parser.RisPresse

  -- TODO granularity of the logStatus
  -- Cap the document count at 1M and batch them in chunks of 500.
  docs <- liftBase $ fmap (splitEvery 500 . take 1000000) (parse (cs d))

  printDebug "Parsing corpus finished : " cid
  logStatus (progress 1 1)

  printDebug "Starting extraction : " cid
  -- TODO granularity of the logStatus
  _cid' <- flowCorpus user
                      (Right [cid])
                      (Multi $ fromMaybe EN l)
                      (map (map toHyperdataDocument) docs)

  printDebug "Extraction finished : " cid
  pure (progress 2 0)
  where
    -- Progress report with the given succeeded/remaining counts; this
    -- endpoint never reports failures.
    progress s r = JobLog { _scst_succeeded = Just s
                          , _scst_failed    = Just 0
                          , _scst_remaining = Just r
                          , _scst_events    = Just []
                          }
285
286 {-
287 addToCorpusWithFile :: FlowCmdM env err m
288 => CorpusId
289 -> MultipartData Mem
290 -> Maybe FileType
291 -> (JobLog -> m ())
292 -> m JobLog
293 addToCorpusWithFile cid input filetype logStatus = do
294 logStatus JobLog { _scst_succeeded = Just 10
295 , _scst_failed = Just 2
296 , _scst_remaining = Just 138
297 , _scst_events = Just []
298 }
299 printDebug "addToCorpusWithFile" cid
300 _h <- postUpload cid filetype input
301
302 pure JobLog { _scst_succeeded = Just 137
303 , _scst_failed = Just 13
304 , _scst_remaining = Just 0
305 , _scst_events = Just []
306 }
307 -}
308
309