]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Corpus/New.hs
[WIP][REFACT] imports fixed
[gargantext.git] / src / Gargantext / API / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
24
25 module Gargantext.API.Corpus.New
26 where
27
28 import Control.Lens hiding (elements)
29 import Data.Aeson
30 import Data.Aeson.TH (deriveJSON)
31 import Data.Maybe (fromMaybe)
32 import Data.Either
33 import Data.Swagger
34 import Data.Text (Text)
35 import GHC.Generics (Generic)
36 import Gargantext.API.Corpus.New.File
37 import Gargantext.API.Orchestrator.Types
38 import Gargantext.Core (Lang(..))
39 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
40 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, flowCorpusSearchInDatabase)
41 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..))
42 import Gargantext.Core.Types.Individu (UserId, User(..))
43 import Gargantext.Prelude
44 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
45 import Gargantext.Text.Terms (TermType(..))
46 import Servant
47 import Servant.API.Flatten (Flat)
48 import Servant.Job.Core
49 import Servant.Job.Types
50 import Servant.Job.Utils (jsonOptions)
51 import Servant.Multipart
52 import Test.QuickCheck (elements)
53 import Test.QuickCheck.Arbitrary
54 import Web.FormUrlEncoded (FromForm)
55 import qualified Gargantext.Text.Corpus.API as API
56
57 data Query = Query { query_query :: Text
58 , query_corpus_id :: Int
59 , query_databases :: [API.ExternalAPIs]
60 }
61 deriving (Eq, Show, Generic)
62
63 deriveJSON (unPrefix "query_") 'Query
64
65 instance Arbitrary Query where
66 arbitrary = elements [ Query q n fs
67 | q <- ["a","b"]
68 , n <- [0..10]
69 , fs <- take 3 $ repeat API.externalAPIs
70 ]
71
72 instance ToSchema Query where
73 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
74
75 ------------------------------------------------------------------------
76
77 type Api = PostApi
78 :<|> GetApi
79
80 type PostApi = Summary "New Corpus endpoint"
81 :> ReqBody '[JSON] Query
82 :> Post '[JSON] CorpusId
83 type GetApi = Get '[JSON] ApiInfo
84
85 -- | TODO manage several apis
86 -- TODO-ACCESS
87 -- TODO this is only the POST
88 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
89 api uid (Query q _ as) = do
90 cId <- case head as of
91 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
92 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
93 Just a -> do
94 docs <- liftBase $ API.get a q (Just 1000)
95 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
96 pure cId'
97
98 pure cId
99
100 ------------------------------------------------
101 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
102 deriving (Generic)
103 instance Arbitrary ApiInfo where
104 arbitrary = ApiInfo <$> arbitrary
105
106 deriveJSON (unPrefix "") 'ApiInfo
107
108 instance ToSchema ApiInfo
109
110 info :: FlowCmdM env err m => UserId -> m ApiInfo
111 info _u = pure $ ApiInfo API.externalAPIs
112
113 ------------------------------------------------------------------------
114 ------------------------------------------------------------------------
115 data WithQuery = WithQuery
116 { _wq_query :: !Text
117 , _wq_databases :: ![ExternalAPIs]
118 , _wq_lang :: !(Maybe Lang)
119 }
120 deriving Generic
121
122 makeLenses ''WithQuery
123 instance FromJSON WithQuery where
124 parseJSON = genericParseJSON $ jsonOptions "_wq_"
125 instance ToSchema WithQuery where
126 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
127
128 -------------------------------------------------------
129 data WithForm = WithForm
130 { _wf_filetype :: !FileType
131 , _wf_data :: !Text
132 , _wf_lang :: !(Maybe Lang)
133 , _wf_name :: !Text
134 } deriving (Eq, Show, Generic)
135
136 makeLenses ''WithForm
137 instance FromForm WithForm
138 instance FromJSON WithForm where
139 parseJSON = genericParseJSON $ jsonOptions "_wf_"
140 instance ToSchema WithForm where
141 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
142
143 ------------------------------------------------------------------------
144 type AsyncJobs event ctI input output =
145 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
146 ------------------------------------------------------------------------
147
148 type Upload = Summary "Corpus Upload endpoint"
149 :> "corpus"
150 :> Capture "corpus_id" CorpusId
151 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
152 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
153
154 type AddWithQuery = Summary "Add with Query to corpus endpoint"
155 :> "corpus"
156 :> Capture "corpus_id" CorpusId
157 :> "add"
158 :> "query"
159 :> "async"
160 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
161
162 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
163 :> "corpus"
164 :> Capture "corpus_id" CorpusId
165 :> "add"
166 :> "file"
167 :> MultipartForm Mem (MultipartData Mem)
168 :> QueryParam "fileType" FileType
169 :> "async"
170 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
171
172 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
173 :> "corpus"
174 :> Capture "corpus_id" CorpusId
175 :> "add"
176 :> "form"
177 :> "async"
178 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
179
180 ------------------------------------------------------------------------
181 -- TODO WithQuery also has a corpus id
182 addToCorpusWithQuery :: FlowCmdM env err m
183 => User
184 -> CorpusId
185 -> WithQuery
186 -> (ScraperStatus -> m ())
187 -> m ScraperStatus
188 addToCorpusWithQuery u cid (WithQuery q _dbs l) logStatus = do
189 -- TODO ...
190 logStatus ScraperStatus { _scst_succeeded = Just 10
191 , _scst_failed = Just 2
192 , _scst_remaining = Just 138
193 , _scst_events = Just []
194 }
195 printDebug "addToCorpusWithQuery" cid
196 -- TODO add cid
197 -- TODO if cid is folder -> create Corpus
198 -- if cid is corpus -> add to corpus
199 -- if cid is root -> create corpus in Private
200 cids <- flowCorpusSearchInDatabase u (maybe EN identity l) q
201 printDebug "corpus id" cids
202 -- TODO ...
203 pure ScraperStatus { _scst_succeeded = Just 137
204 , _scst_failed = Just 13
205 , _scst_remaining = Just 0
206 , _scst_events = Just []
207 }
208
209 addToCorpusWithFile :: FlowCmdM env err m
210 => CorpusId
211 -> MultipartData Mem
212 -> Maybe FileType
213 -> (ScraperStatus -> m ())
214 -> m ScraperStatus
215 addToCorpusWithFile cid input filetype logStatus = do
216 logStatus ScraperStatus { _scst_succeeded = Just 10
217 , _scst_failed = Just 2
218 , _scst_remaining = Just 138
219 , _scst_events = Just []
220 }
221 printDebug "addToCorpusWithFile" cid
222 _h <- postUpload cid filetype input
223
224 pure ScraperStatus { _scst_succeeded = Just 137
225 , _scst_failed = Just 13
226 , _scst_remaining = Just 0
227 , _scst_events = Just []
228 }
229
230 {- | Model to fork the flow
231 -- This is not really optimized since it increases the need RAM
232 -- and freezes the whole system
233 -- This is mainly for documentation (see a better solution in the function below)
234 -- Each process has to be tailored
235 addToCorpusWithForm' :: FlowCmdM env err m
236 => CorpusId
237 -> WithForm
238 -> (ScraperStatus -> m ())
239 -> m ScraperStatus
240 addToCorpusWithForm' cid (WithForm ft d l) logStatus = do
241 newStatus <- liftBase newEmptyMVar
242 s <- addToCorpusWithForm cid (WithForm ft d l) logStatus
243 _ <- liftBase $ forkIO $ putMVar newStatus s
244 s' <- liftBase $ takeMVar newStatus
245 pure s'
246 -}
247 addToCorpusWithForm :: FlowCmdM env err m
248 => User
249 -> CorpusId
250 -> WithForm
251 -> (ScraperStatus -> m ())
252 -> m ScraperStatus
253 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
254
255 let
256 parse = case ft of
257 CSV_HAL -> Parser.parseFormat Parser.CsvHal
258 CSV -> Parser.parseFormat Parser.CsvGargV3
259 WOS -> Parser.parseFormat Parser.WOS
260 PresseRIS -> Parser.parseFormat Parser.RisPresse
261
262 logStatus ScraperStatus { _scst_succeeded = Just 1
263 , _scst_failed = Just 0
264 , _scst_remaining = Just 1
265 , _scst_events = Just []
266 }
267
268 printDebug "Parsing corpus: " cid
269
270 -- TODO granularity of the logStatus
271 docs <- liftBase $ splitEvery 500
272 <$> take 1000000
273 <$> parse (cs d)
274
275 printDebug "Parsing corpus finished : " cid
276 printDebug "Starting extraction : " cid
277
278 -- TODO granularity of the logStatus
279 _cid' <- flowCorpus user
280 (Right [cid])
281 (Multi $ fromMaybe EN l)
282 (map (map toHyperdataDocument) docs)
283
284 printDebug "Extraction finished : " cid
285
286 pure ScraperStatus { _scst_succeeded = Just 2
287 , _scst_failed = Just 0
288 , _scst_remaining = Just 0
289 , _scst_events = Just []
290 }
291