]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Corpus/New.hs
[DB] gargPool
[gargantext.git] / src / Gargantext / API / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
24
25 module Gargantext.API.Corpus.New
26 where
27
28 --import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat(..))
29 import Control.Lens hiding (elements)
30 import Control.Monad.IO.Class (liftIO)
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Maybe (fromMaybe)
34 import Data.Either
35 import Data.Swagger
36 import Data.Text (Text)
37 import GHC.Generics (Generic)
38 import Gargantext.API.Corpus.New.File
39 import Gargantext.API.Orchestrator.Types
40 import Gargantext.Core (Lang(..))
41 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
42 import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
43 import Gargantext.Database.Flow (flowCorpusSearchInDatabase)
44 import Gargantext.Database.Types.Node (CorpusId)
45 import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
46 import Gargantext.Database.Types.Node (UserId)
47 import Gargantext.Prelude
48 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
49 import Gargantext.Text.Terms (TermType(..))
50 import Servant
51 import Servant.API.Flatten (Flat)
52 import Servant.Job.Core
53 import Servant.Job.Types
54 import Servant.Job.Utils (jsonOptions)
55 import Servant.Multipart
56 import Test.QuickCheck (elements)
57 import Test.QuickCheck.Arbitrary
58 import Web.FormUrlEncoded (FromForm)
59 import qualified Gargantext.Text.Corpus.API as API
60
61 data Query = Query { query_query :: Text
62 , query_corpus_id :: Int
63 , query_databases :: [API.ExternalAPIs]
64 }
65 deriving (Eq, Show, Generic)
66
67 deriveJSON (unPrefix "query_") 'Query
68
69 instance Arbitrary Query where
70 arbitrary = elements [ Query q n fs
71 | q <- ["a","b"]
72 , n <- [0..10]
73 , fs <- take 3 $ repeat API.externalAPIs
74 ]
75
76 instance ToSchema Query where
77 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
78
79 ------------------------------------------------------------------------
80
81 type Api = PostApi
82 :<|> GetApi
83
84 type PostApi = Summary "New Corpus endpoint"
85 :> ReqBody '[JSON] Query
86 :> Post '[JSON] CorpusId
87 type GetApi = Get '[JSON] ApiInfo
88
89 -- | TODO manage several apis
90 -- TODO-ACCESS
91 -- TODO this is only the POST
92 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
93 api _uId (Query q _ as) = do
94 cId <- case head as of
95 Nothing -> flowCorpusSearchInDatabase "user1" EN q
96 Just API.All -> flowCorpusSearchInDatabase "user1" EN q
97 Just a -> do
98 docs <- liftIO $ API.get a q (Just 1000)
99 cId' <- flowCorpus "user1" (Left q) (Multi EN) [docs]
100 pure cId'
101
102 pure cId
103
104 ------------------------------------------------
105 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
106 deriving (Generic)
107 instance Arbitrary ApiInfo where
108 arbitrary = ApiInfo <$> arbitrary
109
110 deriveJSON (unPrefix "") 'ApiInfo
111
112 instance ToSchema ApiInfo
113
114 info :: FlowCmdM env err m => UserId -> m ApiInfo
115 info _u = pure $ ApiInfo API.externalAPIs
116
117 ------------------------------------------------------------------------
118 ------------------------------------------------------------------------
119 data WithQuery = WithQuery
120 { _wq_query :: !Text
121 , _wq_databases :: ![ExternalAPIs]
122 , _wq_lang :: !(Maybe Lang)
123 }
124 deriving Generic
125
126 makeLenses ''WithQuery
127 instance FromJSON WithQuery where
128 parseJSON = genericParseJSON $ jsonOptions "_wq_"
129 instance ToSchema WithQuery where
130 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
131
132 -------------------------------------------------------
133 data WithForm = WithForm
134 { _wf_filetype :: !FileType
135 , _wf_data :: !Text
136 , _wf_lang :: !(Maybe Lang)
137 , _wf_name :: !Text
138 } deriving (Eq, Show, Generic)
139
140 makeLenses ''WithForm
141 instance FromForm WithForm
142 instance FromJSON WithForm where
143 parseJSON = genericParseJSON $ jsonOptions "_wf_"
144 instance ToSchema WithForm where
145 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
146
147 ------------------------------------------------------------------------
148 type AsyncJobs event ctI input output =
149 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
150 ------------------------------------------------------------------------
151
152 type Upload = Summary "Corpus Upload endpoint"
153 :> "corpus"
154 :> Capture "corpus_id" CorpusId
155 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
156 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
157
158 type AddWithQuery = Summary "Add with Query to corpus endpoint"
159 :> "corpus"
160 :> Capture "corpus_id" CorpusId
161 :> "add"
162 :> "query"
163 :> "async"
164 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
165
166 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
167 :> "corpus"
168 :> Capture "corpus_id" CorpusId
169 :> "add"
170 :> "file"
171 :> MultipartForm Mem (MultipartData Mem)
172 :> QueryParam "fileType" FileType
173 :> "async"
174 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
175
176 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
177 :> "corpus"
178 :> Capture "corpus_id" CorpusId
179 :> "add"
180 :> "form"
181 :> "async"
182 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
183
184 ------------------------------------------------------------------------
185 -- TODO WithQuery also has a corpus id
186 addToCorpusJobFunction :: FlowCmdM env err m
187 => CorpusId
188 -> WithQuery
189 -> (ScraperStatus -> m ())
190 -> m ScraperStatus
191 addToCorpusJobFunction _cid (WithQuery _q _dbs _l) logStatus = do
192 -- TODO ...
193 logStatus ScraperStatus { _scst_succeeded = Just 10
194 , _scst_failed = Just 2
195 , _scst_remaining = Just 138
196 , _scst_events = Just []
197 }
198 -- TODO ...
199 pure ScraperStatus { _scst_succeeded = Just 137
200 , _scst_failed = Just 13
201 , _scst_remaining = Just 0
202 , _scst_events = Just []
203 }
204
205 addToCorpusWithFile :: FlowCmdM env err m
206 => CorpusId
207 -> MultipartData Mem
208 -> Maybe FileType
209 -> (ScraperStatus -> m ())
210 -> m ScraperStatus
211 addToCorpusWithFile cid input filetype logStatus = do
212 logStatus ScraperStatus { _scst_succeeded = Just 10
213 , _scst_failed = Just 2
214 , _scst_remaining = Just 138
215 , _scst_events = Just []
216 }
217 printDebug "addToCorpusWithFile" cid
218 _h <- postUpload cid filetype input
219
220 pure ScraperStatus { _scst_succeeded = Just 137
221 , _scst_failed = Just 13
222 , _scst_remaining = Just 0
223 , _scst_events = Just []
224 }
225
226 {- | Model to fork the flow
227 -- This is not really optimized since it increases the need RAM
228 -- and freezes the whole system
229 -- This is mainly for documentation (see a better solution in the function below)
230 -- Each process has to be tailored
231 addToCorpusWithForm' :: FlowCmdM env err m
232 => CorpusId
233 -> WithForm
234 -> (ScraperStatus -> m ())
235 -> m ScraperStatus
236 addToCorpusWithForm' cid (WithForm ft d l) logStatus = do
237 newStatus <- liftIO newEmptyMVar
238 s <- addToCorpusWithForm cid (WithForm ft d l) logStatus
239 _ <- liftIO $ forkIO $ putMVar newStatus s
240 s' <- liftIO $ takeMVar newStatus
241 pure s'
242 -}
243 addToCorpusWithForm :: FlowCmdM env err m
244 => Text
245 -> CorpusId
246 -> WithForm
247 -> (ScraperStatus -> m ())
248 -> m ScraperStatus
249 addToCorpusWithForm username cid (WithForm ft d l _n) logStatus = do
250
251 let
252 parse = case ft of
253 CSV_HAL -> Parser.parseFormat Parser.CsvHal
254 CSV -> Parser.parseFormat Parser.CsvGargV3
255 WOS -> Parser.parseFormat Parser.WOS
256 PresseRIS -> Parser.parseFormat Parser.RisPresse
257
258 logStatus ScraperStatus { _scst_succeeded = Just 1
259 , _scst_failed = Just 0
260 , _scst_remaining = Just 1
261 , _scst_events = Just []
262 }
263
264 printDebug "Parsing corpus: " cid
265
266 -- TODO granularity of the logStatus
267 docs <- liftIO $ splitEvery 500
268 <$> take 1000000
269 <$> parse (cs d)
270
271 printDebug "Parsing corpus finished : " cid
272 printDebug "Starting extraction : " cid
273
274 -- TODO granularity of the logStatus
275 _cid' <- flowCorpus username
276 (Right [cid])
277 (Multi $ fromMaybe EN l)
278 (map (map toHyperdataDocument) docs)
279
280 printDebug "Extraction finished : " cid
281
282 pure ScraperStatus { _scst_succeeded = Just 2
283 , _scst_failed = Just 0
284 , _scst_remaining = Just 0
285 , _scst_events = Just []
286 }
287