]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Corpus/New.hs
[docker] update image, add README info
[gargantext.git] / src / Gargantext / API / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
24
25 module Gargantext.API.Corpus.New
26 where
27
28 --import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat(..))
29 import Control.Lens hiding (elements)
30 import Control.Monad.IO.Class (liftIO)
31 import Control.Concurrent
32 import Data.Aeson
33 import Data.Aeson.TH (deriveJSON)
34 import Data.Maybe (fromMaybe)
35 import Data.Either
36 import Data.Swagger
37 import Data.Text (Text)
38 import GHC.Generics (Generic)
39 import Gargantext.API.Corpus.New.File
40 import Gargantext.API.Orchestrator.Types
41 import Gargantext.Core (Lang(..))
42 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
43 import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
44 import Gargantext.Database.Flow (flowCorpusSearchInDatabase)
45 import Gargantext.Database.Types.Node (CorpusId)
46 import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
47 import Gargantext.Database.Types.Node (UserId)
48 import Gargantext.Prelude
49 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
50 import Gargantext.Text.Terms (TermType(..))
51 import Servant
52 import Servant.API.Flatten (Flat)
53 import Servant.Job.Core
54 import Servant.Job.Types
55 import Servant.Job.Utils (jsonOptions)
56 import Servant.Multipart
57 import Test.QuickCheck (elements)
58 import Test.QuickCheck.Arbitrary
59 import Web.FormUrlEncoded (FromForm)
60 import qualified Gargantext.Text.Corpus.API as API
61
62 data Query = Query { query_query :: Text
63 , query_corpus_id :: Int
64 , query_databases :: [API.ExternalAPIs]
65 }
66 deriving (Eq, Show, Generic)
67
68 deriveJSON (unPrefix "query_") 'Query
69
70
71 instance Arbitrary Query where
72 arbitrary = elements [ Query q n fs
73 | q <- ["a","b"]
74 , n <- [0..10]
75 , fs <- take 3 $ repeat API.externalAPIs
76 ]
77
78 instance ToSchema Query where
79 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
80
81 ------------------------------------------------------------------------
82
83 type Api = PostApi
84 :<|> GetApi
85
86 type PostApi = Summary "New Corpus endpoint"
87 :> ReqBody '[JSON] Query
88 :> Post '[JSON] CorpusId
89 type GetApi = Get '[JSON] ApiInfo
90
91 -- | TODO manage several apis
92 -- TODO-ACCESS
93 -- TODO this is only the POST
94 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
95 api _uId (Query q _ as) = do
96 cId <- case head as of
97 Nothing -> flowCorpusSearchInDatabase "user1" EN q
98 Just API.All -> flowCorpusSearchInDatabase "user1" EN q
99 Just a -> do
100 docs <- liftIO $ API.get a q (Just 1000)
101 cId' <- flowCorpus "user1" (Left q) (Multi EN) [docs]
102 pure cId'
103
104 pure cId
105
106 ------------------------------------------------
107 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
108 deriving (Generic)
109 instance Arbitrary ApiInfo where
110 arbitrary = ApiInfo <$> arbitrary
111
112 deriveJSON (unPrefix "") 'ApiInfo
113
114 instance ToSchema ApiInfo
115
116 info :: FlowCmdM env err m => UserId -> m ApiInfo
117 info _u = pure $ ApiInfo API.externalAPIs
118
119 ------------------------------------------------------------------------
120 ------------------------------------------------------------------------
121 data WithQuery = WithQuery
122 { _wq_query :: !Text
123 , _wq_databases :: ![ExternalAPIs]
124 , _wq_lang :: !(Maybe Lang)
125 }
126 deriving Generic
127
128 makeLenses ''WithQuery
129 instance FromJSON WithQuery where
130 parseJSON = genericParseJSON $ jsonOptions "_wq_"
131 instance ToSchema WithQuery where
132 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
133
134 -------------------------------------------------------
135 data WithForm = WithForm
136 { _wf_filetype :: !FileType
137 , _wf_data :: !Text
138 , _wf_lang :: !(Maybe Lang)
139 , _wf_name :: !Text
140 } deriving (Eq, Show, Generic)
141
142 makeLenses ''WithForm
143 instance FromForm WithForm
144 instance FromJSON WithForm where
145 parseJSON = genericParseJSON $ jsonOptions "_wf_"
146 instance ToSchema WithForm where
147 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
148
149 ------------------------------------------------------------------------
150 type AsyncJobs event ctI input output =
151 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
152 ------------------------------------------------------------------------
153
154 type Upload = Summary "Corpus Upload endpoint"
155 :> "corpus"
156 :> Capture "corpus_id" CorpusId
157 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
158 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
159
160
161 type AddWithQuery = Summary "Add with Query to corpus endpoint"
162 :> "corpus"
163 :> Capture "corpus_id" CorpusId
164 :> "add"
165 :> "query"
166 :> "async"
167 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
168
169 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
170 :> "corpus"
171 :> Capture "corpus_id" CorpusId
172 :> "add"
173 :> "file"
174 :> MultipartForm Mem (MultipartData Mem)
175 :> QueryParam "fileType" FileType
176 :> "async"
177 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
178
179 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
180 :> "corpus"
181 :> Capture "corpus_id" CorpusId
182 :> "add"
183 :> "form"
184 :> "async"
185 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
186
187 ------------------------------------------------------------------------
188 -- TODO WithQuery also has a corpus id
189 addToCorpusJobFunction :: FlowCmdM env err m
190 => CorpusId
191 -> WithQuery
192 -> (ScraperStatus -> m ())
193 -> m ScraperStatus
194 addToCorpusJobFunction _cid (WithQuery _q _dbs _l) logStatus = do
195 -- TODO ...
196 logStatus ScraperStatus { _scst_succeeded = Just 10
197 , _scst_failed = Just 2
198 , _scst_remaining = Just 138
199 , _scst_events = Just []
200 }
201 -- TODO ...
202 pure ScraperStatus { _scst_succeeded = Just 137
203 , _scst_failed = Just 13
204 , _scst_remaining = Just 0
205 , _scst_events = Just []
206 }
207
208
209 addToCorpusWithFile :: FlowCmdM env err m
210 => CorpusId
211 -> MultipartData Mem
212 -> Maybe FileType
213 -> (ScraperStatus -> m ())
214 -> m ScraperStatus
215 addToCorpusWithFile cid input filetype logStatus = do
216 logStatus ScraperStatus { _scst_succeeded = Just 10
217 , _scst_failed = Just 2
218 , _scst_remaining = Just 138
219 , _scst_events = Just []
220 }
221 _h <- postUpload cid filetype input
222
223 pure ScraperStatus { _scst_succeeded = Just 137
224 , _scst_failed = Just 13
225 , _scst_remaining = Just 0
226 , _scst_events = Just []
227 }
228
229 {- | Model to fork the flow
230 -- This is not really optimized since it increases the need RAM
231 -- and freezes the whole system
232 -- This is mainly for documentation (see a better solution in the function below)
233 -- Each process has to be tailored
234 addToCorpusWithForm' :: FlowCmdM env err m
235 => CorpusId
236 -> WithForm
237 -> (ScraperStatus -> m ())
238 -> m ScraperStatus
239 addToCorpusWithForm' cid (WithForm ft d l) logStatus = do
240 newStatus <- liftIO newEmptyMVar
241 s <- addToCorpusWithForm cid (WithForm ft d l) logStatus
242 _ <- liftIO $ forkIO $ putMVar newStatus s
243 s' <- liftIO $ takeMVar newStatus
244 pure s'
245 -}
246 addToCorpusWithForm :: FlowCmdM env err m
247 => CorpusId
248 -> WithForm
249 -> (ScraperStatus -> m ())
250 -> m ScraperStatus
251 addToCorpusWithForm cid (WithForm ft d l _n) logStatus = do
252
253 printDebug "ft" ft
254
255 let
256 parse = case ft of
257 CSV_HAL -> Parser.parseFormat Parser.CsvHal
258 CSV -> Parser.parseFormat Parser.CsvGargV3
259 WOS -> Parser.parseFormat Parser.WOS
260 PresseRIS -> Parser.parseFormat Parser.RisPresse
261
262 newDocs <- liftIO newEmptyMVar
263 docs <- liftIO
264 $ splitEvery 500
265 <$> take 1000000
266 <$> parse (cs d)
267 _ <- liftIO $ forkIO $ putMVar newDocs docs
268
269 logStatus ScraperStatus { _scst_succeeded = Just 1
270 , _scst_failed = Just 0
271 , _scst_remaining = Just 1
272 , _scst_events = Just []
273 }
274
275 docs' <- liftIO $ takeMVar newDocs
276 newCid <- liftIO newEmptyMVar
277 cid' <- flowCorpus "user1"
278 (Right [cid])
279 (Multi $ fromMaybe EN l)
280 (map (map toHyperdataDocument) docs')
281 _ <- liftIO $ forkIO $ putMVar newCid cid'
282
283 cid'' <- liftIO $ takeMVar newCid
284 printDebug "cid'" cid''
285
286 pure ScraperStatus { _scst_succeeded = Just 2
287 , _scst_failed = Just 0
288 , _scst_remaining = Just 0
289 , _scst_events = Just []
290 }
291