]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Corpus/New.hs
Merge branch 'dev' into dev-dashoard-charts
[gargantext.git] / src / Gargantext / API / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23
24 module Gargantext.API.Corpus.New
25 where
26
27 --import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat(..))
28 import Control.Lens hiding (elements)
29 import Control.Monad.IO.Class (liftIO)
30 import Data.Aeson
31 import Data.Aeson.TH (deriveJSON)
32 import Data.Maybe (fromMaybe)
33 import Data.Either
34 import Data.Swagger
35 import Data.Text (Text)
36 import GHC.Generics (Generic)
37 import Gargantext.API.Corpus.New.File
38 import Gargantext.API.Orchestrator.Types
39 import Gargantext.Core (Lang(..))
40 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
41 import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
42 import Gargantext.Database.Flow (flowCorpusSearchInDatabase)
43 import Gargantext.Database.Types.Node (CorpusId)
44 import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
45 import Gargantext.Database.Types.Node (UserId)
46 import Gargantext.Prelude
47 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
48 import Gargantext.Text.Terms (TermType(..))
49 import Servant
50 import Servant.API.Flatten (Flat)
51 import Servant.Job.Core
52 import Servant.Job.Types
53 import Servant.Job.Utils (jsonOptions)
54 import Servant.Multipart
55 import Test.QuickCheck (elements)
56 import Test.QuickCheck.Arbitrary
57 import Web.FormUrlEncoded (FromForm)
58 import qualified Gargantext.Text.Corpus.API as API
59
60 data Query = Query { query_query :: Text
61 , query_corpus_id :: Int
62 , query_databases :: [API.ExternalAPIs]
63 }
64 deriving (Eq, Show, Generic)
65
66 deriveJSON (unPrefix "query_") 'Query
67
68
69 instance Arbitrary Query where
70 arbitrary = elements [ Query q n fs
71 | q <- ["a","b"]
72 , n <- [0..10]
73 , fs <- take 3 $ repeat API.externalAPIs
74 ]
75
76 instance ToSchema Query where
77 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
78
79 type Api = Summary "New Corpus endpoint"
80 :> ReqBody '[JSON] Query
81 :> Post '[JSON] CorpusId
82 :<|> Get '[JSON] ApiInfo
83
84 -- | TODO manage several apis
85 -- TODO-ACCESS
86 -- TODO this is only the POST
87 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
88 api _uId (Query q _ as) = do
89 cId <- case head as of
90 Nothing -> flowCorpusSearchInDatabase "user1" EN q
91 Just API.All -> flowCorpusSearchInDatabase "user1" EN q
92 Just a -> do
93 docs <- liftIO $ API.get a q (Just 1000)
94 cId' <- flowCorpus "user1" (Left q) (Multi EN) [docs]
95 pure cId'
96
97 pure cId
98
99 ------------------------------------------------
100 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
101 deriving (Generic)
102 instance Arbitrary ApiInfo where
103 arbitrary = ApiInfo <$> arbitrary
104
105 deriveJSON (unPrefix "") 'ApiInfo
106
107 instance ToSchema ApiInfo
108
109 info :: FlowCmdM env err m => UserId -> m ApiInfo
110 info _u = pure $ ApiInfo API.externalAPIs
111
112 ------------------------------------------------------------------------
113 ------------------------------------------------------------------------
114 data WithQuery = WithQuery
115 { _wq_query :: !Text
116 , _wq_databases :: ![ExternalAPIs]
117 , _wq_lang :: !(Maybe Lang)
118 }
119 deriving Generic
120
121 makeLenses ''WithQuery
122 instance FromJSON WithQuery where
123 parseJSON = genericParseJSON $ jsonOptions "_wq_"
124 instance ToSchema WithQuery where
125 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
126
127 -------------------------------------------------------
128 data WithForm = WithForm
129 { _wf_filetype :: !FileType
130 , _wf_data :: !Text
131 , _wf_lang :: !(Maybe Lang)
132 } deriving (Eq, Show, Generic)
133
134 makeLenses ''WithForm
135 instance FromForm WithForm
136 instance FromJSON WithForm where
137 parseJSON = genericParseJSON $ jsonOptions "_wf_"
138 instance ToSchema WithForm where
139 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
140
141 ------------------------------------------------------------------------
142 type AsyncJobs event ctI input output =
143 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
144 ------------------------------------------------------------------------
145
146 type Upload = Summary "Corpus Upload endpoint"
147 :> "corpus"
148 :> Capture "corpus_id" CorpusId
149 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
150 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
151
152
153 type AddWithQuery = Summary "Add with Query to corpus endpoint"
154 :> "corpus"
155 :> Capture "corpus_id" CorpusId
156 :> "add"
157 :> "query"
158 :> "async"
159 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
160
161 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
162 :> "corpus"
163 :> Capture "corpus_id" CorpusId
164 :> "add"
165 :> "file"
166 :> MultipartForm Mem (MultipartData Mem)
167 :> QueryParam "fileType" FileType
168 :> "async"
169 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
170
171 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
172 :> "corpus"
173 :> Capture "corpus_id" CorpusId
174 :> "add"
175 :> "form"
176 :> "async"
177 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
178
179 ------------------------------------------------------------------------
180 -- TODO WithQuery also has a corpus id
181 addToCorpusJobFunction :: FlowCmdM env err m
182 => CorpusId
183 -> WithQuery
184 -> (ScraperStatus -> m ())
185 -> m ScraperStatus
186 addToCorpusJobFunction _cid (WithQuery _q _dbs _l) logStatus = do
187 -- TODO ...
188 logStatus ScraperStatus { _scst_succeeded = Just 10
189 , _scst_failed = Just 2
190 , _scst_remaining = Just 138
191 , _scst_events = Just []
192 }
193 -- TODO ...
194 pure ScraperStatus { _scst_succeeded = Just 137
195 , _scst_failed = Just 13
196 , _scst_remaining = Just 0
197 , _scst_events = Just []
198 }
199
200
201 addToCorpusWithFile :: FlowCmdM env err m
202 => CorpusId
203 -> MultipartData Mem
204 -> Maybe FileType
205 -> (ScraperStatus -> m ())
206 -> m ScraperStatus
207 addToCorpusWithFile cid input filetype logStatus = do
208 logStatus ScraperStatus { _scst_succeeded = Just 10
209 , _scst_failed = Just 2
210 , _scst_remaining = Just 138
211 , _scst_events = Just []
212 }
213 _h <- postUpload cid filetype input
214
215 pure ScraperStatus { _scst_succeeded = Just 137
216 , _scst_failed = Just 13
217 , _scst_remaining = Just 0
218 , _scst_events = Just []
219 }
220
221 addToCorpusWithForm :: FlowCmdM env err m
222 => CorpusId
223 -> WithForm
224 -> (ScraperStatus -> m ())
225 -> m ScraperStatus
226 addToCorpusWithForm cid (WithForm ft d l) logStatus = do
227
228 printDebug "ft" ft
229
230 let
231 parse = case ft of
232 CSV_HAL -> Parser.parseFormat Parser.CsvHal
233 CSV -> Parser.parseFormat Parser.CsvGargV3
234 WOS -> Parser.parseFormat Parser.WOS
235 PresseRIS -> Parser.parseFormat Parser.RisPresse
236
237 docs <- liftIO
238 $ splitEvery 500
239 <$> take 1000000
240 <$> parse (cs d)
241
242 logStatus ScraperStatus { _scst_succeeded = Just 1
243 , _scst_failed = Just 0
244 , _scst_remaining = Just 1
245 , _scst_events = Just []
246 }
247 cid' <- flowCorpus "user1"
248 (Right [cid])
249 (Multi $ fromMaybe EN l)
250 (map (map toHyperdataDocument) docs)
251
252 printDebug "cid'" cid'
253
254 pure ScraperStatus { _scst_succeeded = Just 2
255 , _scst_failed = Just 0
256 , _scst_remaining = Just 0
257 , _scst_events = Just []
258 }
259