]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Corpus/New.hs
[WIP] Starting query Garg for a Hello Word.
[gargantext.git] / src / Gargantext / API / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE DeriveGeneric #-}
18 {-# LANGUAGE DataKinds #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE FlexibleContexts #-}
22 {-# LANGUAGE RankNTypes #-}
23 {-# OPTIONS_GHC -fno-warn-orphans #-}
24
25 module Gargantext.API.Corpus.New
26 where
27
28 --import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat(..))
29 import Control.Lens hiding (elements)
30 import Data.Aeson
31 import Data.Aeson.TH (deriveJSON)
32 import Data.Maybe (fromMaybe)
33 import Data.Either
34 import Data.Swagger
35 import Data.Text (Text)
36 import GHC.Generics (Generic)
37 import Gargantext.API.Corpus.New.File
38 import Gargantext.API.Orchestrator.Types
39 import Gargantext.Core (Lang(..))
40 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
41 import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
42 import Gargantext.Database.Flow (flowCorpusSearchInDatabase)
43 import Gargantext.Database.Types.Node (CorpusId)
44 import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
45 import Gargantext.Core.Types.Individu (UserId, User(..))
46 import Gargantext.Prelude
47 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
48 import Gargantext.Text.Terms (TermType(..))
49 import Servant
50 import Servant.API.Flatten (Flat)
51 import Servant.Job.Core
52 import Servant.Job.Types
53 import Servant.Job.Utils (jsonOptions)
54 import Servant.Multipart
55 import Test.QuickCheck (elements)
56 import Test.QuickCheck.Arbitrary
57 import Web.FormUrlEncoded (FromForm)
58 import qualified Gargantext.Text.Corpus.API as API
59
60 data Query = Query { query_query :: Text
61 , query_corpus_id :: Int
62 , query_databases :: [API.ExternalAPIs]
63 }
64 deriving (Eq, Show, Generic)
65
66 deriveJSON (unPrefix "query_") 'Query
67
68 instance Arbitrary Query where
69 arbitrary = elements [ Query q n fs
70 | q <- ["a","b"]
71 , n <- [0..10]
72 , fs <- take 3 $ repeat API.externalAPIs
73 ]
74
75 instance ToSchema Query where
76 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
77
78 ------------------------------------------------------------------------
79
80 type Api = PostApi
81 :<|> GetApi
82
83 type PostApi = Summary "New Corpus endpoint"
84 :> ReqBody '[JSON] Query
85 :> Post '[JSON] CorpusId
86 type GetApi = Get '[JSON] ApiInfo
87
88 -- | TODO manage several apis
89 -- TODO-ACCESS
90 -- TODO this is only the POST
91 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
92 api uid (Query q _ as) = do
93 cId <- case head as of
94 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
95 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
96 Just a -> do
97 docs <- liftBase $ API.get a q (Just 1000)
98 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
99 pure cId'
100
101 pure cId
102
103 ------------------------------------------------
104 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
105 deriving (Generic)
106 instance Arbitrary ApiInfo where
107 arbitrary = ApiInfo <$> arbitrary
108
109 deriveJSON (unPrefix "") 'ApiInfo
110
111 instance ToSchema ApiInfo
112
113 info :: FlowCmdM env err m => UserId -> m ApiInfo
114 info _u = pure $ ApiInfo API.externalAPIs
115
116 ------------------------------------------------------------------------
117 ------------------------------------------------------------------------
118 data WithQuery = WithQuery
119 { _wq_query :: !Text
120 , _wq_databases :: ![ExternalAPIs]
121 , _wq_lang :: !(Maybe Lang)
122 }
123 deriving Generic
124
125 makeLenses ''WithQuery
126 instance FromJSON WithQuery where
127 parseJSON = genericParseJSON $ jsonOptions "_wq_"
128 instance ToSchema WithQuery where
129 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
130
131 -------------------------------------------------------
132 data WithForm = WithForm
133 { _wf_filetype :: !FileType
134 , _wf_data :: !Text
135 , _wf_lang :: !(Maybe Lang)
136 , _wf_name :: !Text
137 } deriving (Eq, Show, Generic)
138
139 makeLenses ''WithForm
140 instance FromForm WithForm
141 instance FromJSON WithForm where
142 parseJSON = genericParseJSON $ jsonOptions "_wf_"
143 instance ToSchema WithForm where
144 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
145
146 ------------------------------------------------------------------------
147 type AsyncJobs event ctI input output =
148 Flat (AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output)
149 ------------------------------------------------------------------------
150
151 type Upload = Summary "Corpus Upload endpoint"
152 :> "corpus"
153 :> Capture "corpus_id" CorpusId
154 :<|> "addWithquery" :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
155 :<|> "addWithfile" :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
156
157 type AddWithQuery = Summary "Add with Query to corpus endpoint"
158 :> "corpus"
159 :> Capture "corpus_id" CorpusId
160 :> "add"
161 :> "query"
162 :> "async"
163 :> AsyncJobsAPI ScraperStatus WithQuery ScraperStatus
164
165 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
166 :> "corpus"
167 :> Capture "corpus_id" CorpusId
168 :> "add"
169 :> "file"
170 :> MultipartForm Mem (MultipartData Mem)
171 :> QueryParam "fileType" FileType
172 :> "async"
173 :> AsyncJobs ScraperStatus '[JSON] () ScraperStatus
174
175 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
176 :> "corpus"
177 :> Capture "corpus_id" CorpusId
178 :> "add"
179 :> "form"
180 :> "async"
181 :> AsyncJobs ScraperStatus '[FormUrlEncoded] WithForm ScraperStatus
182
183 ------------------------------------------------------------------------
184 -- TODO WithQuery also has a corpus id
185 addToCorpusWithQuery :: FlowCmdM env err m
186 => User
187 -> CorpusId
188 -> WithQuery
189 -> (ScraperStatus -> m ())
190 -> m ScraperStatus
191 addToCorpusWithQuery u cid (WithQuery q _dbs l) logStatus = do
192 -- TODO ...
193 logStatus ScraperStatus { _scst_succeeded = Just 10
194 , _scst_failed = Just 2
195 , _scst_remaining = Just 138
196 , _scst_events = Just []
197 }
198 printDebug "addToCorpusWithQuery" cid
199 -- TODO add cid
200 -- TODO if cid is folder -> create Corpus
201 -- if cid is corpus -> add to corpus
202 -- if cid is root -> create corpus in Private
203 cids <- flowCorpusSearchInDatabase u (maybe EN identity l) q
204 printDebug "corpus id" cids
205 -- TODO ...
206 pure ScraperStatus { _scst_succeeded = Just 137
207 , _scst_failed = Just 13
208 , _scst_remaining = Just 0
209 , _scst_events = Just []
210 }
211
212 addToCorpusWithFile :: FlowCmdM env err m
213 => CorpusId
214 -> MultipartData Mem
215 -> Maybe FileType
216 -> (ScraperStatus -> m ())
217 -> m ScraperStatus
218 addToCorpusWithFile cid input filetype logStatus = do
219 logStatus ScraperStatus { _scst_succeeded = Just 10
220 , _scst_failed = Just 2
221 , _scst_remaining = Just 138
222 , _scst_events = Just []
223 }
224 printDebug "addToCorpusWithFile" cid
225 _h <- postUpload cid filetype input
226
227 pure ScraperStatus { _scst_succeeded = Just 137
228 , _scst_failed = Just 13
229 , _scst_remaining = Just 0
230 , _scst_events = Just []
231 }
232
233 {- | Model to fork the flow
234 -- This is not really optimized since it increases the need RAM
235 -- and freezes the whole system
236 -- This is mainly for documentation (see a better solution in the function below)
237 -- Each process has to be tailored
238 addToCorpusWithForm' :: FlowCmdM env err m
239 => CorpusId
240 -> WithForm
241 -> (ScraperStatus -> m ())
242 -> m ScraperStatus
243 addToCorpusWithForm' cid (WithForm ft d l) logStatus = do
244 newStatus <- liftBase newEmptyMVar
245 s <- addToCorpusWithForm cid (WithForm ft d l) logStatus
246 _ <- liftBase $ forkIO $ putMVar newStatus s
247 s' <- liftBase $ takeMVar newStatus
248 pure s'
249 -}
250 addToCorpusWithForm :: FlowCmdM env err m
251 => User
252 -> CorpusId
253 -> WithForm
254 -> (ScraperStatus -> m ())
255 -> m ScraperStatus
256 addToCorpusWithForm user cid (WithForm ft d l _n) logStatus = do
257
258 let
259 parse = case ft of
260 CSV_HAL -> Parser.parseFormat Parser.CsvHal
261 CSV -> Parser.parseFormat Parser.CsvGargV3
262 WOS -> Parser.parseFormat Parser.WOS
263 PresseRIS -> Parser.parseFormat Parser.RisPresse
264
265 logStatus ScraperStatus { _scst_succeeded = Just 1
266 , _scst_failed = Just 0
267 , _scst_remaining = Just 1
268 , _scst_events = Just []
269 }
270
271 printDebug "Parsing corpus: " cid
272
273 -- TODO granularity of the logStatus
274 docs <- liftBase $ splitEvery 500
275 <$> take 1000000
276 <$> parse (cs d)
277
278 printDebug "Parsing corpus finished : " cid
279 printDebug "Starting extraction : " cid
280
281 -- TODO granularity of the logStatus
282 _cid' <- flowCorpus user
283 (Right [cid])
284 (Multi $ fromMaybe EN l)
285 (map (map toHyperdataDocument) docs)
286
287 printDebug "Extraction finished : " cid
288
289 pure ScraperStatus { _scst_succeeded = Just 2
290 , _scst_failed = Just 0
291 , _scst_remaining = Just 0
292 , _scst_events = Just []
293 }
294