2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
21 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson.TH (deriveJSON)
25 import Data.Maybe (fromMaybe)
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
31 import Servant.Job.Utils (jsonOptions)
32 -- import Servant.Multipart
33 -- import Test.QuickCheck (elements)
34 import Test.QuickCheck.Arbitrary
36 import Gargantext.Prelude
38 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
39 import qualified Gargantext.API.Admin.Orchestrator.Types as T
40 import Gargantext.API.Admin.Types (HasSettings)
41 import Gargantext.API.Node.Corpus.New.File
42 import Gargantext.API.Node.Types
43 import Gargantext.Core (Lang(..){-, allLangs-})
44 import Gargantext.Database.Action.Mail (sendMail)
45 import Gargantext.Core.Types.Individu (User(..))
46 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
47 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
48 import Gargantext.Database.Action.User (getUserId)
49 import Gargantext.Database.Action.Node (mkNodeWithParent)
50 import Gargantext.Database.Admin.Types.Hyperdata
51 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
52 import Gargantext.Database.Query.Table.Node (getNodeWith)
53 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
54 import Gargantext.Database.Schema.Node (node_hyperdata)
55 import qualified Gargantext.Prelude.Utils as GPU
56 import qualified Gargantext.Core.Text.Corpus.API as API
57 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
59 ------------------------------------------------------------------------
61 data Query = Query { query_query :: Text
62 , query_node_id :: Int
64 , query_databases :: [DataOrigin]
66 deriving (Eq, Generic)
68 deriveJSON (unPrefix "query_") 'Query
70 instance Arbitrary Query where
71 arbitrary = elements [ Query q n la fs
72 | q <- ["honeybee* AND collapse"
77 , fs <- take 3 $ repeat allDataOrigins
80 instance ToSchema Query where
81 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
84 ------------------------------------------------------------------------
90 type PostApi = Summary "New Corpus endpoint"
91 :> ReqBody '[JSON] Query
92 :> Post '[JSON] CorpusId
93 type GetApi = Get '[JSON] ApiInfo
96 -- | TODO manage several apis
98 -- TODO this is only the POST
100 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
101 api uid (Query q _ as) = do
102 cId <- case head as of
103 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
104 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
106 docs <- liftBase $ API.get a q (Just 1000)
107 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
113 ------------------------------------------------
114 -- TODO use this route for Client implementation
115 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
117 instance Arbitrary ApiInfo where
118 arbitrary = ApiInfo <$> arbitrary
120 deriveJSON (unPrefix "") 'ApiInfo
122 instance ToSchema ApiInfo
124 info :: FlowCmdM env err m => UserId -> m ApiInfo
125 info _u = pure $ ApiInfo API.externalAPIs
127 ------------------------------------------------------------------------
129 data Database = Empty
134 deriving (Eq, Show, Generic)
136 deriveJSON (unPrefix "") ''Database
137 instance ToSchema Database
139 database2origin :: Database -> DataOrigin
140 database2origin Empty = InternalOrigin T.IsTex
141 database2origin PubMed = ExternalOrigin T.PubMed
142 database2origin HAL = ExternalOrigin T.HAL
143 database2origin IsTex = ExternalOrigin T.IsTex
144 database2origin Isidore = ExternalOrigin T.Isidore
146 ------------------------------------------------------------------------
147 data WithQuery = WithQuery
149 , _wq_databases :: !Database
151 , _wq_node_id :: !Int
155 makeLenses ''WithQuery
156 instance FromJSON WithQuery where
157 parseJSON = genericParseJSON $ jsonOptions "_wq_"
158 instance ToSchema WithQuery where
159 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
161 ------------------------------------------------------------------------
163 type AddWithQuery = Summary "Add with Query to corpus endpoint"
165 :> Capture "corpus_id" CorpusId
167 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
170 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
172 :> Capture "corpus_id" CorpusId
175 :> MultipartForm Mem (MultipartData Mem)
176 :> QueryParam "fileType" FileType
178 :> AsyncJobs JobLog '[JSON] () JobLog
182 ------------------------------------------------------------------------
183 -- TODO WithQuery also has a corpus id
184 addToCorpusWithQuery :: FlowCmdM env err m
191 addToCorpusWithQuery user cid (WithQuery q dbs l _nid) maybeLimit logStatus = do
193 logStatus JobLog { _scst_succeeded = Just 0
194 , _scst_failed = Just 0
195 , _scst_remaining = Just 5
196 , _scst_events = Just []
198 printDebug "addToCorpusWithQuery" (cid, dbs)
200 -- TODO if cid is folder -> create Corpus
201 -- if cid is corpus -> add to corpus
202 -- if cid is root -> create corpus in Private
203 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
205 logStatus JobLog { _scst_succeeded = Just 2
206 , _scst_failed = Just 0
207 , _scst_remaining = Just 1
208 , _scst_events = Just []
211 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
212 printDebug "corpus id" cids
213 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
216 pure JobLog { _scst_succeeded = Just 3
217 , _scst_failed = Just 0
218 , _scst_remaining = Just 0
219 , _scst_events = Just []
223 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
225 :> Capture "corpus_id" CorpusId
229 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
231 addToCorpusWithForm :: FlowCmdM env err m
237 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
239 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
240 printDebug "[addToCorpusWithForm] fileType" ft
241 logStatus JobLog { _scst_succeeded = Just 0
242 , _scst_failed = Just 0
243 , _scst_remaining = Just 2
244 , _scst_events = Just []
248 CSV_HAL -> Parser.parseFormat Parser.CsvHal
249 CSV -> Parser.parseFormat Parser.CsvGargV3
250 WOS -> Parser.parseFormat Parser.WOS
251 PresseRIS -> Parser.parseFormat Parser.RisPresse
253 -- TODO granularity of the logStatus
254 docs <- liftBase $ splitEvery 500
258 printDebug "Parsing corpus finished : " cid
259 logStatus JobLog { _scst_succeeded = Just 1
260 , _scst_failed = Just 0
261 , _scst_remaining = Just 1
262 , _scst_events = Just []
266 printDebug "Starting extraction : " cid
267 -- TODO granularity of the logStatus
268 _cid' <- flowCorpus user
270 (Multi $ fromMaybe EN l)
271 (map (map toHyperdataDocument) docs)
273 printDebug "Extraction finished : " cid
274 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
277 pure JobLog { _scst_succeeded = Just 2
278 , _scst_failed = Just 0
279 , _scst_remaining = Just 0
280 , _scst_events = Just []
284 addToCorpusWithFile :: FlowCmdM env err m
290 addToCorpusWithFile cid input filetype logStatus = do
291 logStatus JobLog { _scst_succeeded = Just 10
292 , _scst_failed = Just 2
293 , _scst_remaining = Just 138
294 , _scst_events = Just []
296 printDebug "addToCorpusWithFile" cid
297 _h <- postUpload cid filetype input
299 pure JobLog { _scst_succeeded = Just 137
300 , _scst_failed = Just 13
301 , _scst_remaining = Just 0
302 , _scst_events = Just []
308 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
310 :> Capture "corpus_id" CorpusId
314 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
316 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
322 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
324 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
325 logStatus JobLog { _scst_succeeded = Just 0
326 , _scst_failed = Just 0
327 , _scst_remaining = Just 1
328 , _scst_events = Just []
331 fPath <- GPU.writeFile nwf
332 printDebug "[addToCorpusWithFile] File saved as: " fPath
334 uId <- getUserId user
335 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
339 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
340 let hl = node ^. node_hyperdata
341 _ <- updateHyperdata nId $ hl { _hff_name = fName
342 , _hff_path = T.pack fPath }
344 printDebug "[addToCorpusWithFile] Created node with id: " nId
347 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
349 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
352 pure $ JobLog { _scst_succeeded = Just 1
353 , _scst_failed = Just 0
354 , _scst_remaining = Just 0
355 , _scst_events = Just []