2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
21 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson.TH (deriveJSON)
25 import Data.Maybe (fromMaybe)
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
31 import Servant.Job.Utils (jsonOptions)
32 -- import Servant.Multipart
33 -- import Test.QuickCheck (elements)
34 import Test.QuickCheck.Arbitrary
36 import Gargantext.Prelude
38 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
39 import qualified Gargantext.API.Admin.Orchestrator.Types as T
40 import Gargantext.API.Admin.Types (HasSettings)
41 import Gargantext.API.Node.Corpus.New.File
42 import Gargantext.API.Node.Types
43 import Gargantext.Core (Lang(..){-, allLangs-})
44 import Gargantext.Database.Action.Mail (sendMail)
45 import Gargantext.Core.Types.Individu (User(..))
46 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
47 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
48 import Gargantext.Database.Action.User (getUserId)
49 import Gargantext.Database.Action.Node (mkNodeWithParent)
50 import Gargantext.Database.Admin.Types.Hyperdata
51 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
52 import Gargantext.Database.Query.Table.Node (getNodeWith)
53 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
54 import Gargantext.Database.Schema.Node (node_hyperdata)
55 import qualified Gargantext.Database.GargDB as GargDB
56 import qualified Gargantext.Core.Text.Corpus.API as API
57 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
59 ------------------------------------------------------------------------
61 data Query = Query { query_query :: Text
62 , query_node_id :: Int
64 , query_databases :: [DataOrigin]
66 deriving (Eq, Generic)
68 deriveJSON (unPrefix "query_") 'Query
70 instance Arbitrary Query where
71 arbitrary = elements [ Query q n la fs
72 | q <- ["honeybee* AND collapse"
77 , fs <- take 3 $ repeat allDataOrigins
80 instance ToSchema Query where
81 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
84 ------------------------------------------------------------------------
90 type PostApi = Summary "New Corpus endpoint"
91 :> ReqBody '[JSON] Query
92 :> Post '[JSON] CorpusId
93 type GetApi = Get '[JSON] ApiInfo
96 -- | TODO manage several apis
98 -- TODO this is only the POST
100 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
101 api uid (Query q _ as) = do
102 cId <- case head as of
103 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
104 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
106 docs <- liftBase $ API.get a q (Just 1000)
107 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
113 ------------------------------------------------
114 -- TODO use this route for Client implementation
115 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
117 instance Arbitrary ApiInfo where
118 arbitrary = ApiInfo <$> arbitrary
120 deriveJSON (unPrefix "") 'ApiInfo
122 instance ToSchema ApiInfo
124 info :: FlowCmdM env err m => UserId -> m ApiInfo
125 info _u = pure $ ApiInfo API.externalAPIs
127 ------------------------------------------------------------------------
129 data Database = Empty
134 deriving (Eq, Show, Generic)
136 deriveJSON (unPrefix "") ''Database
137 instance ToSchema Database
139 database2origin :: Database -> DataOrigin
140 database2origin Empty = InternalOrigin T.IsTex
141 database2origin PubMed = ExternalOrigin T.PubMed
142 database2origin HAL = ExternalOrigin T.HAL
143 database2origin IsTex = ExternalOrigin T.IsTex
144 database2origin Isidore = ExternalOrigin T.Isidore
146 ------------------------------------------------------------------------
147 data WithQuery = WithQuery
149 , _wq_databases :: !Database
151 , _wq_node_id :: !Int
155 makeLenses ''WithQuery
156 instance FromJSON WithQuery where
157 parseJSON = genericParseJSON $ jsonOptions "_wq_"
158 instance ToJSON WithQuery where
159 toJSON = genericToJSON $ jsonOptions "_wq_"
160 instance ToSchema WithQuery where
161 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
163 ------------------------------------------------------------------------
165 type AddWithQuery = Summary "Add with Query to corpus endpoint"
167 :> Capture "corpus_id" CorpusId
169 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
172 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
174 :> Capture "corpus_id" CorpusId
177 :> MultipartForm Mem (MultipartData Mem)
178 :> QueryParam "fileType" FileType
180 :> AsyncJobs JobLog '[JSON] () JobLog
184 ------------------------------------------------------------------------
185 -- TODO WithQuery also has a corpus id
186 addToCorpusWithQuery :: FlowCmdM env err m
193 addToCorpusWithQuery user cid (WithQuery q dbs l _nid) maybeLimit logStatus = do
195 logStatus JobLog { _scst_succeeded = Just 0
196 , _scst_failed = Just 0
197 , _scst_remaining = Just 5
198 , _scst_events = Just []
200 printDebug "addToCorpusWithQuery" (cid, dbs)
202 -- TODO if cid is folder -> create Corpus
203 -- if cid is corpus -> add to corpus
204 -- if cid is root -> create corpus in Private
205 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
207 logStatus JobLog { _scst_succeeded = Just 2
208 , _scst_failed = Just 0
209 , _scst_remaining = Just 1
210 , _scst_events = Just []
213 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
214 printDebug "corpus id" cids
215 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
218 pure JobLog { _scst_succeeded = Just 3
219 , _scst_failed = Just 0
220 , _scst_remaining = Just 0
221 , _scst_events = Just []
225 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
227 :> Capture "corpus_id" CorpusId
231 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
233 addToCorpusWithForm :: FlowCmdM env err m
239 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
240 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
241 printDebug "[addToCorpusWithForm] fileType" ft
242 logStatus JobLog { _scst_succeeded = Just 0
243 , _scst_failed = Just 0
244 , _scst_remaining = Just 2
245 , _scst_events = Just []
249 CSV_HAL -> Parser.parseFormat Parser.CsvHal
250 CSV -> Parser.parseFormat Parser.CsvGargV3
251 WOS -> Parser.parseFormat Parser.WOS
252 PresseRIS -> Parser.parseFormat Parser.RisPresse
254 -- TODO granularity of the logStatus
255 docs <- liftBase $ splitEvery 500
259 printDebug "Parsing corpus finished : " cid
260 logStatus JobLog { _scst_succeeded = Just 1
261 , _scst_failed = Just 0
262 , _scst_remaining = Just 1
263 , _scst_events = Just []
267 printDebug "Starting extraction : " cid
268 -- TODO granularity of the logStatus
269 _cid' <- flowCorpus user
271 (Multi $ fromMaybe EN l)
272 (map (map toHyperdataDocument) docs)
274 printDebug "Extraction finished : " cid
275 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
278 pure JobLog { _scst_succeeded = Just 2
279 , _scst_failed = Just 0
280 , _scst_remaining = Just 0
281 , _scst_events = Just []
285 addToCorpusWithFile :: FlowCmdM env err m
291 addToCorpusWithFile cid input filetype logStatus = do
292 logStatus JobLog { _scst_succeeded = Just 10
293 , _scst_failed = Just 2
294 , _scst_remaining = Just 138
295 , _scst_events = Just []
297 printDebug "addToCorpusWithFile" cid
298 _h <- postUpload cid filetype input
300 pure JobLog { _scst_succeeded = Just 137
301 , _scst_failed = Just 13
302 , _scst_remaining = Just 0
303 , _scst_events = Just []
309 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
311 :> Capture "corpus_id" CorpusId
315 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
317 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
323 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
325 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
326 logStatus JobLog { _scst_succeeded = Just 0
327 , _scst_failed = Just 0
328 , _scst_remaining = Just 1
329 , _scst_events = Just []
332 fPath <- GargDB.writeFile nwf
333 printDebug "[addToCorpusWithFile] File saved as: " fPath
335 uId <- getUserId user
336 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
340 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
341 let hl = node ^. node_hyperdata
342 _ <- updateHyperdata nId $ hl { _hff_name = fName
343 , _hff_path = T.pack fPath }
345 printDebug "[addToCorpusWithFile] Created node with id: " nId
348 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
350 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
353 pure $ JobLog { _scst_succeeded = Just 1
354 , _scst_failed = Just 0
355 , _scst_remaining = Just 0
356 , _scst_events = Just []