2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
21 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson.TH (deriveJSON)
25 import Data.Maybe (fromMaybe)
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
31 import Servant.Job.Utils (jsonOptions)
32 -- import Servant.Multipart
33 -- import Test.QuickCheck (elements)
34 import Test.QuickCheck.Arbitrary
36 import Gargantext.Prelude
38 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
39 import Gargantext.API.Admin.Types (HasSettings)
40 import Gargantext.API.Node.Corpus.New.File
41 import Gargantext.API.Node.Corpus.Searx
42 import Gargantext.API.Node.Corpus.Types
43 import Gargantext.API.Node.Types
44 import Gargantext.Core (Lang(..){-, allLangs-})
45 import qualified Gargantext.Core.Text.Corpus.API as API
46 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
47 import Gargantext.Core.Types.Individu (User(..))
48 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
49 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
50 import Gargantext.Database.Action.Mail (sendMail)
51 import Gargantext.Database.Action.Node (mkNodeWithParent)
52 import Gargantext.Database.Action.User (getUserId)
53 import Gargantext.Database.Admin.Types.Hyperdata
54 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
55 import Gargantext.Database.Query.Table.Node (getNodeWith)
56 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
57 import Gargantext.Database.Schema.Node (node_hyperdata)
58 import qualified Gargantext.Database.GargDB as GargDB
60 ------------------------------------------------------------------------
62 data Query = Query { query_query :: Text
63 , query_node_id :: Int
65 , query_databases :: [DataOrigin]
67 deriving (Eq, Generic)
69 deriveJSON (unPrefix "query_") 'Query
71 instance Arbitrary Query where
72 arbitrary = elements [ Query q n la fs
73 | q <- ["honeybee* AND collapse"
78 , fs <- take 3 $ repeat allDataOrigins
81 instance ToSchema Query where
82 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
85 ------------------------------------------------------------------------
91 type PostApi = Summary "New Corpus endpoint"
92 :> ReqBody '[JSON] Query
93 :> Post '[JSON] CorpusId
94 type GetApi = Get '[JSON] ApiInfo
97 -- | TODO manage several apis
99 -- TODO this is only the POST
101 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
102 api uid (Query q _ as) = do
103 cId <- case head as of
104 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
105 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
107 docs <- liftBase $ API.get a q (Just 1000)
108 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
114 ------------------------------------------------
115 -- TODO use this route for Client implementation
116 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
118 instance Arbitrary ApiInfo where
119 arbitrary = ApiInfo <$> arbitrary
121 deriveJSON (unPrefix "") 'ApiInfo
123 instance ToSchema ApiInfo
125 info :: FlowCmdM env err m => UserId -> m ApiInfo
126 info _u = pure $ ApiInfo API.externalAPIs
128 ------------------------------------------------------------------------
129 ------------------------------------------------------------------------
130 data WithQuery = WithQuery
132 , _wq_databases :: !Database
133 , _wq_datafield :: !Datafield
135 , _wq_node_id :: !Int
139 makeLenses ''WithQuery
140 instance FromJSON WithQuery where
141 parseJSON = genericParseJSON $ jsonOptions "_wq_"
142 instance ToJSON WithQuery where
143 toJSON = genericToJSON $ jsonOptions "_wq_"
144 instance ToSchema WithQuery where
145 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
147 ------------------------------------------------------------------------
149 type AddWithQuery = Summary "Add with Query to corpus endpoint"
151 :> Capture "corpus_id" CorpusId
153 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
156 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
158 :> Capture "corpus_id" CorpusId
161 :> MultipartForm Mem (MultipartData Mem)
162 :> QueryParam "fileType" FileType
164 :> AsyncJobs JobLog '[JSON] () JobLog
168 ------------------------------------------------------------------------
169 -- TODO WithQuery also has a corpus id
170 addToCorpusWithQuery :: FlowCmdM env err m
177 addToCorpusWithQuery user cid (WithQuery q dbs datafield l _nid) maybeLimit logStatus = do
179 logStatus JobLog { _scst_succeeded = Just 0
180 , _scst_failed = Just 0
181 , _scst_remaining = Just 3
182 , _scst_events = Just []
184 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
185 printDebug "[addToCorpusWithQuery] datafield" datafield
189 printDebug "[addToCorpusWithQuery] processing web request" datafield
191 _ <- triggerSearxSearch cid q l
193 pure JobLog { _scst_succeeded = Just 3
194 , _scst_failed = Just 0
195 , _scst_remaining = Just 0
196 , _scst_events = Just []
201 -- TODO if cid is folder -> create Corpus
202 -- if cid is corpus -> add to corpus
203 -- if cid is root -> create corpus in Private
204 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
206 logStatus JobLog { _scst_succeeded = Just 2
207 , _scst_failed = Just 0
208 , _scst_remaining = Just 1
209 , _scst_events = Just []
212 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
213 printDebug "corpus id" cids
214 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
217 pure JobLog { _scst_succeeded = Just 3
218 , _scst_failed = Just 0
219 , _scst_remaining = Just 0
220 , _scst_events = Just []
224 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
226 :> Capture "corpus_id" CorpusId
230 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
232 addToCorpusWithForm :: FlowCmdM env err m
238 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
239 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
240 printDebug "[addToCorpusWithForm] fileType" ft
241 logStatus JobLog { _scst_succeeded = Just 0
242 , _scst_failed = Just 0
243 , _scst_remaining = Just 2
244 , _scst_events = Just []
248 CSV_HAL -> Parser.parseFormat Parser.CsvHal
249 CSV -> Parser.parseFormat Parser.CsvGargV3
250 WOS -> Parser.parseFormat Parser.WOS
251 PresseRIS -> Parser.parseFormat Parser.RisPresse
253 -- TODO granularity of the logStatus
254 docs <- liftBase $ splitEvery 500
258 printDebug "Parsing corpus finished : " cid
259 logStatus JobLog { _scst_succeeded = Just 1
260 , _scst_failed = Just 0
261 , _scst_remaining = Just 1
262 , _scst_events = Just []
266 printDebug "Starting extraction : " cid
267 -- TODO granularity of the logStatus
268 _cid' <- flowCorpus user
270 (Multi $ fromMaybe EN l)
271 (map (map toHyperdataDocument) docs)
273 printDebug "Extraction finished : " cid
274 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
277 pure JobLog { _scst_succeeded = Just 2
278 , _scst_failed = Just 0
279 , _scst_remaining = Just 0
280 , _scst_events = Just []
284 addToCorpusWithFile :: FlowCmdM env err m
290 addToCorpusWithFile cid input filetype logStatus = do
291 logStatus JobLog { _scst_succeeded = Just 10
292 , _scst_failed = Just 2
293 , _scst_remaining = Just 138
294 , _scst_events = Just []
296 printDebug "addToCorpusWithFile" cid
297 _h <- postUpload cid filetype input
299 pure JobLog { _scst_succeeded = Just 137
300 , _scst_failed = Just 13
301 , _scst_remaining = Just 0
302 , _scst_events = Just []
308 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
310 :> Capture "corpus_id" CorpusId
314 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
316 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
322 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
324 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
325 logStatus JobLog { _scst_succeeded = Just 0
326 , _scst_failed = Just 0
327 , _scst_remaining = Just 1
328 , _scst_events = Just []
331 fPath <- GargDB.writeFile nwf
332 printDebug "[addToCorpusWithFile] File saved as: " fPath
334 uId <- getUserId user
335 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
339 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
340 let hl = node ^. node_hyperdata
341 _ <- updateHyperdata nId $ hl { _hff_name = fName
342 , _hff_path = T.pack fPath }
344 printDebug "[addToCorpusWithFile] Created node with id: " nId
347 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
349 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
352 pure $ JobLog { _scst_succeeded = Just 1
353 , _scst_failed = Just 0
354 , _scst_remaining = Just 0
355 , _scst_events = Just []