2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
21 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson.TH (deriveJSON)
25 import Data.Maybe (fromMaybe)
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import qualified Prelude as Prelude
31 import Protolude (readFile)
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
38 import Gargantext.Prelude
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
43 import Gargantext.API.Node.Corpus.New.File
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import qualified Gargantext.Core.Text.Corpus.API as API
49 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
50 import Gargantext.Core.Types.Individu (User(..))
51 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
52 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
53 import Gargantext.Database.Action.Mail (sendMail)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Action.User (getUserId)
56 import Gargantext.Database.Admin.Types.Hyperdata
57 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
58 import Gargantext.Database.Query.Table.Node (getNodeWith)
59 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
60 import Gargantext.Database.Schema.Node (node_hyperdata)
61 import qualified Gargantext.Database.GargDB as GargDB
63 ------------------------------------------------------------------------
65 data Query = Query { query_query :: Text
66 , query_node_id :: Int
68 , query_databases :: [DataOrigin]
70 deriving (Eq, Generic)
72 deriveJSON (unPrefix "query_") 'Query
74 instance Arbitrary Query where
75 arbitrary = elements [ Query q n la fs
76 | q <- ["honeybee* AND collapse"
81 , fs <- take 3 $ repeat allDataOrigins
84 instance ToSchema Query where
85 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
88 ------------------------------------------------------------------------
94 type PostApi = Summary "New Corpus endpoint"
95 :> ReqBody '[JSON] Query
96 :> Post '[JSON] CorpusId
97 type GetApi = Get '[JSON] ApiInfo
100 -- | TODO manage several apis
102 -- TODO this is only the POST
104 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
105 api uid (Query q _ as) = do
106 cId <- case head as of
107 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
108 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
110 docs <- liftBase $ API.get a q (Just 1000)
111 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
117 ------------------------------------------------
118 -- TODO use this route for Client implementation
119 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
121 instance Arbitrary ApiInfo where
122 arbitrary = ApiInfo <$> arbitrary
124 deriveJSON (unPrefix "") 'ApiInfo
126 instance ToSchema ApiInfo
128 info :: FlowCmdM env err m => UserId -> m ApiInfo
129 info _u = pure $ ApiInfo API.externalAPIs
131 ------------------------------------------------------------------------
132 ------------------------------------------------------------------------
133 data WithQuery = WithQuery
135 , _wq_databases :: !Database
136 , _wq_datafield :: !Datafield
138 , _wq_node_id :: !Int
142 makeLenses ''WithQuery
143 instance FromJSON WithQuery where
144 parseJSON = genericParseJSON $ jsonOptions "_wq_"
145 instance ToJSON WithQuery where
146 toJSON = genericToJSON $ jsonOptions "_wq_"
147 instance ToSchema WithQuery where
148 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
150 ------------------------------------------------------------------------
152 type AddWithQuery = Summary "Add with Query to corpus endpoint"
154 :> Capture "corpus_id" CorpusId
156 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
159 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
161 :> Capture "corpus_id" CorpusId
164 :> MultipartForm Mem (MultipartData Mem)
165 :> QueryParam "fileType" FileType
167 :> AsyncJobs JobLog '[JSON] () JobLog
171 ------------------------------------------------------------------------
172 -- TODO WithQuery also has a corpus id
173 addToCorpusWithQuery :: FlowCmdM env err m
180 addToCorpusWithQuery user cid (WithQuery q dbs datafield l _nid) maybeLimit logStatus = do
182 logStatus JobLog { _scst_succeeded = Just 0
183 , _scst_failed = Just 0
184 , _scst_remaining = Just 3
185 , _scst_events = Just []
187 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
188 printDebug "[addToCorpusWithQuery] datafield" datafield
192 printDebug "[addToCorpusWithQuery] processing web request" datafield
194 _ <- triggerSearxSearch cid q l
196 pure JobLog { _scst_succeeded = Just 3
197 , _scst_failed = Just 0
198 , _scst_remaining = Just 0
199 , _scst_events = Just []
204 -- TODO if cid is folder -> create Corpus
205 -- if cid is corpus -> add to corpus
206 -- if cid is root -> create corpus in Private
207 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
209 logStatus JobLog { _scst_succeeded = Just 2
210 , _scst_failed = Just 0
211 , _scst_remaining = Just 1
212 , _scst_events = Just []
215 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
216 printDebug "corpus id" cids
217 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
220 pure JobLog { _scst_succeeded = Just 3
221 , _scst_failed = Just 0
222 , _scst_remaining = Just 0
223 , _scst_events = Just []
227 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
229 :> Capture "corpus_id" CorpusId
233 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
235 addToCorpusWithForm :: FlowCmdM env err m
242 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
243 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
244 printDebug "[addToCorpusWithForm] fileType" ft
248 CSV_HAL -> Parser.parseFormat Parser.CsvHal
249 CSV -> Parser.parseFormat Parser.CsvGargV3
250 WOS -> Parser.parseFormat Parser.WOS
251 PresseRIS -> Parser.parseFormat Parser.RisPresse
253 -- TODO granularity of the logStatus
254 eDocs <- liftBase $ parse $ cs d
257 let docs = splitEvery 500 $ take 1000000 docs'
259 printDebug "Parsing corpus finished : " cid
262 printDebug "Starting extraction : " cid
263 -- TODO granularity of the logStatus
264 _cid' <- flowCorpus user
266 (Multi $ fromMaybe EN l)
267 (map (map toHyperdataDocument) docs)
269 printDebug "Extraction finished : " cid
270 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
281 jobLog2 = jobLogSuccess jobLog
282 jobLog3 = jobLogSuccess jobLog2
283 jobLogE = jobLogFailTotal jobLog
285 parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
286 parseCsvGargV3Path fp = do
287 contents <- readFile fp
288 Parser.parseFormat Parser.CsvGargV3 $ cs contents
291 addToCorpusWithFile :: FlowCmdM env err m
297 addToCorpusWithFile cid input filetype logStatus = do
298 logStatus JobLog { _scst_succeeded = Just 10
299 , _scst_failed = Just 2
300 , _scst_remaining = Just 138
301 , _scst_events = Just []
303 printDebug "addToCorpusWithFile" cid
304 _h <- postUpload cid filetype input
306 pure JobLog { _scst_succeeded = Just 137
307 , _scst_failed = Just 13
308 , _scst_remaining = Just 0
309 , _scst_events = Just []
315 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
317 :> Capture "corpus_id" CorpusId
321 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
323 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
329 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
331 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
332 logStatus JobLog { _scst_succeeded = Just 0
333 , _scst_failed = Just 0
334 , _scst_remaining = Just 1
335 , _scst_events = Just []
338 fPath <- GargDB.writeFile nwf
339 printDebug "[addToCorpusWithFile] File saved as: " fPath
341 uId <- getUserId user
342 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
346 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
347 let hl = node ^. node_hyperdata
348 _ <- updateHyperdata nId $ hl { _hff_name = fName
349 , _hff_path = T.pack fPath }
351 printDebug "[addToCorpusWithFile] Created node with id: " nId
354 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
356 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
359 pure $ JobLog { _scst_succeeded = Just 1
360 , _scst_failed = Just 0
361 , _scst_remaining = Just 0
362 , _scst_events = Just []