2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
21 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson.TH (deriveJSON)
25 import Data.Maybe (fromMaybe)
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import qualified Prelude as Prelude
31 import Protolude (readFile)
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
38 import Gargantext.Prelude
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
43 import Gargantext.API.Node.Corpus.New.File
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import qualified Gargantext.Core.Text.Corpus.API as API
49 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
50 import Gargantext.Core.Types.Individu (User(..))
51 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
52 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
53 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
54 import Gargantext.Database.Action.Mail (sendMail)
55 import Gargantext.Database.Action.Node (mkNodeWithParent)
56 import Gargantext.Database.Action.User (getUserId)
57 import Gargantext.Database.Admin.Types.Hyperdata
58 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
59 import Gargantext.Database.Query.Table.Node (getNodeWith)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import qualified Gargantext.Database.GargDB as GargDB
64 ------------------------------------------------------------------------
-- | Payload of the \"new corpus\" POST endpoint: a search string, the
--   target node id and the data origins to pull documents from.
--   NOTE(review): this extract is missing at least one record line
--   between query_node_id and query_databases (the Arbitrary instance
--   below applies Query to four values, presumably including a
--   language) — confirm against the repository.
66 data Query = Query { query_query :: Text
67 , query_node_id :: Int
69 , query_databases :: [DataOrigin]
71 deriving (Eq, Generic)
-- Derive To/FromJSON, stripping the "query_" prefix from field names.
73 deriveJSON (unPrefix "query_") 'Query
-- | Random 'Query' generator for property tests: samples from a small
--   set of example values.  NOTE(review): lines are missing from this
--   extract — the comprehension binds q, n, la and fs but only the
--   generators for q and fs are visible; confirm upstream.
75 instance Arbitrary Query where
76 arbitrary = elements [ Query q n la fs
77 | q <- ["honeybee* AND collapse"
-- Each sample reuses the full list of data origins.
82 , fs <- take 3 $ repeat allDataOrigins
-- | Swagger schema for 'Query': generic derivation with the
--   "query_" prefix stripped from the exposed field names.
instance ToSchema Query where
  declareNamedSchema proxy =
    genericDeclareNamedSchema (unPrefixSwagger "query_") proxy
89 ------------------------------------------------------------------------
-- | Servant route: POST a 'Query', receive the created corpus id.
95 type PostApi = Summary "New Corpus endpoint"
96 :> ReqBody '[JSON] Query
97 :> Post '[JSON] CorpusId
-- | Servant route: GET the list of supported external document APIs.
98 type GetApi = Get '[JSON] ApiInfo
101 -- | TODO manage several apis
103 -- TODO this is only the POST
-- | Create a corpus for the given user from a 'Query', dispatching on
--   the first data origin: no origin or 'API.All' falls back to an
--   in-database search; a specific external API fetches documents
--   (capped at 1000) and flows them into a new corpus.
--   NOTE(review): this extract is missing the lines that close the
--   'Just' branch and return the corpus id — confirm upstream.
105 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
-- 'head' here is the total Protolude head (returns Maybe), matched by
-- the Nothing case below.
106 api uid (Query q _ as) = do
107 cId <- case head as of
108 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
109 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
-- External-API branch: fetch up to 1000 documents, then flow them in.
111 docs <- liftBase $ API.get a q (Just 1000)
112 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
118 ------------------------------------------------
119 -- TODO use this route for Client implementation
-- | Response of the info endpoint: the external document APIs this
--   instance can query.
120 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
122 instance Arbitrary ApiInfo where
123 arbitrary = ApiInfo <$> arbitrary
-- JSON instances; the empty prefix means field names are kept as-is.
125 deriveJSON (unPrefix "") 'ApiInfo
127 instance ToSchema ApiInfo
-- | Report which external document APIs are available.  The user id
--   is currently ignored: every user sees the same list.
info :: FlowCmdM env err m => UserId -> m ApiInfo
info _userId = pure (ApiInfo API.externalAPIs)
132 ------------------------------------------------------------------------
133 ------------------------------------------------------------------------
-- | Parameters of the \"add to corpus with query\" job: databases and
--   datafield to search, plus the target node.
--   NOTE(review): record lines are missing from this extract (at
--   least a first field before _wq_databases — presumably the query
--   text — and the closing/deriving clause); confirm upstream.
134 data WithQuery = WithQuery
136 , _wq_databases :: !Database
137 , _wq_datafield :: !Datafield
139 , _wq_node_id :: !Int
-- Lenses plus JSON/Swagger instances, all stripping the "_wq_" prefix.
143 makeLenses ''WithQuery
144 instance FromJSON WithQuery where
145 parseJSON = genericParseJSON $ jsonOptions "_wq_"
146 instance ToJSON WithQuery where
147 toJSON = genericToJSON $ jsonOptions "_wq_"
148 instance ToSchema WithQuery where
149 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
151 ------------------------------------------------------------------------
-- | Async job route: fill a corpus from a 'WithQuery' payload.
153 type AddWithQuery = Summary "Add with Query to corpus endpoint"
155 :> Capture "corpus_id" CorpusId
157 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
-- | Async job route: fill a corpus from a multipart file upload.
--   NOTE(review): a second 'AddWithFile' alias appears further down in
--   this extract — only one can exist in the real module (the other is
--   likely commented out upstream); confirm against the repository.
160 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
162 :> Capture "corpus_id" CorpusId
165 :> MultipartForm Mem (MultipartData Mem)
166 :> QueryParam "fileType" FileType
168 :> AsyncJobs JobLog '[JSON] () JobLog
172 ------------------------------------------------------------------------
173 -- TODO WithQuery also has a corpus id
-- | Job worker: fill corpus @cid@ with documents matching the query,
--   reporting progress through @logStatus@ as 'JobLog's (3 steps).
--   NOTE(review): several lines are missing from this extract — the
--   rest of the constraint/argument list of the signature, the
--   case split between the Web datafield and the database branch,
--   and the record closings; confirm against the repository.
174 addToCorpusWithQuery :: FlowCmdM env err m
181 addToCorpusWithQuery user cid (WithQuery q dbs datafield l _nid) maybeLimit logStatus = do
-- Step 0: announce the job with three remaining steps.
183 logStatus JobLog { _scst_succeeded = Just 0
184 , _scst_failed = Just 0
185 , _scst_remaining = Just 3
186 , _scst_events = Just []
188 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
189 printDebug "[addToCorpusWithQuery] datafield" datafield
-- Web datafield: delegate the crawl to a searx search and finish.
193 printDebug "[addToCorpusWithQuery] processing web request" datafield
195 _ <- triggerSearxSearch cid q l
197 pure JobLog { _scst_succeeded = Just 3
198 , _scst_failed = Just 0
199 , _scst_remaining = Just 0
200 , _scst_events = Just []
-- Otherwise: fetch document texts from the selected database origin.
205 -- TODO if cid is folder -> create Corpus
206 -- if cid is corpus -> add to corpus
207 -- if cid is root -> create corpus in Private
208 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
-- Step 2 done: data fetched; one step (the flow) remaining.
210 logStatus JobLog { _scst_succeeded = Just 2
211 , _scst_failed = Just 0
212 , _scst_remaining = Just 1
213 , _scst_events = Just []
-- Flow each fetched text into the corpus.
216 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
217 printDebug "corpus id" cids
218 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
-- Final log: all three steps succeeded.
221 pure JobLog { _scst_succeeded = Just 3
222 , _scst_failed = Just 0
223 , _scst_remaining = Just 0
224 , _scst_events = Just []
-- | Async job route: fill a corpus from a form-encoded 'NewWithForm'
--   upload.
228 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
230 :> Capture "corpus_id" CorpusId
234 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
-- | Job worker: parse an uploaded file (CSV / CSV-HAL / WOS /
--   RIS-Presse) and flow the parsed documents into corpus @cid@.
--   NOTE(review): parts of the signature, the 'case' head selecting
--   the parser, the binding of docs' and the success/failure branches
--   are missing from this extract — confirm against the repository.
236 addToCorpusWithForm :: FlowCmdM env err m
243 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
244 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
245 printDebug "[addToCorpusWithForm] fileType" ft
-- Select the parser matching the declared file type.
249 CSV_HAL -> Parser.parseFormat Parser.CsvHal
250 CSV -> Parser.parseFormat Parser.CsvGargV3
251 WOS -> Parser.parseFormat Parser.WOS
252 PresseRIS -> Parser.parseFormat Parser.RisPresse
254 -- TODO granularity of the logStatus
255 eDocs <- liftBase $ parse $ cs d
-- Chunk the documents: batches of 500, capped at one million overall.
-- docs' is bound on a line missing from this extract (from eDocs).
258 let docs = splitEvery 500 $ take 1000000 docs'
260 printDebug "Parsing corpus finished : " cid
263 printDebug "Starting extraction : " cid
264 -- TODO granularity of the logStatus
-- Flow every batch into the corpus; language defaults to EN.
265 _cid' <- flowCorpus user
267 (Multi $ fromMaybe EN l)
268 (map (map toHyperdataDocument) docs)
270 printDebug "Extraction finished : " cid
271 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
-- Job-log helpers: two chained successes on the happy path, a total
-- failure log otherwise.
282 jobLog2 = jobLogSuccess jobLog
283 jobLog3 = jobLogSuccess jobLog2
284 jobLogE = jobLogFailTotal jobLog
-- | Read a file from disk and parse it as GargV3 CSV into hyperdata
--   documents, returning a parse-error message on failure.
parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
parseCsvGargV3Path path =
  readFile path >>= Parser.parseFormat Parser.CsvGargV3 . cs
-- | Job worker: upload a file into corpus @cid@ via 'postUpload'.
--   NOTE(review): the rest of the signature and the record closings
--   are missing from this extract; the hard-coded JobLog counts
--   (10/2/138, then 137/13/0) look like placeholder progress values —
--   confirm against the repository.
292 addToCorpusWithFile :: FlowCmdM env err m
298 addToCorpusWithFile cid input filetype logStatus = do
299 logStatus JobLog { _scst_succeeded = Just 10
300 , _scst_failed = Just 2
301 , _scst_remaining = Just 138
302 , _scst_events = Just []
304 printDebug "addToCorpusWithFile" cid
-- Perform the actual upload; the handle/result is discarded.
305 _h <- postUpload cid filetype input
307 pure JobLog { _scst_succeeded = Just 137
308 , _scst_failed = Just 13
309 , _scst_remaining = Just 0
310 , _scst_events = Just []
-- | Async job route: attach a file to a corpus from a form-encoded
--   'NewWithFile' upload.  NOTE(review): an earlier, conflicting
--   'AddWithFile' alias appears in this extract — only one can exist
--   in the real module; confirm against the repository.
316 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
318 :> Capture "corpus_id" CorpusId
322 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
-- | Job worker: persist an uploaded file to disk, create a NodeFile
--   child of corpus @cid@ for the user, and record the file's name
--   and stored path in the node's hyperdata.
--   NOTE(review): lines are missing from this extract — the rest of
--   the signature, the pattern binding nId from nIds, and the record
--   closings; confirm against the repository.
324 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
330 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
332 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
-- Step 0: one unit of work remaining.
333 logStatus JobLog { _scst_succeeded = Just 0
334 , _scst_failed = Just 0
335 , _scst_remaining = Just 1
336 , _scst_events = Just []
-- Write the raw upload to the Garg file store.
339 fPath <- GargDB.writeFile nwf
340 printDebug "[addToCorpusWithFile] File saved as: " fPath
-- Create a NodeFile node under the corpus for this user.
342 uId <- getUserId user
343 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
-- Point the node's hyperdata at the stored file.
347 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
348 let hl = node ^. node_hyperdata
349 _ <- updateHyperdata nId $ hl { _hff_name = fName
350 , _hff_path = T.pack fPath }
352 printDebug "[addToCorpusWithFile] Created node with id: " nId
355 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
357 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
-- Final log: the single step succeeded.
360 pure $ JobLog { _scst_succeeded = Just 1
361 , _scst_failed = Just 0
362 , _scst_remaining = Just 0
363 , _scst_events = Just []