2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
23 import Control.Lens hiding (elements, Empty)
25 import Data.Aeson.TH (deriveJSON)
26 import qualified Data.ByteString.Base64 as BSB64
27 import Data.Conduit.Internal (zipSources)
29 import Data.Maybe (fromMaybe)
31 import Data.Text (Text)
32 import qualified Data.Text as T
33 import GHC.Generics (Generic)
35 import Servant.Job.Utils (jsonOptions)
36 -- import Servant.Multipart
37 import qualified Data.Text.Encoding as TE
38 -- import Test.QuickCheck (elements)
39 import Test.QuickCheck.Arbitrary
41 import Gargantext.Prelude
43 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
44 import Gargantext.API.Admin.Types (HasSettings)
45 import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal)
46 import Gargantext.API.Node.Corpus.New.Types
47 import Gargantext.API.Node.Corpus.Searx
48 import Gargantext.API.Node.Corpus.Types
49 import Gargantext.API.Node.Types
50 import Gargantext.Core (Lang(..){-, allLangs-})
51 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
52 import qualified Gargantext.Core.Text.Corpus.API as API
53 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
54 import Gargantext.Core.Types.Individu (User(..))
55 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
56 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
57 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
58 import Gargantext.Database.Action.Mail (sendMail)
59 import Gargantext.Database.Action.Node (mkNodeWithParent)
60 import Gargantext.Database.Action.User (getUserId)
61 import Gargantext.Database.Admin.Types.Hyperdata
62 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
63 import Gargantext.Database.Prelude (hasConfig)
64 import Gargantext.Database.Query.Table.Node (getNodeWith)
65 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
66 import Gargantext.Database.Schema.Node (node_hyperdata)
67 import qualified Gargantext.Database.GargDB as GargDB
68 import Gargantext.Prelude.Config (gc_max_docs_parsers)
69 ------------------------------------------------------------------------
71 data Query = Query { query_query :: Text
72 , query_node_id :: Int
74 , query_databases :: [DataOrigin]
76 deriving (Eq, Generic)
78 deriveJSON (unPrefix "query_") 'Query
80 instance Arbitrary Query where
81 arbitrary = elements [ Query q n la fs
82 | q <- ["honeybee* AND collapse"
87 , fs <- take 3 $ repeat allDataOrigins
90 instance ToSchema Query where
91 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
94 ------------------------------------------------------------------------
100 type PostApi = Summary "New Corpus endpoint"
101 :> ReqBody '[JSON] Query
102 :> Post '[JSON] CorpusId
103 type GetApi = Get '[JSON] ApiInfo
106 -- | TODO manage several apis
108 -- TODO this is only the POST
110 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
111 api uid (Query q _ as) = do
112 cId <- case head as of
113 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
114 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
116 docs <- liftBase $ API.get a q (Just 1000)
117 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
123 ------------------------------------------------
124 -- TODO use this route for Client implementation
125 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
127 instance Arbitrary ApiInfo where
128 arbitrary = ApiInfo <$> arbitrary
130 deriveJSON (unPrefix "") 'ApiInfo
132 instance ToSchema ApiInfo
134 info :: FlowCmdM env err m => UserId -> m ApiInfo
135 info _u = pure $ ApiInfo API.externalAPIs
137 ------------------------------------------------------------------------
138 ------------------------------------------------------------------------
139 data WithQuery = WithQuery
141 , _wq_databases :: !Database
142 , _wq_datafield :: !(Maybe Datafield)
144 , _wq_node_id :: !Int
145 , _wq_flowListWith :: !FlowSocialListWith
149 makeLenses ''WithQuery
150 instance FromJSON WithQuery where
151 parseJSON = genericParseJSON $ jsonOptions "_wq_"
152 instance ToJSON WithQuery where
153 toJSON = genericToJSON $ jsonOptions "_wq_"
154 instance ToSchema WithQuery where
155 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
157 ------------------------------------------------------------------------
159 type AddWithQuery = Summary "Add with Query to corpus endpoint"
161 :> Capture "corpus_id" CorpusId
163 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
166 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
168 :> Capture "corpus_id" CorpusId
171 :> MultipartForm Mem (MultipartData Mem)
172 :> QueryParam "fileType" FileType
174 :> AsyncJobs JobLog '[JSON] () JobLog
178 ------------------------------------------------------------------------
179 -- TODO WithQuery also has a corpus id
180 addToCorpusWithQuery :: FlowCmdM env err m
187 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
188 , _wq_databases = dbs
189 , _wq_datafield = datafield
191 , _wq_flowListWith = flw }) maybeLimit logStatus = do
193 logStatus JobLog { _scst_succeeded = Just 0
194 , _scst_failed = Just 0
195 , _scst_remaining = Just 3
196 , _scst_events = Just []
198 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
199 printDebug "[addToCorpusWithQuery] datafield" datafield
200 printDebug "[addToCorpusWithQuery] flowListWith" flw
204 printDebug "[addToCorpusWithQuery] processing web request" datafield
206 _ <- triggerSearxSearch user cid q l logStatus
208 pure JobLog { _scst_succeeded = Just 3
209 , _scst_failed = Just 0
210 , _scst_remaining = Just 0
211 , _scst_events = Just []
216 -- TODO if cid is folder -> create Corpus
217 -- if cid is corpus -> add to corpus
218 -- if cid is root -> create corpus in Private
219 eTxts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
220 let lTxts = lefts eTxts
223 let txts = rights eTxts
224 -- TODO Sum lenghts of each txt elements
225 logStatus $ JobLog { _scst_succeeded = Just 2
226 , _scst_failed = Just 0
227 , _scst_remaining = Just $ 1 + length txts
228 , _scst_events = Just []
231 cids <- mapM (\txt -> do
232 flowDataText user txt (Multi l) cid Nothing logStatus) txts
233 printDebug "corpus id" cids
234 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
237 pure JobLog { _scst_succeeded = Just 3
238 , _scst_failed = Just 0
239 , _scst_remaining = Just 0
240 , _scst_events = Just []
244 pure $ addEvent "ERROR" (T.pack $ show err) $
245 JobLog { _scst_succeeded = Just 2
246 , _scst_failed = Just 1
247 , _scst_remaining = Just 0
248 , _scst_events = Just []
252 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
254 :> Capture "corpus_id" CorpusId
258 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
260 addToCorpusWithForm :: (FlowCmdM env err m)
267 addToCorpusWithForm user cid (NewWithForm ft ff d l _n) logStatus jobLog = do
268 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
269 printDebug "[addToCorpusWithForm] fileType" ft
270 printDebug "[addToCorpusWithForm] fileFormat" ff
272 limit' <- view $ hasConfig . gc_max_docs_parsers
273 let limit = fromIntegral limit' :: Integer
276 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
277 CSV -> Parser.parseFormatC Parser.CsvGargV3
278 WOS -> Parser.parseFormatC Parser.WOS
279 PresseRIS -> Parser.parseFormatC Parser.RisPresse
281 -- TODO granularity of the logStatus
282 let data' = case ff of
284 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
285 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
286 Right decoded -> decoded
287 eDocsC <- liftBase $ parseC ff data'
289 Right (mCount, docsC) -> do
290 -- TODO Add progress (jobStatus) update for docs - this is a
293 let docsC' = zipSources (yieldMany [1..]) docsC
294 .| mapMC (\(idx, doc) ->
295 if idx > limit then do
296 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
297 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
298 , "exceeds the MAX_DOCS_PARSERS limit ("
301 let panicMsg = T.concat $ T.pack <$> panicMsg'
302 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
306 .| mapC toHyperdataDocument
308 --printDebug "Parsing corpus finished : " cid
311 --printDebug "Starting extraction : " cid
312 -- TODO granularity of the logStatus
313 printDebug "flowCorpus with lang" l
315 _cid' <- flowCorpus user
317 (Multi $ fromMaybe EN l)
319 --(Just $ fromIntegral $ length docs, docsC')
320 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
321 --(map (map toHyperdataDocument) docs)
324 printDebug "Extraction finished : " cid
325 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
326 -- TODO uncomment this
332 printDebug "[addToCorpusWithForm] parse error" e
334 let evt = ScraperEvent { _scev_message = Just $ T.pack e
335 , _scev_level = Just "ERROR"
336 , _scev_date = Nothing }
338 logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
341 jobLog2 = jobLogSuccess jobLog
342 jobLog3 = jobLogSuccess jobLog2
343 jobLogE = jobLogFailTotal jobLog
346 addToCorpusWithFile :: FlowCmdM env err m
352 addToCorpusWithFile cid input filetype logStatus = do
353 logStatus JobLog { _scst_succeeded = Just 10
354 , _scst_failed = Just 2
355 , _scst_remaining = Just 138
356 , _scst_events = Just []
358 printDebug "addToCorpusWithFile" cid
359 _h <- postUpload cid filetype input
361 pure JobLog { _scst_succeeded = Just 137
362 , _scst_failed = Just 13
363 , _scst_remaining = Just 0
364 , _scst_events = Just []
370 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
372 :> Capture "corpus_id" CorpusId
376 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
378 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
384 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
386 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
387 logStatus JobLog { _scst_succeeded = Just 0
388 , _scst_failed = Just 0
389 , _scst_remaining = Just 1
390 , _scst_events = Just []
393 fPath <- GargDB.writeFile nwf
394 printDebug "[addToCorpusWithFile] File saved as: " fPath
396 uId <- getUserId user
397 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
401 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
402 let hl = node ^. node_hyperdata
403 _ <- updateHyperdata nId $ hl { _hff_name = fName
404 , _hff_path = T.pack fPath }
406 printDebug "[addToCorpusWithFile] Created node with id: " nId
409 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
411 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
414 pure $ JobLog { _scst_succeeded = Just 1
415 , _scst_failed = Just 0
416 , _scst_remaining = Just 0
417 , _scst_events = Just []