{-|
Module      : Gargantext.API.Node.Corpus.New
Description : New corpus API
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

New corpus means either:
- a new corpus
- new data in an existing corpus
-}

{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators   #-}

module Gargantext.API.Node.Corpus.New
      where

import Control.Lens hiding (elements, Empty)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Maybe (fromMaybe)
import Data.Swagger
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics (Generic)
import qualified Prelude as Prelude
import Protolude (readFile)
import Servant
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary

import Gargantext.Prelude

import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
import Gargantext.API.Node.Corpus.New.File
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Mail (sendMail)
import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Gargantext.Database.GargDB as GargDB

------------------------------------------------------------------------
-- Not compiled: relies on 'elements', 'allLangs' and 'allDataOrigins' (imports commented out above).
{-
data Query = Query { query_query     :: Text
                   , query_node_id   :: Int
                   , query_lang      :: Lang
                   , query_databases :: [DataOrigin]
                   }
  deriving (Eq, Generic)

deriveJSON (unPrefix "query_") 'Query

instance Arbitrary Query where
  arbitrary = elements [ Query q n la fs
                       | q  <- ["honeybee* AND collapse"]
                       , n  <- [0..10]
                       , la <- allLangs
                       , fs <- take 3 $ repeat allDataOrigins
                       ]

instance ToSchema Query where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
-}

------------------------------------------------------------------------

-- Not compiled: depends on the disabled 'Query' type above.
{-
type PostApi = Summary "New Corpus endpoint"
             :> ReqBody '[JSON] Query
             :> Post '[JSON] CorpusId
type GetApi = Get '[JSON] ApiInfo
-}

-- | TODO manage several apis
-- TODO this is only the POST
{-
api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
api uid (Query q _ _ as) = do
  cId <- case head as of
    Nothing      -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just a       -> do
      docs <- liftBase $ API.get a q (Just 1000)
      cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
      pure cId'
  pure cId
-}

------------------------------------------------
-- TODO use this route for Client implementation
data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs] }
  deriving (Generic)

instance Arbitrary ApiInfo where
  arbitrary = ApiInfo <$> arbitrary

deriveJSON (unPrefix "") 'ApiInfo

instance ToSchema ApiInfo
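
-- | Expose the list of external document sources this instance can query.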
info :: FlowCmdM env err m => UserId -> m ApiInfo
info _u = pure $ ApiInfo API.externalAPIs

------------------------------------------------------------------------
------------------------------------------------------------------------
data WithQuery = WithQuery
  { _wq_query        :: !Text
  , _wq_databases    :: !Database
  , _wq_datafield    :: !(Maybe Datafield)
  , _wq_lang         :: !Lang
  , _wq_node_id      :: !Int
  , _wq_flowListWith :: !FlowSocialListWith
  }
  deriving Generic

makeLenses ''WithQuery

instance FromJSON WithQuery where
  parseJSON = genericParseJSON $ jsonOptions "_wq_"
instance ToJSON WithQuery where
  toJSON = genericToJSON $ jsonOptions "_wq_"
instance ToSchema WithQuery where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
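
-- A 'WithQuery' value is the request body of the "add with query" job below.
-- Assuming 'jsonOptions "_wq_"' simply strips the "_wq_" prefix from the field
-- names, a request body would look roughly like this (illustrative sketch
-- only, the values are made up):
--
-- > { "query"        : "honeybee* AND collapse"
-- > , "databases"    : "PubMed"
-- > , "datafield"    : "External"
-- > , "lang"         : "EN"
-- > , "node_id"      : 42
-- > , "flowListWith" : { ... }
-- > }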

------------------------------------------------------------------------

type AddWithQuery = Summary "Add with Query to corpus endpoint"
   :> Capture "corpus_id" CorpusId
   :> AsyncJobs JobLog '[JSON] WithQuery JobLog
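
-- The corresponding handler is 'addToCorpusWithQuery' below; it consumes the
-- 'WithQuery' body and reports progress through the returned 'JobLog's.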

-- Not compiled: needs Servant.Multipart (import commented out above);
-- 'AddWithFile' is redefined below with a FormUrlEncoded body.
{-
type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
   :> Capture "corpus_id" CorpusId
   :> MultipartForm Mem (MultipartData Mem)
   :> QueryParam "fileType" FileType
   :> AsyncJobs JobLog '[JSON] () JobLog
-}

------------------------------------------------------------------------
-- TODO WithQuery also has a corpus id
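-- | Search the selected external databases (or trigger a Searx web search
--   when the datafield is 'Web') with the given query, and flow the resulting
--   documents into the corpus, reporting progress through 'logStatus'.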
addToCorpusWithQuery :: FlowCmdM env err m
                     => User
                     -> CorpusId
                     -> WithQuery
                     -> Maybe Integer
                     -> (JobLog -> m ())
                     -> m JobLog
addToCorpusWithQuery user cid (WithQuery { _wq_query = q
                                         , _wq_databases = dbs
                                         , _wq_datafield = datafield
                                         , _wq_lang = l
                                         , _wq_flowListWith = flw }) maybeLimit logStatus = do
  logStatus JobLog { _scst_succeeded = Just 0
                   , _scst_failed    = Just 0
                   , _scst_remaining = Just 3
                   , _scst_events    = Just []
                   }

  printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
  printDebug "[addToCorpusWithQuery] datafield" datafield
  printDebug "[addToCorpusWithQuery] flowListWith" flw

  case datafield of
    Just Web -> do
      printDebug "[addToCorpusWithQuery] processing web request" datafield

      _ <- triggerSearxSearch cid q l

      pure JobLog { _scst_succeeded = Just 3
                  , _scst_failed    = Just 0
                  , _scst_remaining = Just 0
                  , _scst_events    = Just []
                  }

    _ -> do
      -- TODO if cid is folder -> create Corpus
      --      if cid is corpus -> add to corpus
      --      if cid is root   -> create corpus in Private
      txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]

      logStatus JobLog { _scst_succeeded = Just 2
                       , _scst_failed    = Just 0
                       , _scst_remaining = Just 1
                       , _scst_events    = Just []
                       }

      cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing) txts
      printDebug "corpus id" cids
      printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
      sendMail user

      pure JobLog { _scst_succeeded = Just 3
                  , _scst_failed    = Just 0
                  , _scst_remaining = Just 0
                  , _scst_events    = Just []
                  }

type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
   :> Capture "corpus_id" CorpusId
   :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
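
-- | Parse an uploaded file (CSV, CSV-HAL, WOS, RIS Presse or ZIP, as declared
--   by the form's 'FileType') and add the parsed documents to the corpus in
--   batches of 500.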
addToCorpusWithForm :: FlowCmdM env err m
                    => User
                    -> CorpusId
                    -> NewWithForm
                    -> (JobLog -> m ())
                    -> JobLog
                    -> m JobLog
addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
  printDebug "[addToCorpusWithForm] Parsing corpus: " cid
  printDebug "[addToCorpusWithForm] fileType" ft
  logStatus jobLog
  let parse = case ft of
        CSV_HAL   -> Parser.parseFormat Parser.CsvHal
        CSV       -> Parser.parseFormat Parser.CsvGargV3
        WOS       -> Parser.parseFormat Parser.WOS
        PresseRIS -> Parser.parseFormat Parser.RisPresse
        ZIP       -> Parser.parseFormat Parser.ZIP
  -- TODO granularity of the logStatus
  eDocs <- liftBase $ parse $ cs d
  case eDocs of
    Right docs' -> do
      let docs = splitEvery 500 $ take 1000000 docs'
      printDebug "Parsing corpus finished : " cid
      logStatus jobLog2
      printDebug "Starting extraction : " cid
      -- TODO granularity of the logStatus
      _cid' <- flowCorpus user
                          (Right [cid])
                          (Multi $ fromMaybe EN l)
                          (map (map toHyperdataDocument) docs)
      printDebug "Extraction finished : " cid
      printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
      sendMail user
      logStatus jobLog3
      pure jobLog3

    Left e -> do
      printDebug "[addToCorpusWithForm] parse error: " e
      logStatus jobLogE
      pure jobLogE

  where
    jobLog2 = jobLogSuccess jobLog
    jobLog3 = jobLogSuccess jobLog2
    jobLogE = jobLogFailTotal jobLog

parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
parseCsvGargV3Path fp = do
  contents <- readFile fp
  Parser.parseFormat Parser.CsvGargV3 $ cs contents
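
-- Example (GHCi), assuming a GargV3 CSV export at an illustrative path:
--
-- >>> eDocs <- parseCsvGargV3Path "/tmp/corpus.csv"
-- >>> either Prelude.putStrLn (Prelude.print . Prelude.length) eDocs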

-- Not compiled: the MultipartData-based upload needs Servant.Multipart; 'addToCorpusWithFile' is redefined below.
{-
addToCorpusWithFile :: FlowCmdM env err m
                    => CorpusId
                    -> MultipartData Mem
                    -> Maybe FileType
                    -> (JobLog -> m ())
                    -> m JobLog
addToCorpusWithFile cid input filetype logStatus = do
  logStatus JobLog { _scst_succeeded = Just 10
                   , _scst_failed    = Just 2
                   , _scst_remaining = Just 138
                   , _scst_events    = Just []
                   }
  printDebug "addToCorpusWithFile" cid
  _h <- postUpload cid filetype input

  pure JobLog { _scst_succeeded = Just 137
              , _scst_failed    = Just 13
              , _scst_remaining = Just 0
              , _scst_events    = Just []
              }
-}

type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
   :> Capture "corpus_id" CorpusId
   :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
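
-- | Save an uploaded file on disk via 'GargDB.writeFile', create a 'NodeFile'
--   node under the corpus and record the file's name and path in the node's
--   hyperdata.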
addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
                    => User
                    -> CorpusId
                    -> NewWithFile
                    -> (JobLog -> m ())
                    -> m JobLog
addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do

  printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
  logStatus JobLog { _scst_succeeded = Just 0
                   , _scst_failed    = Just 0
                   , _scst_remaining = Just 1
                   , _scst_events    = Just []
                   }

  fPath <- GargDB.writeFile nwf
  printDebug "[addToCorpusWithFile] File saved as: " fPath

  uId <- getUserId user
  nIds <- mkNodeWithParent NodeFile (Just cid) uId fName

  _ <- case nIds of
    [nId] -> do
      node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
      let hl = node ^. node_hyperdata
      _ <- updateHyperdata nId $ hl { _hff_name = fName
                                    , _hff_path = T.pack fPath }

      printDebug "[addToCorpusWithFile] Created node with id: " nId
    _ -> pure ()

  printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid

  printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
  sendMail user

  pure $ JobLog { _scst_succeeded = Just 1
                , _scst_failed    = Just 0
                , _scst_remaining = Just 0
                , _scst_events    = Just []
                }