{-|
Module      : Gargantext.API.Node.Corpus.New
Description : New corpus API
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

New corpus means either:
- a new corpus
- new data in an existing corpus
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators   #-}

module Gargantext.API.Node.Corpus.New
  where
import Control.Lens hiding (elements, Empty)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.Maybe (fromMaybe)
import Data.Swagger
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics (Generic)
import qualified Prelude as Prelude
import Protolude (readFile)
import Servant
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary
import Gargantext.Prelude

import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
import Gargantext.API.Node.Corpus.New.File
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Mail (sendMail)
import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Gargantext.Database.GargDB as GargDB
------------------------------------------------------------------------
-- Old synchronous corpus-creation query, kept commented out: it relies on
-- 'elements', 'allLangs' and 'allDataOrigins', which are not in scope above.
{-
data Query = Query { query_query     :: Text
                   , query_node_id   :: Int
                   , query_lang      :: Lang
                   , query_databases :: [DataOrigin]
                   }
                   deriving (Eq, Generic)

deriveJSON (unPrefix "query_") 'Query

instance Arbitrary Query where
    arbitrary = elements [ Query q n la fs
                         | q  <- ["honeybee* AND collapse"]
                         , n  <- [0..10]
                         , la <- allLangs
                         , fs <- take 3 $ repeat allDataOrigins
                         ]

instance ToSchema Query where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
-}
------------------------------------------------------------------------
-- Old synchronous corpus-creation API, also kept commented out for reference
-- (it refers to the 'Query' type above and to 'flowCorpusSearchInDatabase',
-- which is not imported here).
{-
type PostApi = Summary "New Corpus endpoint"
             :> ReqBody '[JSON] Query
             :> Post '[JSON] CorpusId
type GetApi = Get '[JSON] ApiInfo
-}

-- | TODO manage several apis
-- TODO this is only the POST
{-
api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
api uid (Query q _ as) = do
  cId <- case head as of
    Nothing      -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just a       -> do
      docs <- liftBase $ API.get a q (Just 1000)
      cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
      pure cId'
  pure cId
-}
------------------------------------------------
-- TODO use this route for Client implementation
data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs] }
  deriving (Generic)

instance Arbitrary ApiInfo where
  arbitrary = ApiInfo <$> arbitrary

deriveJSON (unPrefix "") 'ApiInfo

instance ToSchema ApiInfo

info :: FlowCmdM env err m => UserId -> m ApiInfo
info _u = pure $ ApiInfo API.externalAPIs
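
-- 'info' advertises the external document sources this instance can query
-- ('API.externalAPIs'). With 'unPrefix ""' the field name is presumably kept
-- as-is, so the JSON answer should look roughly like (shape illustrative):
--
-- > { "api_info": [ ... ] }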
------------------------------------------------------------------------
------------------------------------------------------------------------
data WithQuery = WithQuery
  { _wq_query        :: !Text
  , _wq_databases    :: !Database
  , _wq_datafield    :: !(Maybe Datafield)
  , _wq_lang         :: !Lang
  , _wq_node_id      :: !Int
  , _wq_flowListWith :: !FlowSocialListWith
  }
  deriving Generic

makeLenses ''WithQuery

instance FromJSON WithQuery where
  parseJSON = genericParseJSON $ jsonOptions "_wq_"
instance ToJSON WithQuery where
  toJSON = genericToJSON $ jsonOptions "_wq_"
instance ToSchema WithQuery where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
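
-- Assuming 'jsonOptions "_wq_"' strips the "_wq_" prefix from field names,
-- a request body for this type has roughly the following shape (all values
-- below are illustrative, not the exact encodings of the field types):
--
-- > { "query":        "honeybee* AND collapse"
-- > , "databases":    <Database>
-- > , "datafield":    <Datafield or null>
-- > , "lang":         <Lang>
-- > , "node_id":      42
-- > , "flowListWith": <FlowSocialListWith>
-- > }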
------------------------------------------------------------------------

type AddWithQuery = Summary "Add with Query to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "query"
     :> AsyncJobs JobLog '[JSON] WithQuery JobLog
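
-- Asynchronous corpus-building route (servant-job 'AsyncJobs'): the job takes
-- a 'WithQuery' payload as input and reports progress and final state as
-- 'JobLog' values (see 'addToCorpusWithQuery' below).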

{-
-- Old multipart route: needs Servant.Multipart and clashes with 'AddWithFile' below.
type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "add"
   :> "file"
     :> MultipartForm Mem (MultipartData Mem)
     :> QueryParam "fileType" FileType
   :> "async"
     :> AsyncJobs JobLog '[JSON] () JobLog
-}
------------------------------------------------------------------------
-- TODO WithQuery also has a corpus id
addToCorpusWithQuery :: FlowCmdM env err m
                     => User
                     -> CorpusId
                     -> WithQuery
                     -> Maybe Integer
                     -> (JobLog -> m ())
                     -> m JobLog
addToCorpusWithQuery user cid (WithQuery { _wq_query = q
                                         , _wq_databases = dbs
                                         , _wq_datafield = datafield
                                         , _wq_lang = l
                                         , _wq_flowListWith = flw }) maybeLimit logStatus = do
  logStatus JobLog { _scst_succeeded = Just 0
                   , _scst_failed    = Just 0
                   , _scst_remaining = Just 3
                   , _scst_events    = Just []
                   }
  printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
  printDebug "[addToCorpusWithQuery] datafield" datafield
  printDebug "[addToCorpusWithQuery] flowListWith" flw

  case datafield of
    Just Web -> do
      printDebug "[addToCorpusWithQuery] processing web request" datafield

      _ <- triggerSearxSearch cid q l

      pure JobLog { _scst_succeeded = Just 3
                  , _scst_failed    = Just 0
                  , _scst_remaining = Just 0
                  , _scst_events    = Just []
                  }

    _ -> do
      -- TODO if cid is folder -> create Corpus
      --      if cid is corpus -> add to corpus
      --      if cid is root   -> create corpus in Private
      txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]

      logStatus JobLog { _scst_succeeded = Just 2
                       , _scst_failed    = Just 0
                       , _scst_remaining = Just 1
                       , _scst_events    = Just []
                       }

      cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing) txts
      printDebug "corpus id" cids
      printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
      sendMail user

      pure JobLog { _scst_succeeded = Just 3
                  , _scst_failed    = Just 0
                  , _scst_remaining = Just 0
                  , _scst_events    = Just []
                  }
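
-- Job accounting above: the job starts with 3 remaining steps; once the texts
-- have been fetched, 2 steps are marked succeeded with 1 remaining; the final
-- JobLog reports 3 succeeded / 0 remaining (the Web branch jumps directly to
-- that final state after triggering the Searx search).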

type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "add"
   :> "form"
   :> "async"
     :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
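
-- 'NewWithForm' (from Gargantext.API.Node.Types) is matched below as
-- (NewWithForm ft d l _n): the declared file type, the raw file contents,
-- an optional language (defaulting to EN) and a name (unused here).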
addToCorpusWithForm :: FlowCmdM env err m
                    => User
                    -> CorpusId
                    -> NewWithForm
                    -> (JobLog -> m ())
                    -> JobLog
                    -> m JobLog
addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
  printDebug "[addToCorpusWithForm] Parsing corpus: " cid
  printDebug "[addToCorpusWithForm] fileType" ft
  logStatus jobLog
  let
    parse = case ft of
      CSV_HAL   -> Parser.parseFormat Parser.CsvHal
      CSV       -> Parser.parseFormat Parser.CsvGargV3
      WOS       -> Parser.parseFormat Parser.WOS
      PresseRIS -> Parser.parseFormat Parser.RisPresse
  -- TODO granularity of the logStatus
  eDocs <- liftBase $ parse $ cs d
  case eDocs of
    Right docs' -> do
      let docs = splitEvery 500 $ take 1000000 docs'
      printDebug "Parsing corpus finished : " cid
      logStatus jobLog2

      printDebug "Starting extraction : " cid
      -- TODO granularity of the logStatus
      _cid' <- flowCorpus user
                          (Right [cid])
                          (Multi $ fromMaybe EN l)
                          Nothing
                          (map (map toHyperdataDocument) docs)
      printDebug "Extraction finished : " cid
      printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
      sendMail user

      logStatus jobLog3
      pure jobLog3
    Left e -> do
      printDebug "[addToCorpusWithForm] parse error" e
      logStatus jobLogE
      pure jobLogE
  where
    jobLog2 = jobLogSuccess jobLog
    jobLog3 = jobLogSuccess jobLog2
    jobLogE = jobLogFailTotal jobLog

parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
parseCsvGargV3Path fp = do
  contents <- readFile fp
  Parser.parseFormat Parser.CsvGargV3 $ cs contents
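
-- Usage sketch (the path is illustrative only), e.g. from GHCi, assuming the
-- file is a GarganText CSV v3 export:
--
-- > eDocs <- parseCsvGargV3Path "/tmp/corpus.csv"
-- > -- eDocs :: Either Prelude.String [HyperdataDocument]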

{-
-- Old multipart upload handler: clashes with the 'addToCorpusWithFile' below.
addToCorpusWithFile :: FlowCmdM env err m
                    => CorpusId
                    -> MultipartData Mem
                    -> Maybe FileType
                    -> (JobLog -> m ())
                    -> m JobLog
addToCorpusWithFile cid input filetype logStatus = do
  logStatus JobLog { _scst_succeeded = Just 10
                   , _scst_failed    = Just 2
                   , _scst_remaining = Just 138
                   , _scst_events    = Just []
                   }
  printDebug "addToCorpusWithFile" cid
  _h <- postUpload cid filetype input

  pure JobLog { _scst_succeeded = Just 137
              , _scst_failed    = Just 13
              , _scst_remaining = Just 0
              , _scst_events    = Just []
              }
-}

type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "add"
   :> "file"
   :> "async"
     :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
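
-- | Upload a file into a corpus: the payload is written to disk with
-- 'GargDB.writeFile', a 'NodeFile' child of the corpus is created with
-- 'mkNodeWithParent', and the node's hyperdata is updated to record the
-- file name and stored path.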
addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
                    => User
                    -> CorpusId
                    -> NewWithFile
                    -> (JobLog -> m ())
                    -> m JobLog
addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do

  printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
  logStatus JobLog { _scst_succeeded = Just 0
                   , _scst_failed    = Just 0
                   , _scst_remaining = Just 1
                   , _scst_events    = Just []
                   }

  fPath <- GargDB.writeFile nwf
  printDebug "[addToCorpusWithFile] File saved as: " fPath

  uId <- getUserId user
  nIds <- mkNodeWithParent NodeFile (Just cid) uId fName

  _ <- case nIds of
    [nId] -> do
      node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
      let hl = node ^. node_hyperdata
      _ <- updateHyperdata nId $ hl { _hff_name = fName
                                    , _hff_path = T.pack fPath }
      printDebug "[addToCorpusWithFile] Created node with id: " nId
    _ -> pure ()

  printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
  printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
  sendMail user

  pure $ JobLog { _scst_succeeded = Just 1
                , _scst_failed    = Just 0
                , _scst_remaining = Just 0
                , _scst_events    = Just []
                }