2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
21 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson.TH (deriveJSON)
25 import Data.Maybe (fromMaybe)
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import qualified Prelude as Prelude
31 import Protolude (readFile)
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
38 import Gargantext.Prelude
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
43 import Gargantext.API.Node.Corpus.New.File
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import qualified Gargantext.Core.Text.Corpus.API as API
50 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
51 import Gargantext.Core.Types.Individu (User(..))
52 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
53 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
54 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
55 import Gargantext.Database.Action.Mail (sendMail)
56 import Gargantext.Database.Action.Node (mkNodeWithParent)
57 import Gargantext.Database.Action.User (getUserId)
58 import Gargantext.Database.Admin.Types.Hyperdata
59 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
60 import Gargantext.Database.Query.Table.Node (getNodeWith)
61 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
62 import Gargantext.Database.Schema.Node (node_hyperdata)
63 import qualified Gargantext.Database.GargDB as GargDB
65 ------------------------------------------------------------------------
67 data Query = Query { query_query :: Text
68 , query_node_id :: Int
70 , query_databases :: [DataOrigin]
72 deriving (Eq, Generic)
74 deriveJSON (unPrefix "query_") 'Query
76 instance Arbitrary Query where
77 arbitrary = elements [ Query q n la fs
78 | q <- ["honeybee* AND collapse"
83 , fs <- take 3 $ repeat allDataOrigins
86 instance ToSchema Query where
87 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
90 ------------------------------------------------------------------------
96 type PostApi = Summary "New Corpus endpoint"
97 :> ReqBody '[JSON] Query
98 :> Post '[JSON] CorpusId
99 type GetApi = Get '[JSON] ApiInfo
102 -- | TODO manage several apis
104 -- TODO this is only the POST
106 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
107 api uid (Query q _ as) = do
108 cId <- case head as of
109 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
110 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
112 docs <- liftBase $ API.get a q (Just 1000)
113 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
119 ------------------------------------------------
120 -- TODO use this route for Client implementation
121 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
123 instance Arbitrary ApiInfo where
124 arbitrary = ApiInfo <$> arbitrary
126 deriveJSON (unPrefix "") 'ApiInfo
128 instance ToSchema ApiInfo
130 info :: FlowCmdM env err m => UserId -> m ApiInfo
131 info _u = pure $ ApiInfo API.externalAPIs
133 ------------------------------------------------------------------------
134 ------------------------------------------------------------------------
135 data WithQuery = WithQuery
137 , _wq_databases :: !Database
138 , _wq_datafield :: !(Maybe Datafield)
140 , _wq_node_id :: !Int
141 , _wq_flowListWith :: !FlowSocialListWith
145 makeLenses ''WithQuery
146 instance FromJSON WithQuery where
147 parseJSON = genericParseJSON $ jsonOptions "_wq_"
148 instance ToJSON WithQuery where
149 toJSON = genericToJSON $ jsonOptions "_wq_"
150 instance ToSchema WithQuery where
151 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
153 ------------------------------------------------------------------------
155 type AddWithQuery = Summary "Add with Query to corpus endpoint"
157 :> Capture "corpus_id" CorpusId
159 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
162 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
164 :> Capture "corpus_id" CorpusId
167 :> MultipartForm Mem (MultipartData Mem)
168 :> QueryParam "fileType" FileType
170 :> AsyncJobs JobLog '[JSON] () JobLog
174 ------------------------------------------------------------------------
175 -- TODO WithQuery also has a corpus id
176 addToCorpusWithQuery :: FlowCmdM env err m
183 addToCorpusWithQuery user cid (WithQuery q dbs datafield l _nid) maybeLimit logStatus = do
185 logStatus JobLog { _scst_succeeded = Just 0
186 , _scst_failed = Just 0
187 , _scst_remaining = Just 3
188 , _scst_events = Just []
190 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
191 printDebug "[addToCorpusWithQuery] datafield" datafield
195 printDebug "[addToCorpusWithQuery] processing web request" datafield
197 _ <- triggerSearxSearch cid q l
199 pure JobLog { _scst_succeeded = Just 3
200 , _scst_failed = Just 0
201 , _scst_remaining = Just 0
202 , _scst_events = Just []
207 -- TODO if cid is folder -> create Corpus
208 -- if cid is corpus -> add to corpus
209 -- if cid is root -> create corpus in Private
210 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
212 logStatus JobLog { _scst_succeeded = Just 2
213 , _scst_failed = Just 0
214 , _scst_remaining = Just 1
215 , _scst_events = Just []
218 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing) txts
219 printDebug "corpus id" cids
220 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
223 pure JobLog { _scst_succeeded = Just 3
224 , _scst_failed = Just 0
225 , _scst_remaining = Just 0
226 , _scst_events = Just []
230 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
232 :> Capture "corpus_id" CorpusId
236 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
238 addToCorpusWithForm :: FlowCmdM env err m
245 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
246 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
247 printDebug "[addToCorpusWithForm] fileType" ft
251 CSV_HAL -> Parser.parseFormat Parser.CsvHal
252 CSV -> Parser.parseFormat Parser.CsvGargV3
253 WOS -> Parser.parseFormat Parser.WOS
254 PresseRIS -> Parser.parseFormat Parser.RisPresse
256 -- TODO granularity of the logStatus
257 eDocs <- liftBase $ parse $ cs d
260 let docs = splitEvery 500 $ take 1000000 docs'
262 printDebug "Parsing corpus finished : " cid
265 printDebug "Starting extraction : " cid
266 -- TODO granularity of the logStatus
267 _cid' <- flowCorpus user
269 (Multi $ fromMaybe EN l)
271 (map (map toHyperdataDocument) docs)
273 printDebug "Extraction finished : " cid
274 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
285 jobLog2 = jobLogSuccess jobLog
286 jobLog3 = jobLogSuccess jobLog2
287 jobLogE = jobLogFailTotal jobLog
289 parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
290 parseCsvGargV3Path fp = do
291 contents <- readFile fp
292 Parser.parseFormat Parser.CsvGargV3 $ cs contents
295 addToCorpusWithFile :: FlowCmdM env err m
301 addToCorpusWithFile cid input filetype logStatus = do
302 logStatus JobLog { _scst_succeeded = Just 10
303 , _scst_failed = Just 2
304 , _scst_remaining = Just 138
305 , _scst_events = Just []
307 printDebug "addToCorpusWithFile" cid
308 _h <- postUpload cid filetype input
310 pure JobLog { _scst_succeeded = Just 137
311 , _scst_failed = Just 13
312 , _scst_remaining = Just 0
313 , _scst_events = Just []
319 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
321 :> Capture "corpus_id" CorpusId
325 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
327 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
333 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
335 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
336 logStatus JobLog { _scst_succeeded = Just 0
337 , _scst_failed = Just 0
338 , _scst_remaining = Just 1
339 , _scst_events = Just []
342 fPath <- GargDB.writeFile nwf
343 printDebug "[addToCorpusWithFile] File saved as: " fPath
345 uId <- getUserId user
346 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
350 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
351 let hl = node ^. node_hyperdata
352 _ <- updateHyperdata nId $ hl { _hff_name = fName
353 , _hff_path = T.pack fPath }
355 printDebug "[addToCorpusWithFile] Created node with id: " nId
358 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
360 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
363 pure $ JobLog { _scst_succeeded = Just 1
364 , _scst_failed = Just 0
365 , _scst_remaining = Just 0
366 , _scst_events = Just []