2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
21 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson.TH (deriveJSON)
25 import Data.Maybe (fromMaybe)
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import qualified Prelude as Prelude
31 import Protolude (readFile)
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
38 import Gargantext.Prelude
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
43 import Gargantext.API.Node.Corpus.New.File
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import qualified Gargantext.Core.Text.Corpus.API as API
49 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
50 import Gargantext.Core.Types.Individu (User(..))
51 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
52 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
53 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
54 import Gargantext.Database.Action.Mail (sendMail)
55 import Gargantext.Database.Action.Node (mkNodeWithParent)
56 import Gargantext.Database.Action.User (getUserId)
57 import Gargantext.Database.Admin.Types.Hyperdata
58 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
59 import Gargantext.Database.Query.Table.Node (getNodeWith)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import qualified Gargantext.Database.GargDB as GargDB
64 ------------------------------------------------------------------------
66 data Query = Query { query_query :: Text
67 , query_node_id :: Int
69 , query_databases :: [DataOrigin]
71 deriving (Eq, Generic)
73 deriveJSON (unPrefix "query_") 'Query
75 instance Arbitrary Query where
76 arbitrary = elements [ Query q n la fs
77 | q <- ["honeybee* AND collapse"
82 , fs <- take 3 $ repeat allDataOrigins
85 instance ToSchema Query where
86 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
89 ------------------------------------------------------------------------
95 type PostApi = Summary "New Corpus endpoint"
96 :> ReqBody '[JSON] Query
97 :> Post '[JSON] CorpusId
98 type GetApi = Get '[JSON] ApiInfo
101 -- | TODO manage several apis
103 -- TODO this is only the POST
105 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
106 api uid (Query q _ as) = do
107 cId <- case head as of
108 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
109 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
111 docs <- liftBase $ API.get a q (Just 1000)
112 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
118 ------------------------------------------------
119 -- TODO use this route for Client implementation
120 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
122 instance Arbitrary ApiInfo where
123 arbitrary = ApiInfo <$> arbitrary
125 deriveJSON (unPrefix "") 'ApiInfo
127 instance ToSchema ApiInfo
129 info :: FlowCmdM env err m => UserId -> m ApiInfo
130 info _u = pure $ ApiInfo API.externalAPIs
132 ------------------------------------------------------------------------
133 ------------------------------------------------------------------------
134 data WithQuery = WithQuery
136 , _wq_databases :: !Database
137 , _wq_datafield :: !Datafield
139 , _wq_node_id :: !Int
140 -- , _wq_flowListWith :: !FlowSocialListWith
144 makeLenses ''WithQuery
145 instance FromJSON WithQuery where
146 parseJSON = genericParseJSON $ jsonOptions "_wq_"
147 instance ToJSON WithQuery where
148 toJSON = genericToJSON $ jsonOptions "_wq_"
149 instance ToSchema WithQuery where
150 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
152 ------------------------------------------------------------------------
154 type AddWithQuery = Summary "Add with Query to corpus endpoint"
156 :> Capture "corpus_id" CorpusId
158 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
161 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
163 :> Capture "corpus_id" CorpusId
166 :> MultipartForm Mem (MultipartData Mem)
167 :> QueryParam "fileType" FileType
169 :> AsyncJobs JobLog '[JSON] () JobLog
173 ------------------------------------------------------------------------
174 -- TODO WithQuery also has a corpus id
175 addToCorpusWithQuery :: FlowCmdM env err m
182 addToCorpusWithQuery user cid (WithQuery q dbs datafield l _nid) maybeLimit logStatus = do
184 logStatus JobLog { _scst_succeeded = Just 0
185 , _scst_failed = Just 0
186 , _scst_remaining = Just 3
187 , _scst_events = Just []
189 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
190 printDebug "[addToCorpusWithQuery] datafield" datafield
194 printDebug "[addToCorpusWithQuery] processing web request" datafield
196 _ <- triggerSearxSearch cid q l
198 pure JobLog { _scst_succeeded = Just 3
199 , _scst_failed = Just 0
200 , _scst_remaining = Just 0
201 , _scst_events = Just []
206 -- TODO if cid is folder -> create Corpus
207 -- if cid is corpus -> add to corpus
208 -- if cid is root -> create corpus in Private
209 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
211 logStatus JobLog { _scst_succeeded = Just 2
212 , _scst_failed = Just 0
213 , _scst_remaining = Just 1
214 , _scst_events = Just []
217 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing) txts
218 printDebug "corpus id" cids
219 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
222 pure JobLog { _scst_succeeded = Just 3
223 , _scst_failed = Just 0
224 , _scst_remaining = Just 0
225 , _scst_events = Just []
229 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
231 :> Capture "corpus_id" CorpusId
235 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
237 addToCorpusWithForm :: FlowCmdM env err m
244 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
245 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
246 printDebug "[addToCorpusWithForm] fileType" ft
250 CSV_HAL -> Parser.parseFormat Parser.CsvHal
251 CSV -> Parser.parseFormat Parser.CsvGargV3
252 WOS -> Parser.parseFormat Parser.WOS
253 PresseRIS -> Parser.parseFormat Parser.RisPresse
255 -- TODO granularity of the logStatus
256 eDocs <- liftBase $ parse $ cs d
259 let docs = splitEvery 500 $ take 1000000 docs'
261 printDebug "Parsing corpus finished : " cid
264 printDebug "Starting extraction : " cid
265 -- TODO granularity of the logStatus
266 _cid' <- flowCorpus user
268 (Multi $ fromMaybe EN l)
270 (map (map toHyperdataDocument) docs)
272 printDebug "Extraction finished : " cid
273 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
284 jobLog2 = jobLogSuccess jobLog
285 jobLog3 = jobLogSuccess jobLog2
286 jobLogE = jobLogFailTotal jobLog
288 parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
289 parseCsvGargV3Path fp = do
290 contents <- readFile fp
291 Parser.parseFormat Parser.CsvGargV3 $ cs contents
294 addToCorpusWithFile :: FlowCmdM env err m
300 addToCorpusWithFile cid input filetype logStatus = do
301 logStatus JobLog { _scst_succeeded = Just 10
302 , _scst_failed = Just 2
303 , _scst_remaining = Just 138
304 , _scst_events = Just []
306 printDebug "addToCorpusWithFile" cid
307 _h <- postUpload cid filetype input
309 pure JobLog { _scst_succeeded = Just 137
310 , _scst_failed = Just 13
311 , _scst_remaining = Just 0
312 , _scst_events = Just []
318 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
320 :> Capture "corpus_id" CorpusId
324 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
326 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
332 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
334 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
335 logStatus JobLog { _scst_succeeded = Just 0
336 , _scst_failed = Just 0
337 , _scst_remaining = Just 1
338 , _scst_events = Just []
341 fPath <- GargDB.writeFile nwf
342 printDebug "[addToCorpusWithFile] File saved as: " fPath
344 uId <- getUserId user
345 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
349 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
350 let hl = node ^. node_hyperdata
351 _ <- updateHyperdata nId $ hl { _hff_name = fName
352 , _hff_path = T.pack fPath }
354 printDebug "[addToCorpusWithFile] Created node with id: " nId
357 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
359 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
362 pure $ JobLog { _scst_succeeded = Just 1
363 , _scst_failed = Just 0
364 , _scst_remaining = Just 0
365 , _scst_events = Just []