3 Description : Gargantext Import Corpus
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Import a corpus binary.
14 {-# LANGUAGE Strict #-}
18 import Control.Exception (finally)
20 import Data.Maybe (Maybe(..))
21 import Data.Text (Text)
22 import System.Environment (getArgs)
23 import qualified Data.Text as Text
24 import Text.Read (readMaybe)
26 import Gargantext.API.Dev (withDevEnv, runCmdGargDev)
27 import Gargantext.API.Admin.EnvTypes (DevEnv(..), DevJobHandle(..))
28 import Gargantext.API.Admin.Orchestrator.Types (JobLog)
29 import Gargantext.API.Node () -- instances
30 import Gargantext.API.Prelude (GargError)
31 import Gargantext.Core (Lang(..))
32 import Gargantext.Core.Types.Individu (User(..))
33 import Gargantext.Database.Action.Flow (flowCorpusFile, flowAnnuaire, TermType(..))
34 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
35 import Gargantext.Database.Admin.Types.Hyperdata (toHyperdataDocument)
36 import Gargantext.Database.Admin.Types.Node (CorpusId)
37 import Gargantext.Database.Prelude (Cmd)
38 import Gargantext.Prelude
39 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..))
40 import Gargantext.Utils.Jobs (MonadJobStatus, JobHandle)
44 [fun, user, name, iniPath, limit, corpusPath] <- getArgs
49 --tt = (Unsupervised EN 6 0 Nothing)
51 format = CsvGargV3 -- CsvHal --WOS
52 limit' = case (readMaybe limit :: Maybe Int) of
53 Nothing -> panic $ "Cannot read limit: " <> (Text.pack limit)
55 corpus :: forall m. (FlowCmdM DevEnv GargError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId
56 corpus = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) limit' tt format Plain corpusPath Nothing DevJobHandle
58 corpusCsvHal :: forall m. (FlowCmdM DevEnv GargError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId
59 corpusCsvHal = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) limit' tt CsvHal Plain corpusPath Nothing DevJobHandle
61 annuaire :: forall m. (FlowCmdM DevEnv GargError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId
62 annuaire = flowAnnuaire (UserName $ cs user) (Left "Annuaire") (Multi EN) corpusPath DevJobHandle
65 let debatCorpus :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
67 docs <- liftIO ( splitEvery 500
68 <$> take (read limit :: Int)
69 <$> readFile corpusPath
70 :: IO [[GrandDebatReference ]]
72 flowCorpus (Text.pack user) (Text.pack name) (Multi FR) (map (map toHyperdataDocument) docs)
75 withDevEnv iniPath $ \env -> do
76 _ <- if fun == "corpus"
77 then runCmdGargDev env corpus
78 else pure 0 --(cs "false")
80 _ <- if fun == "corpusCsvHal"
81 then runCmdGargDev env corpusCsvHal
82 else pure 0 --(cs "false")
84 _ <- if fun == "annuaire"
85 then runCmdGargDev env annuaire
88 _ <- if corpusType == "csv"
89 then runCmdDev env csvCorpus
90 else if corpusType == "debat"
91 then runCmdDev env debatCorpus
92 else panic "corpusType unknown: try \"csv\" or \"debat\""