module Main where
+-- import Debug.Trace (trace)
import Control.Concurrent.Async (mapConcurrently)
import Crypto.Hash.SHA256 (hash)
import Data.Aeson
-import Data.Either (Either(..))
+import Data.Either (Either(..), fromRight)
import Data.List (concat, nub, isSuffixOf)
+import Data.List.Split
import Data.Maybe (fromMaybe)
import Data.String (String)
-import GHC.IO (FilePath)
-import qualified Prelude as Prelude
-import System.Environment
-import System.Directory (listDirectory,doesFileExist)
import Data.Text (Text, unwords, unpack, replace, pack)
-import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
-
-import qualified Data.ByteString.Char8 as C8
-import qualified Data.ByteString.Lazy as Lazy
-import qualified Data.Vector as Vector
-import qualified Data.Text as T
-
-import Gargantext.Prelude
-import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
+import GHC.IO (FilePath)
+import Gargantext.API.Ngrams.Prelude (toTermList)
+import Gargantext.API.Ngrams.Types
import Gargantext.Core.Text.Context (TermList)
-import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day,
- csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
-import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
-import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
+import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), parseFile)
+import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
+import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Core.Viz.Phylo
-import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloStep)
-import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
+import Gargantext.Core.Viz.Phylo.API.Tools
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
--- import Gargantext.API.Ngrams.Prelude (toTermList)
-
--- import Debug.Trace (trace)
+import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink)
+import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
+import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
+import Gargantext.Database.Schema.Ngrams (NgramsType(..))
+import Gargantext.Prelude
+import System.Directory (listDirectory,doesFileExist)
+import System.Environment
+import qualified Data.ByteString.Char8 as C8
+import qualified Data.Text as T
+import qualified Data.Vector as Vector
+import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
-data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
+data Backup = BackupPhyloWithoutLink | BackupPhylo deriving (Show)
---------------
-- | Tools | --
---------------
-
-- | To get all the files in a directory or just a file
-getFilesFromPath :: FilePath -> IO([FilePath])
+getFilesFromPath :: FilePath -> IO [FilePath]
getFilesFromPath path = do
if (isSuffixOf "/" path)
then (listDirectory path)
else return [path]
----------------
--- | Dates | --
----------------
-toMonths :: Integer -> Int -> Int -> Date
-toMonths y m d = fromIntegral $ cdMonths
- $ diffGregorianDurationClip (fromGregorian y m d) (fromGregorian 0000 0 0)
-
-
-toDays :: Integer -> Int -> Int -> Date
-toDays y m d = fromIntegral
- $ diffDays (fromGregorian y m d) (fromGregorian 0000 0 0)
-
-
-toPhyloDate :: Int -> Int -> Int -> TimeUnit -> Date
-toPhyloDate y m d tu = case tu of
- Year _ _ _ -> y
- Month _ _ _ -> toMonths (Prelude.toInteger y) m d
- Week _ _ _ -> div (toDays (Prelude.toInteger y) m d) 7
- Day _ _ _ -> toDays (Prelude.toInteger y) m d
-
-
--- Function to use in Database export
-toPhyloDate' :: Int -> Int -> Int -> Text
-toPhyloDate' y m d = pack $ showGregorian $ fromGregorian (Prelude.toInteger y) m d
-
-
---------------
--- | Json | --
---------------
-
-
--- | To read and decode a Json file
-readJson :: FilePath -> IO Lazy.ByteString
-readJson path = Lazy.readFile path
-
-
----------------
-- | Parser | --
----------------
-- | To transform a Wos file (or [file]) into a list of Docs
wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
wosToDocs limit patterns time path = do
- files <- getFilesFromPath path
- let parseFile' file = do
- eParsed <- parseFile WOS (path <> file)
- case eParsed of
- Right ps -> pure ps
- Left e -> panic $ "Error: " <> (pack e)
- take limit
- <$> map (\d -> let title = fromJust $ _hd_title d
- abstr = if (isJust $ _hd_abstract d)
- then fromJust $ _hd_abstract d
- else ""
- in Document (toPhyloDate
- (fromIntegral $ fromJust $ _hd_publication_year d)
- (fromJust $ _hd_publication_month d)
- (fromJust $ _hd_publication_day d) time)
- (toPhyloDate'
- (fromIntegral $ fromJust $ _hd_publication_year d)
- (fromJust $ _hd_publication_month d)
- (fromJust $ _hd_publication_day d))
- (termsInText patterns $ title <> " " <> abstr) Nothing [])
- <$> concat
- <$> mapConcurrently (\file ->
- filter (\d -> (isJust $ _hd_publication_year d)
- && (isJust $ _hd_title d))
- <$> parseFile' file) files
+ files <- getFilesFromPath path
+ take limit
+ <$> map (\d -> let title = fromJust $ _hd_title d
+ abstr = if (isJust $ _hd_abstract d)
+ then fromJust $ _hd_abstract d
+ else ""
+ in Document (toPhyloDate
+ (fromIntegral $ fromJust $ _hd_publication_year d)
+ (fromJust $ _hd_publication_month d)
+ (fromJust $ _hd_publication_day d) time)
+ (toPhyloDate'
+ (fromIntegral $ fromJust $ _hd_publication_year d)
+ (fromJust $ _hd_publication_month d)
+ (fromJust $ _hd_publication_day d) time)
+ (termsInText patterns $ title <> " " <> abstr) Nothing [])
+ <$> concat
+ <$> mapConcurrently (\file ->
+ filter (\d -> (isJust $ _hd_publication_year d)
+ && (isJust $ _hd_title d))
+ <$> fromRight [] <$> parseFile WOS Plain (path <> file) ) files
-- To transform a Csv file into a list of Document
csvToDocs parser patterns time path =
case parser of
Wos _ -> undefined
- Csv limit -> do
- eR <- Csv.readFile path
- case eR of
- Right r ->
- pure $ Vector.toList
- $ Vector.take limit
- $ Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row)
- (fromMaybe Csv.defaultMonth $ csv_publication_month row)
- (fromMaybe Csv.defaultDay $ csv_publication_day row)
- time)
- (toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row)
- (fromMaybe Csv.defaultMonth $ csv_publication_month row)
- (fromMaybe Csv.defaultDay $ csv_publication_day row))
- (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
- Nothing
- []
- ) $ snd r
- Left e -> panic $ "Error: " <> (pack e)
+ Csv limit -> Vector.toList
+ <$> Vector.take limit
+ <$> Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
+ (toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
+ (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
+ Nothing
+ []
+ ) <$> snd <$> either (\err -> panic $ cs $ "CSV error" <> (show err)) identity <$> Csv.readCSVFile path
Csv' limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
- (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row))
+ (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
(termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
(Just $ csv'_weight row)
- [csv'_source row]
+ (map (T.strip . pack) $ splitOn ";" (unpack $ (csv'_source row)))
) <$> snd <$> Csv.readWeightedCsv path
-- Config time parameters to label
-timeToLabel :: Config -> [Char]
+timeToLabel :: PhyloConfig -> [Char]
timeToLabel config = case (timeUnit config) of
- Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
- Month p s f -> ("time_months"<> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
- Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
- Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
+ Epoch p s f -> ("time_epochs" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
+ Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
+ Month p s f -> ("time_months" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
+ Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
+ Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
-seaToLabel :: Config -> [Char]
+seaToLabel :: PhyloConfig -> [Char]
seaToLabel config = case (seaElevation config) of
Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
Adaptative granularity -> ("sea_adapt" <> (show granularity))
-sensToLabel :: Config -> [Char]
+sensToLabel :: PhyloConfig -> [Char]
sensToLabel config = case (phyloProximity config) of
- Hamming -> undefined
- WeightedLogJaccard s -> ("WeightedLogJaccard_" <> show s)
- WeightedLogSim s -> ( "WeightedLogSim-sens_" <> show s)
+ Hamming _ _ -> undefined
+ WeightedLogJaccard s _ -> ("WeightedLogJaccard_" <> show s)
+ WeightedLogSim s _ -> ( "WeightedLogSim-sens_" <> show s)
-cliqueToLabel :: Config -> [Char]
+cliqueToLabel :: PhyloConfig -> [Char]
cliqueToLabel config = case (clique config) of
Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
-syncToLabel :: Config -> [Char]
+syncToLabel :: PhyloConfig -> [Char]
syncToLabel config = case (phyloSynchrony config) of
ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
ByProximityDistribution _ _ -> undefined
-qualToConfig :: Config -> [Char]
+qualToConfig :: PhyloConfig -> [Char]
qualToConfig config = case (phyloQuality config) of
Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
-- To set up the export file's label from the configuration
-configToLabel :: Config -> [Char]
+configToLabel :: PhyloConfig -> [Char]
configToLabel config = outputPath config
<> (unpack $ phyloName config)
<> "-" <> (timeToLabel config)
- <> "-scale_" <> (show (phyloLevel config))
+ <> "-scale_" <> (show (phyloScale config))
<> "-" <> (seaToLabel config)
<> "-" <> (sensToLabel config)
<> "-" <> (cliqueToLabel config)
-- To write a sha256 from a set of config's parameters
-configToSha :: PhyloStage -> Config -> [Char]
+configToSha :: Backup -> PhyloConfig -> [Char]
configToSha stage config = unpack
$ replace "/" "-"
$ T.pack (show (hash $ C8.pack label))
where
label :: [Char]
label = case stage of
- PhyloWithCliques -> (corpusPath config)
+ BackupPhyloWithoutLink -> (corpusPath config)
<> (listPath config)
<> (timeToLabel config)
<> (cliqueToLabel config)
- PhyloWithLinks -> (corpusPath config)
+ BackupPhylo -> (corpusPath config)
<> (listPath config)
<> (timeToLabel config)
<> (cliqueToLabel config)
<> (seaToLabel config)
<> (syncToLabel config)
<> (qualToConfig config)
- <> (show (phyloLevel config))
-
-
-writePhylo :: [Char] -> Phylo -> IO ()
-writePhylo path phylo = Lazy.writeFile path $ encode phylo
+ <> (show (phyloScale config))
-readPhylo :: [Char] -> IO Phylo
-readPhylo path = do
- phyloJson <- (eitherDecode <$> readJson path) :: IO (Either String Phylo)
- case phyloJson of
+readListV4 :: [Char] -> IO NgramsList
+readListV4 path = do
+ listJson <- (eitherDecode <$> readJson path) :: IO (Either String NgramsList)
+ case listJson of
Left err -> do
putStrLn err
undefined
- Right phylo -> pure phylo
+ Right listV4 -> pure listV4
+
+
+fileToList :: ListParser -> FilePath -> IO TermList
+fileToList parser path =
+ case parser of
+ V3 -> csvMapTermList path
+ V4 -> fromJust
+ <$> toTermList MapTerm NgramsTerms
+ <$> readListV4 path
--------------
printIOMsg "Read the configuration file"
[args] <- getArgs
- jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
+ jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String PhyloConfig)
case jsonArgs of
Left err -> putStrLn err
Right config -> do
printIOMsg "Parse the corpus"
- mapList <- csvMapTermList (listPath config)
+ mapList <- fileToList (listParser config) (listPath config)
corpus <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
printIOComment (show (length corpus) <> " parsed docs from the corpus")
printIOMsg "Reconstruct the phylo"
- let phyloWithCliquesFile = (outputPath config) <> "phyloWithCliques_" <> (configToSha PhyloWithCliques config) <> ".json"
- let phyloWithLinksFile = (outputPath config) <> "phyloWithLinks_" <> (configToSha PhyloWithLinks config) <> ".json"
-
- phyloWithCliquesExists <- doesFileExist phyloWithCliquesFile
- phyloWithLinksExists <- doesFileExist phyloWithLinksFile
-
- -- phyloStep <- if phyloWithCliquesExists
- -- then do
- -- printIOMsg "Reconstruct the phylo step from an existing file"
- -- readPhylo phyloWithCliquesFile
- -- else do
- -- printIOMsg "Reconstruct the phylo step from scratch"
- -- pure $ toPhyloStep corpus mapList config
-
- -- writePhylo phyloWithCliquesFile phyloStep
-
- -- let phylo = toPhylo (setConfig config phyloStep)
-
- -- QL: 2 files read from disk
- phyloWithLinks <- if phyloWithLinksExists
- then do
- printIOMsg "Reconstruct the phylo from an existing file with intertemporal links"
- readPhylo phyloWithLinksFile
- else do
- if phyloWithCliquesExists
- then do
- printIOMsg "Reconstruct the phylo from an existing file with cliques"
- phyloWithCliques <- readPhylo phyloWithCliquesFile
- writePhylo phyloWithCliquesFile phyloWithCliques
- pure $ toPhylo (setConfig config phyloWithCliques)
- else do
- printIOMsg "Reconstruct the phylo from scratch"
- phyloWithCliques <- pure $ toPhyloStep corpus mapList config
- writePhylo phyloWithCliquesFile phyloWithCliques
- pure $ toPhylo (setConfig config phyloWithCliques)
+ -- check the existing backup files
- writePhylo phyloWithLinksFile phyloWithLinks
+ let backupPhyloWithoutLink = (outputPath config) <> "backupPhyloWithoutLink_" <> (configToSha BackupPhyloWithoutLink config) <> ".json"
+ let backupPhylo = (outputPath config) <> "backupPhylo_" <> (configToSha BackupPhylo config) <> ".json"
+ phyloWithoutLinkExists <- doesFileExist backupPhyloWithoutLink
+ phyloExists <- doesFileExist backupPhylo
- -- probes
+ -- reconstruct the phylo
- -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
- -- $ synchronicDistance' phylo 1
+ phylo <- if phyloExists
+ then do
+ printIOMsg "Reconstruct the phylo from an existing file"
+ readPhylo backupPhylo
+ else do
+ if phyloWithoutLinkExists
+ then do
+ printIOMsg "Reconstruct the phylo from an existing file without links"
+ phyloWithoutLink <- readPhylo backupPhyloWithoutLink
+ writePhylo backupPhyloWithoutLink phyloWithoutLink
+ pure $ toPhylo (setConfig config phyloWithoutLink)
+ else do
+ printIOMsg "Reconstruct the phylo from scratch"
+ phyloWithoutLink <- pure $ toPhyloWithoutLink corpus mapList config
+ writePhylo backupPhyloWithoutLink phyloWithoutLink
+ pure $ toPhylo (setConfig config phyloWithoutLink)
- -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
- -- $ inflexionPoints phylo 1
+ writePhylo backupPhylo phylo
printIOMsg "End of reconstruction, start the export"
- let dot = toPhyloExport (setConfig config phyloWithLinks)
+ let dot = toPhyloExport (setConfig config phylo)
let output = configToLabel config