{-|
Module : Main.hs
-Description : Gargantext starter binary with Phylo
+Description : Gargantext starter binary with Adaptative Phylo
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-Phylo binaries
-
+Adaptative Phylo binaries
-}
{-# LANGUAGE StandaloneDeriving #-}
module Main where
-import Control.Concurrent.Async as CCA (mapConcurrently)
-import Control.Monad (mapM)
+-- import Debug.Trace (trace)
+import Control.Concurrent.Async (mapConcurrently)
+import Crypto.Hash.SHA256 (hash)
import Data.Aeson
-import Data.List ((++),concat)
-import Data.Maybe
-import Data.Text (Text, unwords, unlines)
-import GHC.Generics
+import Data.Either (Either(..), fromRight)
+import Data.List (concat, nub, isSuffixOf)
+import Data.List.Split
+import Data.Maybe (fromMaybe)
+import Data.String (String)
+import Data.Text (Text, unwords, unpack, replace, pack)
import GHC.IO (FilePath)
-import Gargantext.Database.Admin.Types.Hyperdata
-import Gargantext.Database.Admin.Types.Node
-import Gargantext.Prelude
+import Gargantext.API.Ngrams.Prelude (toTermList)
+import Gargantext.API.Ngrams.Types
import Gargantext.Core.Text.Context (TermList)
-import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
-import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
+import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), parseFile)
+import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
-import Gargantext.Core.Text.Terms.WithList
+import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
+import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Core.Viz.Phylo
-import Gargantext.Core.Viz.Phylo.LevelMaker
-import Gargantext.Core.Viz.Phylo.Tools
-import Gargantext.Core.Viz.Phylo.View.Export
-import Gargantext.Core.Viz.Phylo.View.ViewMaker
-import System.Directory (doesFileExist)
+import Gargantext.Core.Viz.Phylo.API.Tools
+import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
+import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloStep)
+import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
+import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
+import Gargantext.Database.Schema.Ngrams (NgramsType(..))
+import Gargantext.Prelude
+import System.Directory (listDirectory,doesFileExist)
import System.Environment
-import qualified Data.ByteString.Lazy as L
-import qualified Data.List as DL
-import qualified Data.Map as DM
-import qualified Data.Text as DT
-import qualified Data.Vector as DV
-import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
-import qualified Prelude as P
+import qualified Data.ByteString.Char8 as C8
+import qualified Data.Text as T
+import qualified Data.Vector as Vector
+import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
+data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
---------------
--- | Conf | --
---------------
+---------------
+-- | Tools | --
+---------------
+-- | To get all the files in a directory or just a file
+getFilesFromPath :: FilePath -> IO [FilePath]
+getFilesFromPath path = do
+ if (isSuffixOf "/" path)
+ then (listDirectory path)
+ else return [path]
-type ListPath = FilePath
-type FisPath = FilePath
-type CorpusPath = FilePath
-data CorpusType = Wos | Csv deriving (Show,Generic)
-type Limit = Int
-
-data Conf =
- Conf { corpusPath :: CorpusPath
- , corpusType :: CorpusType
- , listPath :: ListPath
- , fisPath :: FilePath
- , outputPath :: FilePath
- , phyloName :: Text
- , limit :: Limit
- , timeGrain :: Int
- , timeStep :: Int
- , timeFrame :: Int
- , timeFrameTh :: Double
- , timeTh :: Double
- , timeSens :: Double
- , reBranchThr :: Double
- , reBranchNth :: Int
- , clusterTh :: Double
- , clusterSens :: Double
- , phyloLevel :: Int
- , viewLevel :: Int
- , fisSupport :: Int
- , fisClique :: Int
- , minSizeBranch :: Int
- } deriving (Show,Generic)
-
-instance FromJSON Conf
-instance ToJSON Conf
-
-instance FromJSON CorpusType
-instance ToJSON CorpusType
-
-
-decoder :: P.Either a b -> b
-decoder (P.Left _) = P.error "Error"
-decoder (P.Right x) = x
-
--- | Get the conf from a Json file
-getJson :: FilePath -> IO L.ByteString
-getJson path = L.readFile path
+----------------
+-- | Parser | --
+----------------
+
+-- | To filter the Ngrams of a document based on the termList
+termsInText :: Patterns -> Text -> [Text]
+termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
+
+
+-- | To transform a Wos file (or [file]) into a list of Docs
+wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
+wosToDocs limit patterns time path = do
+ files <- getFilesFromPath path
+ take limit
+ <$> map (\d -> let title = fromJust $ _hd_title d
+ abstr = if (isJust $ _hd_abstract d)
+ then fromJust $ _hd_abstract d
+ else ""
+ in Document (toPhyloDate
+ (fromIntegral $ fromJust $ _hd_publication_year d)
+ (fromJust $ _hd_publication_month d)
+ (fromJust $ _hd_publication_day d) time)
+ (toPhyloDate'
+ (fromIntegral $ fromJust $ _hd_publication_year d)
+ (fromJust $ _hd_publication_month d)
+ (fromJust $ _hd_publication_day d) time)
+ (termsInText patterns $ title <> " " <> abstr) Nothing [])
+ <$> concat
+ <$> mapConcurrently (\file ->
+ filter (\d -> (isJust $ _hd_publication_year d)
+ && (isJust $ _hd_title d))
+ <$> fromRight [] <$> parseFile WOS Plain (path <> file) ) files
+
+
+-- To transform a Csv file into a list of Document
+csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
+csvToDocs parser patterns time path =
+ case parser of
+ Wos _ -> undefined
+ Csv limit -> Vector.toList
+ <$> Vector.take limit
+ <$> Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
+ (toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
+ (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
+ Nothing
+ []
+ ) <$> snd <$> either (\err -> panic $ cs $ "CSV error" <> (show err)) identity <$> Csv.readCSVFile path
+ Csv' limit -> Vector.toList
+ <$> Vector.take limit
+ <$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
+ (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
+ (termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
+ (Just $ csv'_weight row)
+ (map (T.strip . pack) $ splitOn ";" (unpack $ (csv'_source row)))
+ ) <$> snd <$> Csv.readWeightedCsv path
+
+
+-- To parse a file into a list of Document
+fileToDocs' :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
+fileToDocs' parser path time lst = do
+ let patterns = buildPatterns lst
+ case parser of
+ Wos limit -> wosToDocs limit patterns time path
+ Csv _ -> csvToDocs parser patterns time path
+ Csv' _ -> csvToDocs parser patterns time path
---------------
--- | Parse | --
+-- | Label | --
---------------
--- | To filter the Ngrams of a document based on the termList
-filterTerms :: Patterns -> (a, Text) -> (a, [Text])
-filterTerms patterns (y,d) = (y,termsInText patterns d)
+-- Config time parameters to label
+timeToLabel :: PhyloConfig -> [Char]
+timeToLabel config = case (timeUnit config) of
+ Epoch p s f -> ("time_epochs" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
+ Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
+ Month p s f -> ("time_months" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
+ Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
+ Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
+
+
+seaToLabel :: PhyloConfig -> [Char]
+seaToLabel config = case (seaElevation config) of
+ Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
+ Adaptative granularity -> ("sea_adapt" <> (show granularity))
+
+
+sensToLabel :: PhyloConfig -> [Char]
+sensToLabel config = case (phyloProximity config) of
+ Hamming _ -> undefined
+ WeightedLogJaccard s -> ("WeightedLogJaccard_" <> show s)
+ WeightedLogSim s -> ( "WeightedLogSim-sens_" <> show s)
+
+
+cliqueToLabel :: PhyloConfig -> [Char]
+cliqueToLabel config = case (clique config) of
+ Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
+ MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
+
+
+syncToLabel :: PhyloConfig -> [Char]
+syncToLabel config = case (phyloSynchrony config) of
+ ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
+ ByProximityDistribution _ _ -> undefined
+
+qualToConfig :: PhyloConfig -> [Char]
+qualToConfig config = case (phyloQuality config) of
+ Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
+
+
+-- To set up the export file's label from the configuration
+configToLabel :: PhyloConfig -> [Char]
+configToLabel config = outputPath config
+ <> (unpack $ phyloName config)
+ <> "-" <> (timeToLabel config)
+ <> "-scale_" <> (show (phyloLevel config))
+ <> "-" <> (seaToLabel config)
+ <> "-" <> (sensToLabel config)
+ <> "-" <> (cliqueToLabel config)
+ <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
+ <> "-" <> (syncToLabel config)
+ <> ".dot"
+
+
+-- To write a sha256 from a set of config's parameters
+configToSha :: PhyloStage -> PhyloConfig -> [Char]
+configToSha stage config = unpack
+ $ replace "/" "-"
+ $ T.pack (show (hash $ C8.pack label))
where
- --------------------------------------
- termsInText :: Patterns -> Text -> [Text]
- termsInText pats txt = DL.nub
- $ DL.concat
- $ map (map unwords)
- $ extractTermsWithList pats txt
- --------------------------------------
-
-
--- | To transform a Csv nfile into a readable corpus
-csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
-csvToCorpus limit csv = DV.toList
- -- . DV.reverse
- . DV.take limit
- -- . DV.reverse
- . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
- . snd <$> CSV.readFile csv
-
-
--- | To transform a Wos nfile into a readable corpus
-wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
-wosToCorpus limit path = DL.take limit
- . map (\d -> ((fromJust $_hd_publication_year d)
- ,(fromJust $_hd_title d) <> " " <> (fromJust $_hd_abstract d)))
- . filter (\d -> (isJust $_hd_publication_year d)
- && (isJust $_hd_title d)
- && (isJust $_hd_abstract d))
- . concat
- <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
-
-
--- | To use the correct parser given a CorpusType
-fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
-fileToCorpus format limit path = case format of
- Wos -> wosToCorpus limit path
- Csv -> csvToCorpus limit path
-
-
--- | To parse a file into a list of Document
-parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
-parse format limit path l = do
- corpus <- fileToCorpus format limit path
- let patterns = buildPatterns l
- pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
-
-
--- | To parse an existing Fis file
-parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
-parseFis path name grain step support clique = do
- fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
- if fisExists
- then do
- fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
- case fisJson of
- P.Left err -> do
- putStrLn err
- pure []
- P.Right fis -> pure fis
- else pure []
-
-writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
-writeFis path name grain step support clique fis = do
- let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
- L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
+ label :: [Char]
+ label = case stage of
+ PhyloWithCliques -> (corpusPath config)
+ <> (listPath config)
+ <> (timeToLabel config)
+ <> (cliqueToLabel config)
+ PhyloWithLinks -> (corpusPath config)
+ <> (listPath config)
+ <> (timeToLabel config)
+ <> (cliqueToLabel config)
+ <> (sensToLabel config)
+ <> (seaToLabel config)
+ <> (syncToLabel config)
+ <> (qualToConfig config)
+ <> (show (phyloLevel config))
+
+
+readListV4 :: [Char] -> IO NgramsList
+readListV4 path = do
+ listJson <- (eitherDecode <$> readJson path) :: IO (Either String NgramsList)
+ case listJson of
+ Left err -> do
+ putStrLn err
+ undefined
+ Right listV4 -> pure listV4
+
+
+fileToList :: ListParser -> FilePath -> IO TermList
+fileToList parser path =
+ case parser of
+ V3 -> csvMapTermList path
+ V4 -> fromJust
+ <$> toTermList MapTerm NgramsTerms
+ <$> readListV4 path
+
--------------
-- | Main | --
main :: IO ()
-main = do
+main = do
+
+ printIOMsg "Starting the reconstruction"
+
+ printIOMsg "Read the configuration file"
+ [args] <- getArgs
+ jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String PhyloConfig)
+
+ case jsonArgs of
+ Left err -> putStrLn err
+ Right config -> do
+
+ printIOMsg "Parse the corpus"
+ mapList <- fileToList (listParser config) (listPath config)
+ corpus <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
+ printIOComment (show (length corpus) <> " parsed docs from the corpus")
- [jsonPath] <- getArgs
+ printIOMsg "Reconstruct the phylo"
- confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
+ let phyloWithCliquesFile = (outputPath config) <> "phyloWithCliques_" <> (configToSha PhyloWithCliques config) <> ".json"
+ let phyloWithLinksFile = (outputPath config) <> "phyloWithLinks_" <> (configToSha PhyloWithLinks config) <> ".json"
- case confJson of
- P.Left err -> putStrLn err
- P.Right conf -> do
+ phyloWithCliquesExists <- doesFileExist phyloWithCliquesFile
+ phyloWithLinksExists <- doesFileExist phyloWithLinksFile
- termList <- csvMapTermList (listPath conf)
+ -- phyloStep <- if phyloWithCliquesExists
+ -- then do
+ -- printIOMsg "Reconstruct the phylo step from an existing file"
+ -- readPhylo phyloWithCliquesFile
+ -- else do
+ -- printIOMsg "Reconstruct the phylo step from scratch"
+ -- pure $ toPhyloStep corpus mapList config
- corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
+ -- writePhylo phyloWithCliquesFile phyloStep
- putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
+ -- let phylo = toPhylo (setConfig config phyloStep)
- fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
+ phyloWithLinks <- if phyloWithLinksExists
+ then do
+ printIOMsg "Reconstruct the phylo from an existing file with intertemporal links"
+ readPhylo phyloWithLinksFile
+ else do
+ if phyloWithCliquesExists
+ then do
+ printIOMsg "Reconstruct the phylo from an existing file with cliques"
+ phyloWithCliques <- readPhylo phyloWithCliquesFile
+ writePhylo phyloWithCliquesFile phyloWithCliques
+ pure $ toPhylo (setConfig config phyloWithCliques)
+ else do
+ printIOMsg "Reconstruct the phylo from scratch"
+ phyloWithCliques <- pure $ toPhyloStep corpus mapList config
+ writePhylo phyloWithCliquesFile phyloWithCliques
+ pure $ toPhylo (setConfig config phyloWithCliques)
- putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
+ writePhylo phyloWithLinksFile phyloWithLinks
- let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
-
- let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
- (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
- (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
- (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
- let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
+ -- probes
- let phylo = toPhylo query corpus termList fis'
+ -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
+ -- $ synchronicDistance' phylo 1
- writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
+ -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
+ -- $ inflexionPoints phylo 1
- let view = toPhyloView queryView phylo
+ printIOMsg "End of reconstruction, start the export"
- putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
+ let dot = toPhyloExport (setConfig config phyloWithLinks)
- let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
- <> "_" <> show (limit conf) <> "_"
- <> "_" <> show (timeTh conf) <> "_"
- <> "_" <> show (timeSens conf) <> "_"
- <> "_" <> show (clusterTh conf) <> "_"
- <> "_" <> show (clusterSens conf)
- <> ".dot"
+ let output = configToLabel config
- P.writeFile outputFile $ dotToString $ viewToDot view
+ dotToFile output dot