3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Adaptative Phylo binaries
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
20 import Data.List (concat, nub, isSuffixOf)
21 import Data.String (String)
22 import Data.Text (Text, unwords, unpack, replace)
23 import Crypto.Hash.SHA256 (hash)
25 import Gargantext.Prelude
26 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
27 import Gargantext.Core.Text.Context (TermList)
28 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
29 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
30 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
31 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
32 import Gargantext.Core.Viz.AdaptativePhylo
33 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloStep)
34 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
35 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
37 import GHC.IO (FilePath)
38 import Prelude (Either(Left, Right))
39 import System.Environment
40 import System.Directory (listDirectory,doesFileExist)
41 import Control.Concurrent.Async (mapConcurrently)
43 import qualified Data.ByteString.Char8 as C8
44 import qualified Data.ByteString.Lazy as Lazy
45 import qualified Data.Vector as Vector
46 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
47 import qualified Data.Text as T
50 data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
59 -- | To get all the files in a directory or just a file
60 getFilesFromPath :: FilePath -> IO([FilePath])
61 getFilesFromPath path = do
62 if (isSuffixOf "/" path)
63 then (listDirectory path)
72 -- | To read and decode a Json file
73 readJson :: FilePath -> IO Lazy.ByteString
74 readJson path = Lazy.readFile path
81 -- | To filter the Ngrams of a document based on the termList
82 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
83 filterTerms patterns (y,d) = (y,termsInText patterns d)
85 --------------------------------------
86 termsInText :: Patterns -> Text -> [Text]
87 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
88 --------------------------------------
91 -- | To transform a Wos file (or [file]) into a readable corpus
92 wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
93 wosToCorpus limit path = do
94 files <- getFilesFromPath path
96 <$> map (\d -> let date' = fromJust $ _hd_publication_year d
97 title = fromJust $ _hd_title d
98 abstr = if (isJust $ _hd_abstract d)
99 then fromJust $ _hd_abstract d
101 in (date', title <> " " <> abstr))
103 <$> mapConcurrently (\file ->
104 filter (\d -> (isJust $ _hd_publication_year d)
105 && (isJust $ _hd_title d))
106 <$> parseFile WOS (path <> file) ) files
109 -- | To transform a Csv file into a readable corpus
110 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
111 csvToCorpus limit path = Vector.toList
112 <$> Vector.take limit
113 <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
114 <$> snd <$> Csv.readFile path
117 -- | To use the correct parser given a CorpusType
118 fileToCorpus :: CorpusParser -> FilePath -> IO ([(Int,Text)])
119 fileToCorpus parser path = case parser of
120 Wos limit -> wosToCorpus limit path
121 Csv limit -> csvToCorpus limit path
124 -- | To parse a file into a list of Document
125 fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
126 fileToDocs parser path lst = do
127 corpus <- fileToCorpus parser path
128 let patterns = buildPatterns lst
129 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
132 -- Config time parameters to label
133 timeToLabel :: Config -> [Char]
134 timeToLabel config = case (timeUnit config) of
135 Year p s f -> ("time"<> "_"<> (show p) <> "_" <> (show s) <> "_" <> (show f))
138 seaToLabel :: Config -> [Char]
139 seaToLabel config = case (seaElevation config) of
140 Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
141 Adaptative granularity -> ("sea_adapt" <> (show granularity))
144 sensToLabel :: Config -> [Char]
145 sensToLabel config = case (phyloProximity config) of
147 WeightedLogJaccard s -> ("WeightedLogJaccard_" <> show s)
148 WeightedLogSim s -> ( "WeightedLogSim-sens_" <> show s)
151 cliqueToLabel :: Config -> [Char]
152 cliqueToLabel config = case (clique config) of
153 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
154 MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
157 syncToLabel :: Config -> [Char]
158 syncToLabel config = case (phyloSynchrony config) of
159 ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
160 ByProximityDistribution _ _ -> undefined
162 qualToConfig :: Config -> [Char]
163 qualToConfig config = case (phyloQuality config) of
164 Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
167 -- To set up the export file's label from the configuration
168 configToLabel :: Config -> [Char]
169 configToLabel config = outputPath config
170 <> (unpack $ phyloName config)
171 <> "-" <> (timeToLabel config)
172 <> "-scale_" <> (show (phyloLevel config))
173 <> "-" <> (seaToLabel config)
174 <> "-" <> (sensToLabel config)
175 <> "-" <> (cliqueToLabel config)
176 <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
177 <> "-" <> (syncToLabel config)
181 -- To write a sha256 from a set of config's parameters
182 configToSha :: PhyloStage -> Config -> [Char]
183 configToSha stage config = unpack
185 $ T.pack (show (hash $ C8.pack label))
188 label = case stage of
189 PhyloWithCliques -> (corpusPath config)
191 <> (timeToLabel config)
192 <> (cliqueToLabel config)
193 PhyloWithLinks -> (corpusPath config)
195 <> (timeToLabel config)
196 <> (cliqueToLabel config)
197 <> (sensToLabel config)
198 <> (seaToLabel config)
199 <> (syncToLabel config)
200 <> (qualToConfig config)
201 <> (show (phyloLevel config))
204 writePhylo :: [Char] -> Phylo -> IO ()
205 writePhylo path phylo = Lazy.writeFile path $ encode phylo
208 readPhylo :: [Char] -> IO Phylo
210 phyloJson <- (eitherDecode <$> readJson path) :: IO (Either String Phylo)
215 Right phylo -> pure phylo
226 printIOMsg "Starting the reconstruction"
228 printIOMsg "Read the configuration file"
230 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
233 Left err -> putStrLn err
236 printIOMsg "Parse the corpus"
237 mapList <- csvMapTermList (listPath config)
238 corpus <- fileToDocs (corpusParser config) (corpusPath config) mapList
239 printIOComment (show (length corpus) <> " parsed docs from the corpus")
241 printIOMsg "Reconstruct the phylo"
243 let phyloWithCliquesFile = (outputPath config) <> "phyloWithCliques_" <> (configToSha PhyloWithCliques config) <> ".json"
244 let phyloWithLinksFile = (outputPath config) <> "phyloWithLinks_" <> (configToSha PhyloWithLinks config) <> ".json"
246 phyloWithCliquesExists <- doesFileExist phyloWithCliquesFile
247 phyloWithLinksExists <- doesFileExist phyloWithLinksFile
249 -- phyloStep <- if phyloWithCliquesExists
251 -- printIOMsg "Reconstruct the phylo step from an existing file"
252 -- readPhylo phyloWithCliquesFile
254 -- printIOMsg "Reconstruct the phylo step from scratch"
255 -- pure $ toPhyloStep corpus mapList config
257 -- writePhylo phyloWithCliquesFile phyloStep
259 -- let phylo = toPhylo (setConfig config phyloStep)
261 phyloWithLinks <- if phyloWithLinksExists
263 printIOMsg "Reconstruct the phylo from an existing file with intertemporal links"
264 readPhylo phyloWithLinksFile
266 if phyloWithCliquesExists
268 printIOMsg "Reconstruct the phylo from an existing file with cliques"
269 phyloWithCliques <- readPhylo phyloWithCliquesFile
270 writePhylo phyloWithCliquesFile phyloWithCliques
271 pure $ toPhylo (setConfig config phyloWithCliques)
273 printIOMsg "Reconstruct the phylo from scratch"
274 phyloWithCliques <- pure $ toPhyloStep corpus mapList config
275 writePhylo phyloWithCliquesFile phyloWithCliques
276 pure $ toPhylo (setConfig config phyloWithCliques)
278 writePhylo phyloWithLinksFile phyloWithLinks
283 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
284 -- $ synchronicDistance' phylo 1
286 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
287 -- $ inflexionPoints phylo 1
289 printIOMsg "End of reconstruction, start the export"
291 let dot = toPhyloExport (setConfig config phyloWithLinks)
293 let output = configToLabel config