3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Adaptative Phylo binaries
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
20 import Data.List (concat, nub, isSuffixOf)
21 import Data.String (String)
22 import Data.Text (Text, unwords, unpack, replace, pack)
23 import Crypto.Hash.SHA256 (hash)
25 import Gargantext.Prelude
26 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
27 import Gargantext.Core.Text.Context (TermList)
28 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day,
29 csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
30 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
31 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
32 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
33 import Gargantext.Core.Viz.AdaptativePhylo
34 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloStep)
35 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
36 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
37 -- import Gargantext.API.Ngrams.Prelude (toTermList)
39 import GHC.IO (FilePath)
40 import Prelude (Either(Left, Right),toInteger)
41 import System.Environment
42 import System.Directory (listDirectory,doesFileExist)
43 import Control.Concurrent.Async (mapConcurrently)
45 import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
47 import qualified Data.ByteString.Char8 as C8
48 import qualified Data.ByteString.Lazy as Lazy
49 import qualified Data.Vector as Vector
50 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
51 import qualified Data.Text as T
53 -- import Debug.Trace (trace)
55 data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
62 -- | To get all the files in a directory or just a file
63 getFilesFromPath :: FilePath -> IO([FilePath])
64 getFilesFromPath path = do
65 if (isSuffixOf "/" path)
66 then (listDirectory path)
74 toMonths :: Integer -> Int -> Int -> Date
75 toMonths y m d = fromIntegral $ cdMonths
76 $ diffGregorianDurationClip (fromGregorian y m d) (fromGregorian 0000 0 0)
79 toDays :: Integer -> Int -> Int -> Date
80 toDays y m d = fromIntegral
81 $ diffDays (fromGregorian y m d) (fromGregorian 0000 0 0)
84 toPhyloDate :: Int -> Int -> Int -> TimeUnit -> Date
85 toPhyloDate y m d tu = case tu of
87 Month _ _ _ -> toMonths (toInteger y) m d
88 Week _ _ _ -> div (toDays (toInteger y) m d) 7
89 Day _ _ _ -> toDays (toInteger y) m d
92 toPhyloDate' :: Int -> Int -> Int -> Text
93 toPhyloDate' y m d = pack $ showGregorian $ fromGregorian (toInteger y) m d
101 -- | To read and decode a Json file
102 readJson :: FilePath -> IO Lazy.ByteString
103 readJson path = Lazy.readFile path
110 -- | To filter the Ngrams of a document based on the termList
111 termsInText :: Patterns -> Text -> [Text]
112 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
115 -- | To transform a Wos file (or [file]) into a list of Docs
116 wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO ([Document])
117 wosToDocs limit patterns time path = do
118 files <- getFilesFromPath path
120 <$> map (\d -> let title = fromJust $ _hd_title d
121 abstr = if (isJust $ _hd_abstract d)
122 then fromJust $ _hd_abstract d
124 in Document (toPhyloDate
125 (fromIntegral $ fromJust $ _hd_publication_year d)
126 (fromJust $ _hd_publication_month d)
127 (fromJust $ _hd_publication_day d) time)
129 (fromIntegral $ fromJust $ _hd_publication_year d)
130 (fromJust $ _hd_publication_month d)
131 (fromJust $ _hd_publication_day d))
132 (termsInText patterns $ title <> " " <> abstr) Nothing [])
134 <$> mapConcurrently (\file ->
135 filter (\d -> (isJust $ _hd_publication_year d)
136 && (isJust $ _hd_title d))
137 <$> parseFile WOS (path <> file) ) files
140 -- To transform a Csv file into a list of Document
141 csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO ([Document])
142 csvToDocs parser patterns time path =
145 Csv limit -> Vector.toList
146 <$> Vector.take limit
147 <$> Vector.map (\row -> Document (toPhyloDate (csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time)
148 (toPhyloDate' (csv_publication_year row) (csv_publication_month row) (csv_publication_day row))
149 (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
152 ) <$> snd <$> Csv.readFile path
153 Csv' limit -> Vector.toList
154 <$> Vector.take limit
155 <$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
156 (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row))
157 (termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
158 (Just $ csv'_weight row)
160 ) <$> snd <$> Csv.readWeightedCsv path
163 -- To parse a file into a list of Document
164 fileToDocs' :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
165 fileToDocs' parser path time lst = do
166 let patterns = buildPatterns lst
168 Wos limit -> wosToDocs limit patterns time path
169 Csv _ -> csvToDocs parser patterns time path
170 Csv' _ -> csvToDocs parser patterns time path
178 -- Config time parameters to label
179 timeToLabel :: Config -> [Char]
180 timeToLabel config = case (timeUnit config) of
181 Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
182 Month p s f -> ("time_months"<> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
183 Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
184 Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
187 seaToLabel :: Config -> [Char]
188 seaToLabel config = case (seaElevation config) of
189 Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
190 Adaptative granularity -> ("sea_adapt" <> (show granularity))
193 sensToLabel :: Config -> [Char]
194 sensToLabel config = case (phyloProximity config) of
196 WeightedLogJaccard s -> ("WeightedLogJaccard_" <> show s)
197 WeightedLogSim s -> ( "WeightedLogSim-sens_" <> show s)
200 cliqueToLabel :: Config -> [Char]
201 cliqueToLabel config = case (clique config) of
202 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
203 MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
206 syncToLabel :: Config -> [Char]
207 syncToLabel config = case (phyloSynchrony config) of
208 ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
209 ByProximityDistribution _ _ -> undefined
211 qualToConfig :: Config -> [Char]
212 qualToConfig config = case (phyloQuality config) of
213 Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
216 -- To set up the export file's label from the configuration
217 configToLabel :: Config -> [Char]
218 configToLabel config = outputPath config
219 <> (unpack $ phyloName config)
220 <> "-" <> (timeToLabel config)
221 <> "-scale_" <> (show (phyloLevel config))
222 <> "-" <> (seaToLabel config)
223 <> "-" <> (sensToLabel config)
224 <> "-" <> (cliqueToLabel config)
225 <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
226 <> "-" <> (syncToLabel config)
230 -- To write a sha256 from a set of config's parameters
231 configToSha :: PhyloStage -> Config -> [Char]
232 configToSha stage config = unpack
234 $ T.pack (show (hash $ C8.pack label))
237 label = case stage of
238 PhyloWithCliques -> (corpusPath config)
240 <> (timeToLabel config)
241 <> (cliqueToLabel config)
242 PhyloWithLinks -> (corpusPath config)
244 <> (timeToLabel config)
245 <> (cliqueToLabel config)
246 <> (sensToLabel config)
247 <> (seaToLabel config)
248 <> (syncToLabel config)
249 <> (qualToConfig config)
250 <> (show (phyloLevel config))
253 writePhylo :: [Char] -> Phylo -> IO ()
254 writePhylo path phylo = Lazy.writeFile path $ encode phylo
257 readPhylo :: [Char] -> IO Phylo
259 phyloJson <- (eitherDecode <$> readJson path) :: IO (Either String Phylo)
264 Right phylo -> pure phylo
275 printIOMsg "Starting the reconstruction"
277 printIOMsg "Read the configuration file"
279 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
282 Left err -> putStrLn err
285 printIOMsg "Parse the corpus"
286 mapList <- csvMapTermList (listPath config)
287 corpus <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
288 printIOComment (show (length corpus) <> " parsed docs from the corpus")
290 printIOMsg "Reconstruct the phylo"
292 let phyloWithCliquesFile = (outputPath config) <> "phyloWithCliques_" <> (configToSha PhyloWithCliques config) <> ".json"
293 let phyloWithLinksFile = (outputPath config) <> "phyloWithLinks_" <> (configToSha PhyloWithLinks config) <> ".json"
295 phyloWithCliquesExists <- doesFileExist phyloWithCliquesFile
296 phyloWithLinksExists <- doesFileExist phyloWithLinksFile
298 -- phyloStep <- if phyloWithCliquesExists
300 -- printIOMsg "Reconstruct the phylo step from an existing file"
301 -- readPhylo phyloWithCliquesFile
303 -- printIOMsg "Reconstruct the phylo step from scratch"
304 -- pure $ toPhyloStep corpus mapList config
306 -- writePhylo phyloWithCliquesFile phyloStep
308 -- let phylo = toPhylo (setConfig config phyloStep)
310 phyloWithLinks <- if phyloWithLinksExists
312 printIOMsg "Reconstruct the phylo from an existing file with intertemporal links"
313 readPhylo phyloWithLinksFile
315 if phyloWithCliquesExists
317 printIOMsg "Reconstruct the phylo from an existing file with cliques"
318 phyloWithCliques <- readPhylo phyloWithCliquesFile
319 writePhylo phyloWithCliquesFile phyloWithCliques
320 pure $ toPhylo (setConfig config phyloWithCliques)
322 printIOMsg "Reconstruct the phylo from scratch"
323 phyloWithCliques <- pure $ toPhyloStep corpus mapList config
324 writePhylo phyloWithCliquesFile phyloWithCliques
325 pure $ toPhylo (setConfig config phyloWithCliques)
327 writePhylo phyloWithLinksFile phyloWithLinks
332 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
333 -- $ synchronicDistance' phylo 1
335 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
336 -- $ inflexionPoints phylo 1
338 printIOMsg "End of reconstruction, start the export"
340 let dot = toPhyloExport (setConfig config phyloWithLinks)
342 let output = configToLabel config