3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Adaptative Phylo binaries
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
19 -- import Debug.Trace (trace)
20 import Control.Concurrent.Async (mapConcurrently)
21 import Crypto.Hash.SHA256 (hash)
23 import Data.Either (Either(..), fromRight)
24 import Data.List (concat, nub, isSuffixOf,sort,tail)
25 import Data.List.Split
26 import Data.Maybe (fromMaybe)
27 import Data.String (String)
28 import Data.Text (Text, unwords, unpack, replace, pack)
29 import GHC.IO (FilePath)
30 import Gargantext.API.Ngrams.Prelude (toTermList)
31 import Gargantext.API.Ngrams.Types
32 import Gargantext.Core.Text.Context (TermList)
33 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), parseFile)
34 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
35 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
36 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Core.Viz.Phylo
39 import Gargantext.Core.Viz.Phylo.API.Tools
40 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
41 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink)
42 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig, toPeriods, getTimePeriod, getTimeStep)
43 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import System.Directory (listDirectory,doesFileExist)
47 import System.Environment
48 import qualified Data.ByteString.Char8 as C8
49 import qualified Data.Text as T
50 import qualified Data.Vector as Vector
51 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
53 data Backup = BackupPhyloWithoutLink | BackupPhylo deriving (Show)
59 -- | To get all the files in a directory or just a file
60 getFilesFromPath :: FilePath -> IO [FilePath]
61 getFilesFromPath path = do
62 if (isSuffixOf "/" path)
63 then (listDirectory path)
70 -- | To filter the Ngrams of a document based on the termList
71 termsInText :: Patterns -> Text -> [Text]
72 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
75 -- | To transform a Wos file (or [file]) into a list of Docs
76 wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
77 wosToDocs limit patterns time path = do
78 files <- getFilesFromPath path
80 <$> map (\d -> let title = fromJust $ _hd_title d
81 abstr = if (isJust $ _hd_abstract d)
82 then fromJust $ _hd_abstract d
84 in Document (toPhyloDate
85 (fromIntegral $ fromJust $ _hd_publication_year d)
86 (fromJust $ _hd_publication_month d)
87 (fromJust $ _hd_publication_day d) time)
89 (fromIntegral $ fromJust $ _hd_publication_year d)
90 (fromJust $ _hd_publication_month d)
91 (fromJust $ _hd_publication_day d) time)
92 (termsInText patterns $ title <> " " <> abstr) Nothing [] time)
94 <$> mapConcurrently (\file ->
95 filter (\d -> (isJust $ _hd_publication_year d)
96 && (isJust $ _hd_title d))
97 <$> fromRight [] <$> parseFile WOS Plain (path <> file) ) files
100 -- To transform a Csv file into a list of Document
101 csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
102 csvToDocs parser patterns time path =
105 Csv limit -> Vector.toList
106 <$> Vector.take limit
107 <$> Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
108 (toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
109 (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
113 ) <$> snd <$> either (\err -> panic $ cs $ "CSV error" <> (show err)) identity <$> Csv.readCSVFile path
114 Csv' limit -> Vector.toList
115 <$> Vector.take limit
116 <$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
117 (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
118 (termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
119 (Just $ csv'_weight row)
120 (map (T.strip . pack) $ splitOn ";" (unpack $ (csv'_source row)))
122 ) <$> snd <$> Csv.readWeightedCsv path
125 -- To parse a file into a list of Document
126 fileToDocsAdvanced :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
127 fileToDocsAdvanced parser path time lst = do
128 let patterns = buildPatterns lst
130 Wos limit -> wosToDocs limit patterns time path
131 Csv _ -> csvToDocs parser patterns time path
132 Csv' _ -> csvToDocs parser patterns time path
134 fileToDocsDefault :: CorpusParser -> FilePath -> [TimeUnit] -> TermList -> IO [Document]
135 fileToDocsDefault parser path timeUnits lst =
136 if length timeUnits > 0
139 let timeUnit = (head' "fileToDocsDefault" timeUnits)
140 docs <- fileToDocsAdvanced parser path timeUnit lst
141 let periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod timeUnit) (getTimeStep timeUnit)
142 if (length periods < 3)
143 then fileToDocsDefault parser path (tail timeUnits) lst
145 else panic "this corpus is incompatible with the phylomemy reconstruction"
147 -- on passe à passer la time unit dans la conf envoyé au phyloMaker
148 -- dans le phyloMaker si default est true alors dans le setDefault ou pense à utiliser la TimeUnit de la conf
156 -- Config time parameters to label
157 timeToLabel :: PhyloConfig -> [Char]
158 timeToLabel config = case (timeUnit config) of
159 Epoch p s f -> ("time_epochs" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
160 Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
161 Month p s f -> ("time_months" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
162 Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
163 Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
166 seaToLabel :: PhyloConfig -> [Char]
167 seaToLabel config = case (seaElevation config) of
168 Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
169 Adaptative granularity -> ("sea_adapt" <> (show granularity))
170 Evolving _ -> ("sea_evolv")
173 sensToLabel :: PhyloConfig -> [Char]
174 sensToLabel config = case (similarity config) of
175 Hamming _ _ -> undefined
176 WeightedLogJaccard s _ -> ("WeightedLogJaccard_" <> show s)
177 WeightedLogSim s _ -> ( "WeightedLogSim-sens_" <> show s)
180 cliqueToLabel :: PhyloConfig -> [Char]
181 cliqueToLabel config = case (clique config) of
182 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
183 MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
186 syncToLabel :: PhyloConfig -> [Char]
187 syncToLabel config = case (phyloSynchrony config) of
188 ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
189 ByProximityDistribution _ _ -> undefined
191 qualToConfig :: PhyloConfig -> [Char]
192 qualToConfig config = case (phyloQuality config) of
193 Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
196 -- To set up the export file's label from the configuration
197 configToLabel :: PhyloConfig -> [Char]
198 configToLabel config = outputPath config
199 <> (unpack $ phyloName config)
200 <> "-" <> (timeToLabel config)
201 <> "-scale_" <> (show (phyloScale config))
202 <> "-" <> (seaToLabel config)
203 <> "-" <> (sensToLabel config)
204 <> "-" <> (cliqueToLabel config)
205 <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
206 <> "-" <> (syncToLabel config)
210 -- To write a sha256 from a set of config's parameters
211 configToSha :: Backup -> PhyloConfig -> [Char]
212 configToSha stage config = unpack
214 $ T.pack (show (hash $ C8.pack label))
217 label = case stage of
218 BackupPhyloWithoutLink -> (corpusPath config)
220 <> (timeToLabel config)
221 <> (cliqueToLabel config)
222 BackupPhylo -> (corpusPath config)
224 <> (timeToLabel config)
225 <> (cliqueToLabel config)
226 <> (sensToLabel config)
227 <> (seaToLabel config)
228 <> (syncToLabel config)
229 <> (qualToConfig config)
230 <> (show (phyloScale config))
233 readListV4 :: [Char] -> IO NgramsList
235 listJson <- (eitherDecode <$> readJson path) :: IO (Either String NgramsList)
240 Right listV4 -> pure listV4
243 fileToList :: ListParser -> FilePath -> IO TermList
244 fileToList parser path =
246 V3 -> csvMapTermList path
248 <$> toTermList MapTerm NgramsTerms
260 printIOMsg "Starting the reconstruction"
262 printIOMsg "Read the configuration file"
264 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String PhyloConfig)
267 Left err -> putStrLn err
270 printIOMsg "Parse the corpus"
271 mapList <- fileToList (listParser config) (listPath config)
273 corpus <- if (defaultMode config)
274 then fileToDocsDefault (corpusParser config) (corpusPath config) [Year 3 1 5,Month 3 1 5,Week 4 2 5] mapList
275 else fileToDocsAdvanced (corpusParser config) (corpusPath config) (timeUnit config) mapList
277 printIOComment (show (length corpus) <> " parsed docs from the corpus")
279 printIOComment (show (length $ nub $ concat $ map text corpus) <> " Size ngs_coterms")
281 printIOComment (show (length mapList) <> " Size ngs_terms List Map Ngrams")
283 printIOMsg "Reconstruct the phylo"
285 -- check the existing backup files
287 let backupPhyloWithoutLink = (outputPath config) <> "backupPhyloWithoutLink_" <> (configToSha BackupPhyloWithoutLink config) <> ".json"
288 let backupPhylo = (outputPath config) <> "backupPhylo_" <> (configToSha BackupPhylo config) <> ".json"
290 phyloWithoutLinkExists <- doesFileExist backupPhyloWithoutLink
291 phyloExists <- doesFileExist backupPhylo
293 -- reconstruct the phylo
295 phylo <- if phyloExists
297 printIOMsg "Reconstruct the phylo from an existing file"
298 readPhylo backupPhylo
300 if phyloWithoutLinkExists
302 printIOMsg "Reconstruct the phylo from an existing file without links"
303 phyloWithoutLink <- readPhylo backupPhyloWithoutLink
304 writePhylo backupPhyloWithoutLink phyloWithoutLink
305 pure $ toPhylo (setConfig config phyloWithoutLink)
307 printIOMsg "Reconstruct the phylo from scratch"
308 phyloWithoutLink <- pure $ toPhyloWithoutLink corpus config
309 writePhylo backupPhyloWithoutLink phyloWithoutLink
310 pure $ toPhylo (setConfig config phyloWithoutLink)
312 writePhylo backupPhylo phylo
314 printIOMsg "End of reconstruction, start the export"
316 let dot = toPhyloExport (setConfig config phylo)
318 let output = configToLabel config