3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Adaptative Phylo binaries
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
19 -- import Debug.Trace (trace)
20 import Control.Concurrent.Async (mapConcurrently)
21 import Crypto.Hash.SHA256 (hash)
23 import Data.Either (Either(..), fromRight)
24 import Data.List (concat, nub, isSuffixOf)
25 import Data.List.Split
26 import Data.Maybe (fromMaybe)
27 import Data.String (String)
28 import Data.Text (Text, unwords, unpack, replace, pack)
29 import GHC.IO (FilePath)
30 import Gargantext.API.Ngrams.Prelude (toTermList)
31 import Gargantext.API.Ngrams.Types
32 import Gargantext.Core.Text.Context (TermList)
33 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), parseFile)
34 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
35 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
36 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Core.Viz.Phylo
39 import Gargantext.Core.Viz.Phylo.API.Tools
40 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
41 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink)
42 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
43 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import System.Directory (listDirectory,doesFileExist)
47 import System.Environment
48 import qualified Data.ByteString.Char8 as C8
49 import qualified Data.Text as T
50 import qualified Data.Vector as Vector
51 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
53 data Backup = BackupPhyloWithoutLink | BackupPhylo deriving (Show)
59 -- | To get all the files in a directory or just a file
60 getFilesFromPath :: FilePath -> IO [FilePath]
61 getFilesFromPath path = do
62 if (isSuffixOf "/" path)
63 then (listDirectory path)
70 -- | To filter the Ngrams of a document based on the termList
71 termsInText :: Patterns -> Text -> [Text]
72 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
75 -- | To transform a Wos file (or [file]) into a list of Docs
76 wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
77 wosToDocs limit patterns time path = do
78 files <- getFilesFromPath path
80 <$> map (\d -> let title = fromJust $ _hd_title d
81 abstr = if (isJust $ _hd_abstract d)
82 then fromJust $ _hd_abstract d
84 in Document (toPhyloDate
85 (fromIntegral $ fromJust $ _hd_publication_year d)
86 (fromJust $ _hd_publication_month d)
87 (fromJust $ _hd_publication_day d) time)
89 (fromIntegral $ fromJust $ _hd_publication_year d)
90 (fromJust $ _hd_publication_month d)
91 (fromJust $ _hd_publication_day d) time)
92 (termsInText patterns $ title <> " " <> abstr) Nothing [])
94 <$> mapConcurrently (\file ->
95 filter (\d -> (isJust $ _hd_publication_year d)
96 && (isJust $ _hd_title d))
97 <$> fromRight [] <$> parseFile WOS Plain (path <> file) ) files
100 -- To transform a Csv file into a list of Document
101 csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
102 csvToDocs parser patterns time path =
105 Csv limit -> Vector.toList
106 <$> Vector.take limit
107 <$> Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
108 (toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
109 (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
112 ) <$> snd <$> either (\err -> panic $ cs $ "CSV error" <> (show err)) identity <$> Csv.readCSVFile path
113 Csv' limit -> Vector.toList
114 <$> Vector.take limit
115 <$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
116 (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
117 (termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
118 (Just $ csv'_weight row)
119 (map (T.strip . pack) $ splitOn ";" (unpack $ (csv'_source row)))
120 ) <$> snd <$> Csv.readWeightedCsv path
123 -- To parse a file into a list of Document
124 fileToDocs' :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
125 fileToDocs' parser path time lst = do
126 let patterns = buildPatterns lst
128 Wos limit -> wosToDocs limit patterns time path
129 Csv _ -> csvToDocs parser patterns time path
130 Csv' _ -> csvToDocs parser patterns time path
138 -- Config time parameters to label
139 timeToLabel :: PhyloConfig -> [Char]
140 timeToLabel config = case (timeUnit config) of
141 Epoch p s f -> ("time_epochs" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
142 Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
143 Month p s f -> ("time_months" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
144 Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
145 Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
148 seaToLabel :: PhyloConfig -> [Char]
149 seaToLabel config = case (seaElevation config) of
150 Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
151 Adaptative granularity -> ("sea_adapt" <> (show granularity))
154 sensToLabel :: PhyloConfig -> [Char]
155 sensToLabel config = case (similarity config) of
156 Hamming _ _ -> undefined
157 WeightedLogJaccard s _ -> ("WeightedLogJaccard_" <> show s)
158 WeightedLogSim s _ -> ( "WeightedLogSim-sens_" <> show s)
161 cliqueToLabel :: PhyloConfig -> [Char]
162 cliqueToLabel config = case (clique config) of
163 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
164 MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
167 syncToLabel :: PhyloConfig -> [Char]
168 syncToLabel config = case (phyloSynchrony config) of
169 ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
170 ByProximityDistribution _ _ -> undefined
172 qualToConfig :: PhyloConfig -> [Char]
173 qualToConfig config = case (phyloQuality config) of
174 Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
177 -- To set up the export file's label from the configuration
178 configToLabel :: PhyloConfig -> [Char]
179 configToLabel config = outputPath config
180 <> (unpack $ phyloName config)
181 <> "-" <> (timeToLabel config)
182 <> "-scale_" <> (show (phyloScale config))
183 <> "-" <> (seaToLabel config)
184 <> "-" <> (sensToLabel config)
185 <> "-" <> (cliqueToLabel config)
186 <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
187 <> "-" <> (syncToLabel config)
191 -- To write a sha256 from a set of config's parameters
192 configToSha :: Backup -> PhyloConfig -> [Char]
193 configToSha stage config = unpack
195 $ T.pack (show (hash $ C8.pack label))
198 label = case stage of
199 BackupPhyloWithoutLink -> (corpusPath config)
201 <> (timeToLabel config)
202 <> (cliqueToLabel config)
203 BackupPhylo -> (corpusPath config)
205 <> (timeToLabel config)
206 <> (cliqueToLabel config)
207 <> (sensToLabel config)
208 <> (seaToLabel config)
209 <> (syncToLabel config)
210 <> (qualToConfig config)
211 <> (show (phyloScale config))
214 readListV4 :: [Char] -> IO NgramsList
216 listJson <- (eitherDecode <$> readJson path) :: IO (Either String NgramsList)
221 Right listV4 -> pure listV4
224 fileToList :: ListParser -> FilePath -> IO TermList
225 fileToList parser path =
227 V3 -> csvMapTermList path
229 <$> toTermList MapTerm NgramsTerms
241 printIOMsg "Starting the reconstruction"
243 printIOMsg "Read the configuration file"
245 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String PhyloConfig)
248 Left err -> putStrLn err
251 printIOMsg "Parse the corpus"
252 mapList <- fileToList (listParser config) (listPath config)
253 corpus <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
254 printIOComment (show (length corpus) <> " parsed docs from the corpus")
256 printIOMsg "Reconstruct the phylo"
258 -- check the existing backup files
260 let backupPhyloWithoutLink = (outputPath config) <> "backupPhyloWithoutLink_" <> (configToSha BackupPhyloWithoutLink config) <> ".json"
261 let backupPhylo = (outputPath config) <> "backupPhylo_" <> (configToSha BackupPhylo config) <> ".json"
263 phyloWithoutLinkExists <- doesFileExist backupPhyloWithoutLink
264 phyloExists <- doesFileExist backupPhylo
266 -- reconstruct the phylo
268 phylo <- if phyloExists
270 printIOMsg "Reconstruct the phylo from an existing file"
271 readPhylo backupPhylo
273 if phyloWithoutLinkExists
275 printIOMsg "Reconstruct the phylo from an existing file without links"
276 phyloWithoutLink <- readPhylo backupPhyloWithoutLink
277 writePhylo backupPhyloWithoutLink phyloWithoutLink
278 pure $ toPhylo (setConfig config phyloWithoutLink)
280 printIOMsg "Reconstruct the phylo from scratch"
281 phyloWithoutLink <- pure $ toPhyloWithoutLink corpus config
282 writePhylo backupPhyloWithoutLink phyloWithoutLink
283 pure $ toPhylo (setConfig config phyloWithoutLink)
285 writePhylo backupPhylo phylo
287 printIOMsg "End of reconstruction, start the export"
289 let dot = toPhyloExport (setConfig config phylo)
291 let output = configToLabel config