]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
[MERGE] Phylo
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adaptative Phylo binaries
11 -}
12
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
16
17 module Main where
18
19 -- import Debug.Trace (trace)
20 import Control.Concurrent.Async (mapConcurrently)
21 import Crypto.Hash.SHA256 (hash)
22 import Data.Aeson
23 import Data.Either (Either(..), fromRight)
24 import Data.List (concat, nub, isSuffixOf,sort,tail)
25 import Data.List.Split
26 import Data.Maybe (fromMaybe)
27 import Data.String (String)
28 import Data.Text (Text, unwords, unpack, replace, pack)
29 import GHC.IO (FilePath)
30 import Gargantext.API.Ngrams.Prelude (toTermList)
31 import Gargantext.API.Ngrams.Types
32 import Gargantext.Core.Text.Context (TermList)
33 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), parseFile)
34 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
35 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
36 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Core.Viz.Phylo
39 import Gargantext.Core.Viz.Phylo.API.Tools
40 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
41 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink)
42 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig, toPeriods, getTimePeriod, getTimeStep)
43 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import System.Directory (listDirectory,doesFileExist)
47 import System.Environment
48 import qualified Data.ByteString.Char8 as C8
49 import qualified Data.Text as T
50 import qualified Data.Vector as Vector
51 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
52
53 data Backup = BackupPhyloWithoutLink | BackupPhylo deriving (Show)
54
55 ---------------
56 -- | Tools | --
57 ---------------
58
59 -- | To get all the files in a directory or just a file
60 getFilesFromPath :: FilePath -> IO [FilePath]
61 getFilesFromPath path = do
62 if (isSuffixOf "/" path)
63 then (listDirectory path)
64 else return [path]
65
66 ----------------
67 -- | Parser | --
68 ----------------
69
70 -- | To filter the Ngrams of a document based on the termList
71 termsInText :: Patterns -> Text -> [Text]
72 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
73
74
75 -- | To transform a Wos file (or [file]) into a list of Docs
76 wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
77 wosToDocs limit patterns time path = do
78 files <- getFilesFromPath path
79 take limit
80 <$> map (\d -> let title = fromJust $ _hd_title d
81 abstr = if (isJust $ _hd_abstract d)
82 then fromJust $ _hd_abstract d
83 else ""
84 in Document (toPhyloDate
85 (fromIntegral $ fromJust $ _hd_publication_year d)
86 (fromJust $ _hd_publication_month d)
87 (fromJust $ _hd_publication_day d) time)
88 (toPhyloDate'
89 (fromIntegral $ fromJust $ _hd_publication_year d)
90 (fromJust $ _hd_publication_month d)
91 (fromJust $ _hd_publication_day d) time)
92 (termsInText patterns $ title <> " " <> abstr) Nothing [] time)
93 <$> concat
94 <$> mapConcurrently (\file ->
95 filter (\d -> (isJust $ _hd_publication_year d)
96 && (isJust $ _hd_title d))
97 <$> fromRight [] <$> parseFile WOS Plain (path <> file) ) files
98
99
100 -- To transform a Csv file into a list of Document
101 csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
102 csvToDocs parser patterns time path =
103 case parser of
104 Wos _ -> undefined
105 Csv limit -> Vector.toList
106 <$> Vector.take limit
107 <$> Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
108 (toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
109 (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
110 Nothing
111 []
112 time
113 ) <$> snd <$> either (\err -> panic $ cs $ "CSV error" <> (show err)) identity <$> Csv.readCSVFile path
114 Csv' limit -> Vector.toList
115 <$> Vector.take limit
116 <$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
117 (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
118 (termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
119 (Just $ csv'_weight row)
120 (map (T.strip . pack) $ splitOn ";" (unpack $ (csv'_source row)))
121 time
122 ) <$> snd <$> Csv.readWeightedCsv path
123
124
125 -- To parse a file into a list of Document
126 fileToDocsAdvanced :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
127 fileToDocsAdvanced parser path time lst = do
128 let patterns = buildPatterns lst
129 case parser of
130 Wos limit -> wosToDocs limit patterns time path
131 Csv _ -> csvToDocs parser patterns time path
132 Csv' _ -> csvToDocs parser patterns time path
133
134 fileToDocsDefault :: CorpusParser -> FilePath -> [TimeUnit] -> TermList -> IO [Document]
135 fileToDocsDefault parser path timeUnits lst =
136 if length timeUnits > 0
137 then
138 do
139 let timeUnit = (head' "fileToDocsDefault" timeUnits)
140 docs <- fileToDocsAdvanced parser path timeUnit lst
141 let periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod timeUnit) (getTimeStep timeUnit)
142 if (length periods < 3)
143 then fileToDocsDefault parser path (tail timeUnits) lst
144 else pure docs
145 else panic "this corpus is incompatible with the phylomemy reconstruction"
146
147 -- on passe à passer la time unit dans la conf envoyé au phyloMaker
148 -- dans le phyloMaker si default est true alors dans le setDefault ou pense à utiliser la TimeUnit de la conf
149
150
151 ---------------
152 -- | Label | --
153 ---------------
154
155
156 -- Config time parameters to label
157 timeToLabel :: PhyloConfig -> [Char]
158 timeToLabel config = case (timeUnit config) of
159 Epoch p s f -> ("time_epochs" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
160 Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
161 Month p s f -> ("time_months" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
162 Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
163 Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
164
165
166 seaToLabel :: PhyloConfig -> [Char]
167 seaToLabel config = case (seaElevation config) of
168 Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
169 Adaptative granularity -> ("sea_adapt" <> (show granularity))
170 Evolving _ -> ("sea_evolv")
171
172
173 sensToLabel :: PhyloConfig -> [Char]
174 sensToLabel config = case (similarity config) of
175 Hamming _ _ -> undefined
176 WeightedLogJaccard s _ -> ("WeightedLogJaccard_" <> show s)
177 WeightedLogSim s _ -> ( "WeightedLogSim-sens_" <> show s)
178
179
180 cliqueToLabel :: PhyloConfig -> [Char]
181 cliqueToLabel config = case (clique config) of
182 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
183 MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
184
185
186 syncToLabel :: PhyloConfig -> [Char]
187 syncToLabel config = case (phyloSynchrony config) of
188 ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
189 ByProximityDistribution _ _ -> undefined
190
191 qualToConfig :: PhyloConfig -> [Char]
192 qualToConfig config = case (phyloQuality config) of
193 Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
194
195
196 -- To set up the export file's label from the configuration
197 configToLabel :: PhyloConfig -> [Char]
198 configToLabel config = outputPath config
199 <> (unpack $ phyloName config)
200 <> "-" <> (timeToLabel config)
201 <> "-scale_" <> (show (phyloScale config))
202 <> "-" <> (seaToLabel config)
203 <> "-" <> (sensToLabel config)
204 <> "-" <> (cliqueToLabel config)
205 <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
206 <> "-" <> (syncToLabel config)
207 <> ".dot"
208
209
210 -- To write a sha256 from a set of config's parameters
211 configToSha :: Backup -> PhyloConfig -> [Char]
212 configToSha stage config = unpack
213 $ replace "/" "-"
214 $ T.pack (show (hash $ C8.pack label))
215 where
216 label :: [Char]
217 label = case stage of
218 BackupPhyloWithoutLink -> (corpusPath config)
219 <> (listPath config)
220 <> (timeToLabel config)
221 <> (cliqueToLabel config)
222 BackupPhylo -> (corpusPath config)
223 <> (listPath config)
224 <> (timeToLabel config)
225 <> (cliqueToLabel config)
226 <> (sensToLabel config)
227 <> (seaToLabel config)
228 <> (syncToLabel config)
229 <> (qualToConfig config)
230 <> (show (phyloScale config))
231
232
233 readListV4 :: [Char] -> IO NgramsList
234 readListV4 path = do
235 listJson <- (eitherDecode <$> readJson path) :: IO (Either String NgramsList)
236 case listJson of
237 Left err -> do
238 putStrLn err
239 undefined
240 Right listV4 -> pure listV4
241
242
243 fileToList :: ListParser -> FilePath -> IO TermList
244 fileToList parser path =
245 case parser of
246 V3 -> csvMapTermList path
247 V4 -> fromJust
248 <$> toTermList MapTerm NgramsTerms
249 <$> readListV4 path
250
251
252 --------------
253 -- | Main | --
254 --------------
255
256
257 main :: IO ()
258 main = do
259
260 printIOMsg "Starting the reconstruction"
261
262 printIOMsg "Read the configuration file"
263 [args] <- getArgs
264 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String PhyloConfig)
265
266 case jsonArgs of
267 Left err -> putStrLn err
268 Right config -> do
269
270 printIOMsg "Parse the corpus"
271 mapList <- fileToList (listParser config) (listPath config)
272
273 corpus <- if (defaultMode config)
274 then fileToDocsDefault (corpusParser config) (corpusPath config) [Year 3 1 5,Month 3 1 5,Week 4 2 5] mapList
275 else fileToDocsAdvanced (corpusParser config) (corpusPath config) (timeUnit config) mapList
276
277 printIOComment (show (length corpus) <> " parsed docs from the corpus")
278
279 printIOComment (show (length $ nub $ concat $ map text corpus) <> " Size ngs_coterms")
280
281 printIOComment (show (length mapList) <> " Size ngs_terms List Map Ngrams")
282
283 printIOMsg "Reconstruct the phylo"
284
285 -- check the existing backup files
286
287 let backupPhyloWithoutLink = (outputPath config) <> "backupPhyloWithoutLink_" <> (configToSha BackupPhyloWithoutLink config) <> ".json"
288 let backupPhylo = (outputPath config) <> "backupPhylo_" <> (configToSha BackupPhylo config) <> ".json"
289
290 phyloWithoutLinkExists <- doesFileExist backupPhyloWithoutLink
291 phyloExists <- doesFileExist backupPhylo
292
293 -- reconstruct the phylo
294
295 phylo <- if phyloExists
296 then do
297 printIOMsg "Reconstruct the phylo from an existing file"
298 readPhylo backupPhylo
299 else do
300 if phyloWithoutLinkExists
301 then do
302 printIOMsg "Reconstruct the phylo from an existing file without links"
303 phyloWithoutLink <- readPhylo backupPhyloWithoutLink
304 writePhylo backupPhyloWithoutLink phyloWithoutLink
305 pure $ toPhylo (setConfig config phyloWithoutLink)
306 else do
307 printIOMsg "Reconstruct the phylo from scratch"
308 phyloWithoutLink <- pure $ toPhyloWithoutLink corpus config
309 writePhylo backupPhyloWithoutLink phyloWithoutLink
310 pure $ toPhylo (setConfig config phyloWithoutLink)
311
312 writePhylo backupPhylo phylo
313
314 printIOMsg "End of reconstruction, start the export"
315
316 let dot = toPhyloExport (setConfig config phylo)
317
318 let output = configToLabel config
319
320 dotToFile output dot