]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-adaptative-phylo/Main.hs
[BIN] upgrade script
[gargantext.git] / bin / gargantext-adaptative-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adaptative Phylo binaries
11 -}
12
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
16
17 module Main where
18
19 import Control.Concurrent.Async (mapConcurrently)
20 import Crypto.Hash.SHA256 (hash)
21 import Data.Aeson
22 import Data.Either (Either(..))
23 import Data.List (concat, nub, isSuffixOf)
24 import Data.String (String)
25 import GHC.IO (FilePath)
26 import qualified Prelude as Prelude
27 import System.Environment
28 import System.Directory (listDirectory,doesFileExist)
29 import Data.Text (Text, unwords, unpack, replace, pack)
30 import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
31
32 import qualified Data.ByteString.Char8 as C8
33 import qualified Data.ByteString.Lazy as Lazy
34 import qualified Data.Vector as Vector
35 import qualified Data.Text as T
36
37 import Gargantext.Prelude
38 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
39 import Gargantext.Core.Text.Context (TermList)
40 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day,
41 csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
42 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
43 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
44 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
45 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
46 import Gargantext.Core.Viz.AdaptativePhylo
47 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloStep)
48 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
49 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
50 -- import Gargantext.API.Ngrams.Prelude (toTermList)
51
52 -- import Debug.Trace (trace)
53
54 data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
55
56 ---------------
57 -- | Tools | --
58 ---------------
59
60
61 -- | To get all the files in a directory or just a file
62 getFilesFromPath :: FilePath -> IO([FilePath])
63 getFilesFromPath path = do
64 if (isSuffixOf "/" path)
65 then (listDirectory path)
66 else return [path]
67
68 ---------------
69 -- | Dates | --
70 ---------------
71
72
73 toMonths :: Integer -> Int -> Int -> Date
74 toMonths y m d = fromIntegral $ cdMonths
75 $ diffGregorianDurationClip (fromGregorian y m d) (fromGregorian 0000 0 0)
76
77
78 toDays :: Integer -> Int -> Int -> Date
79 toDays y m d = fromIntegral
80 $ diffDays (fromGregorian y m d) (fromGregorian 0000 0 0)
81
82
83 toPhyloDate :: Int -> Int -> Int -> TimeUnit -> Date
84 toPhyloDate y m d tu = case tu of
85 Year _ _ _ -> y
86 Month _ _ _ -> toMonths (Prelude.toInteger y) m d
87 Week _ _ _ -> div (toDays (Prelude.toInteger y) m d) 7
88 Day _ _ _ -> toDays (Prelude.toInteger y) m d
89
90
91 toPhyloDate' :: Int -> Int -> Int -> Text
92 toPhyloDate' y m d = pack $ showGregorian $ fromGregorian (Prelude.toInteger y) m d
93
94
95 --------------
96 -- | Json | --
97 --------------
98
99
100 -- | To read and decode a Json file
101 readJson :: FilePath -> IO Lazy.ByteString
102 readJson path = Lazy.readFile path
103
104
105 ----------------
106 -- | Parser | --
107 ----------------
108
109 -- | To filter the Ngrams of a document based on the termList
110 termsInText :: Patterns -> Text -> [Text]
111 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
112
113
114 -- | To transform a Wos file (or [file]) into a list of Docs
115 wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
116 wosToDocs limit patterns time path = do
117 files <- getFilesFromPath path
118 let parseFile' file = do
119 eParsed <- parseFile WOS (path <> file)
120 case eParsed of
121 Right ps -> pure ps
122 Left e -> panic $ "Error: " <> (pack e)
123 take limit
124 <$> map (\d -> let title = fromJust $ _hd_title d
125 abstr = if (isJust $ _hd_abstract d)
126 then fromJust $ _hd_abstract d
127 else ""
128 in Document (toPhyloDate
129 (fromIntegral $ fromJust $ _hd_publication_year d)
130 (fromJust $ _hd_publication_month d)
131 (fromJust $ _hd_publication_day d) time)
132 (toPhyloDate'
133 (fromIntegral $ fromJust $ _hd_publication_year d)
134 (fromJust $ _hd_publication_month d)
135 (fromJust $ _hd_publication_day d))
136 (termsInText patterns $ title <> " " <> abstr) Nothing [])
137 <$> concat
138 <$> mapConcurrently (\file ->
139 filter (\d -> (isJust $ _hd_publication_year d)
140 && (isJust $ _hd_title d))
141 <$> parseFile' file) files
142
143
144 -- To transform a Csv file into a list of Document
145 csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
146 csvToDocs parser patterns time path =
147 case parser of
148 Wos _ -> undefined
149 Csv limit -> do
150 eR <- Csv.readFile path
151 case eR of
152 Right r ->
153 pure $ Vector.toList
154 $ Vector.take limit
155 $ Vector.map (\row -> Document (toPhyloDate (Csv.unIntOrDec $ csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time)
156 (toPhyloDate' (Csv.unIntOrDec $ csv_publication_year row) (csv_publication_month row) (csv_publication_day row))
157 (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
158 Nothing
159 []
160 ) $ snd r
161 Left e -> panic $ "Error: " <> (pack e)
162 Csv' limit -> Vector.toList
163 <$> Vector.take limit
164 <$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
165 (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row))
166 (termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
167 (Just $ csv'_weight row)
168 [csv'_source row]
169 ) <$> snd <$> Csv.readWeightedCsv path
170
171
172 -- To parse a file into a list of Document
173 fileToDocs' :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
174 fileToDocs' parser path time lst = do
175 let patterns = buildPatterns lst
176 case parser of
177 Wos limit -> wosToDocs limit patterns time path
178 Csv _ -> csvToDocs parser patterns time path
179 Csv' _ -> csvToDocs parser patterns time path
180
181
182 ---------------
183 -- | Label | --
184 ---------------
185
186
187 -- Config time parameters to label
188 timeToLabel :: Config -> [Char]
189 timeToLabel config = case (timeUnit config) of
190 Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
191 Month p s f -> ("time_months"<> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
192 Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
193 Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
194
195
196 seaToLabel :: Config -> [Char]
197 seaToLabel config = case (seaElevation config) of
198 Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
199 Adaptative granularity -> ("sea_adapt" <> (show granularity))
200
201
202 sensToLabel :: Config -> [Char]
203 sensToLabel config = case (phyloProximity config) of
204 Hamming -> undefined
205 WeightedLogJaccard s -> ("WeightedLogJaccard_" <> show s)
206 WeightedLogSim s -> ( "WeightedLogSim-sens_" <> show s)
207
208
209 cliqueToLabel :: Config -> [Char]
210 cliqueToLabel config = case (clique config) of
211 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
212 MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
213
214
215 syncToLabel :: Config -> [Char]
216 syncToLabel config = case (phyloSynchrony config) of
217 ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
218 ByProximityDistribution _ _ -> undefined
219
220 qualToConfig :: Config -> [Char]
221 qualToConfig config = case (phyloQuality config) of
222 Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
223
224
225 -- To set up the export file's label from the configuration
226 configToLabel :: Config -> [Char]
227 configToLabel config = outputPath config
228 <> (unpack $ phyloName config)
229 <> "-" <> (timeToLabel config)
230 <> "-scale_" <> (show (phyloLevel config))
231 <> "-" <> (seaToLabel config)
232 <> "-" <> (sensToLabel config)
233 <> "-" <> (cliqueToLabel config)
234 <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
235 <> "-" <> (syncToLabel config)
236 <> ".dot"
237
238
239 -- To write a sha256 from a set of config's parameters
240 configToSha :: PhyloStage -> Config -> [Char]
241 configToSha stage config = unpack
242 $ replace "/" "-"
243 $ T.pack (show (hash $ C8.pack label))
244 where
245 label :: [Char]
246 label = case stage of
247 PhyloWithCliques -> (corpusPath config)
248 <> (listPath config)
249 <> (timeToLabel config)
250 <> (cliqueToLabel config)
251 PhyloWithLinks -> (corpusPath config)
252 <> (listPath config)
253 <> (timeToLabel config)
254 <> (cliqueToLabel config)
255 <> (sensToLabel config)
256 <> (seaToLabel config)
257 <> (syncToLabel config)
258 <> (qualToConfig config)
259 <> (show (phyloLevel config))
260
261
262 writePhylo :: [Char] -> Phylo -> IO ()
263 writePhylo path phylo = Lazy.writeFile path $ encode phylo
264
265
266 readPhylo :: [Char] -> IO Phylo
267 readPhylo path = do
268 phyloJson <- (eitherDecode <$> readJson path) :: IO (Either String Phylo)
269 case phyloJson of
270 Left err -> do
271 putStrLn err
272 undefined
273 Right phylo -> pure phylo
274
275
276 --------------
277 -- | Main | --
278 --------------
279
280
281 main :: IO ()
282 main = do
283
284 printIOMsg "Starting the reconstruction"
285
286 printIOMsg "Read the configuration file"
287 [args] <- getArgs
288 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
289
290 case jsonArgs of
291 Left err -> putStrLn err
292 Right config -> do
293
294 printIOMsg "Parse the corpus"
295 mapList <- csvMapTermList (listPath config)
296 corpus <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
297 printIOComment (show (length corpus) <> " parsed docs from the corpus")
298
299 printIOMsg "Reconstruct the phylo"
300
301 let phyloWithCliquesFile = (outputPath config) <> "phyloWithCliques_" <> (configToSha PhyloWithCliques config) <> ".json"
302 let phyloWithLinksFile = (outputPath config) <> "phyloWithLinks_" <> (configToSha PhyloWithLinks config) <> ".json"
303
304 phyloWithCliquesExists <- doesFileExist phyloWithCliquesFile
305 phyloWithLinksExists <- doesFileExist phyloWithLinksFile
306
307 -- phyloStep <- if phyloWithCliquesExists
308 -- then do
309 -- printIOMsg "Reconstruct the phylo step from an existing file"
310 -- readPhylo phyloWithCliquesFile
311 -- else do
312 -- printIOMsg "Reconstruct the phylo step from scratch"
313 -- pure $ toPhyloStep corpus mapList config
314
315 -- writePhylo phyloWithCliquesFile phyloStep
316
317 -- let phylo = toPhylo (setConfig config phyloStep)
318
319 phyloWithLinks <- if phyloWithLinksExists
320 then do
321 printIOMsg "Reconstruct the phylo from an existing file with intertemporal links"
322 readPhylo phyloWithLinksFile
323 else do
324 if phyloWithCliquesExists
325 then do
326 printIOMsg "Reconstruct the phylo from an existing file with cliques"
327 phyloWithCliques <- readPhylo phyloWithCliquesFile
328 writePhylo phyloWithCliquesFile phyloWithCliques
329 pure $ toPhylo (setConfig config phyloWithCliques)
330 else do
331 printIOMsg "Reconstruct the phylo from scratch"
332 phyloWithCliques <- pure $ toPhyloStep corpus mapList config
333 writePhylo phyloWithCliquesFile phyloWithCliques
334 pure $ toPhylo (setConfig config phyloWithCliques)
335
336 writePhylo phyloWithLinksFile phyloWithLinks
337
338
339 -- probes
340
341 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
342 -- $ synchronicDistance' phylo 1
343
344 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
345 -- $ inflexionPoints phylo 1
346
347 printIOMsg "End of reconstruction, start the export"
348
349 let dot = toPhyloExport (setConfig config phyloWithLinks)
350
351 let output = configToLabel config
352
353 dotToFile output dot