]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
[FEAT][PHYLO] preparing integration to backend
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adaptative Phylo binaries
11 -}
12
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
16
17 module Main where
18
19 import Control.Concurrent.Async (mapConcurrently)
20 import Crypto.Hash.SHA256 (hash)
21 import Data.Aeson
22 import Data.Either (Either(..))
23 import Data.List (concat, nub, isSuffixOf)
24 import Data.Maybe (fromMaybe)
25 import Data.String (String)
26 import GHC.IO (FilePath)
27 import qualified Prelude as Prelude
28 import System.Environment
29 import System.Directory (listDirectory,doesFileExist)
30 import Data.Text (Text, unwords, unpack, replace, pack)
31 import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
32
33 import qualified Data.ByteString.Char8 as C8
34 import qualified Data.ByteString.Lazy as Lazy
35 import qualified Data.Vector as Vector
36 import qualified Data.Text as T
37
38 import Gargantext.Prelude
39 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
40 import Gargantext.Core.Text.Context (TermList)
41 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day,
42 csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
43 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
44 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
45 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
46 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
47 import Gargantext.Core.Viz.Phylo
48 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloStep)
49 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
50 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
51 -- import Gargantext.API.Ngrams.Prelude (toTermList)
52
53 -- import Debug.Trace (trace)
54
55 data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
56
57 ---------------
58 -- | Tools | --
59 ---------------
60
61
62 -- | To get all the files in a directory or just a file
63 getFilesFromPath :: FilePath -> IO([FilePath])
64 getFilesFromPath path = do
65 if (isSuffixOf "/" path)
66 then (listDirectory path)
67 else return [path]
68
69 ---------------
70 -- | Dates | --
71 ---------------
72 toMonths :: Integer -> Int -> Int -> Date
73 toMonths y m d = fromIntegral $ cdMonths
74 $ diffGregorianDurationClip (fromGregorian y m d) (fromGregorian 0000 0 0)
75
76
77 toDays :: Integer -> Int -> Int -> Date
78 toDays y m d = fromIntegral
79 $ diffDays (fromGregorian y m d) (fromGregorian 0000 0 0)
80
81
82 toPhyloDate :: Int -> Int -> Int -> TimeUnit -> Date
83 toPhyloDate y m d tu = case tu of
84 Year _ _ _ -> y
85 Month _ _ _ -> toMonths (Prelude.toInteger y) m d
86 Week _ _ _ -> div (toDays (Prelude.toInteger y) m d) 7
87 Day _ _ _ -> toDays (Prelude.toInteger y) m d
88
89
90 -- Function to use in Database export
91 toPhyloDate' :: Int -> Int -> Int -> Text
92 toPhyloDate' y m d = pack $ showGregorian $ fromGregorian (Prelude.toInteger y) m d
93
94
95 --------------
96 -- | Json | --
97 --------------
98
99
100 -- | To read and decode a Json file
101 readJson :: FilePath -> IO Lazy.ByteString
102 readJson path = Lazy.readFile path
103
104
105 ----------------
106 -- | Parser | --
107 ----------------
108
109 -- | To filter the Ngrams of a document based on the termList
110 termsInText :: Patterns -> Text -> [Text]
111 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
112
113
114 -- | To transform a Wos file (or [file]) into a list of Docs
115 wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
116 wosToDocs limit patterns time path = do
117 files <- getFilesFromPath path
118 let parseFile' file = do
119 eParsed <- parseFile WOS (path <> file)
120 case eParsed of
121 Right ps -> pure ps
122 Left e -> panic $ "Error: " <> (pack e)
123 take limit
124 <$> map (\d -> let title = fromJust $ _hd_title d
125 abstr = if (isJust $ _hd_abstract d)
126 then fromJust $ _hd_abstract d
127 else ""
128 in Document (toPhyloDate
129 (fromIntegral $ fromJust $ _hd_publication_year d)
130 (fromJust $ _hd_publication_month d)
131 (fromJust $ _hd_publication_day d) time)
132 (toPhyloDate'
133 (fromIntegral $ fromJust $ _hd_publication_year d)
134 (fromJust $ _hd_publication_month d)
135 (fromJust $ _hd_publication_day d))
136 (termsInText patterns $ title <> " " <> abstr) Nothing [])
137 <$> concat
138 <$> mapConcurrently (\file ->
139 filter (\d -> (isJust $ _hd_publication_year d)
140 && (isJust $ _hd_title d))
141 <$> parseFile' file) files
142
143
144 -- To transform a Csv file into a list of Document
145 csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
146 csvToDocs parser patterns time path =
147 case parser of
148 Wos _ -> undefined
149 Csv limit -> do
150 eR <- Csv.readFile path
151 case eR of
152 Right r ->
153 pure $ Vector.toList
154 $ Vector.take limit
155 $ Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row)
156 (fromMaybe Csv.defaultMonth $ csv_publication_month row)
157 (fromMaybe Csv.defaultDay $ csv_publication_day row)
158 time)
159 (toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row)
160 (fromMaybe Csv.defaultMonth $ csv_publication_month row)
161 (fromMaybe Csv.defaultDay $ csv_publication_day row))
162 (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
163 Nothing
164 []
165 ) $ snd r
166 Left e -> panic $ "Error: " <> (pack e)
167 Csv' limit -> Vector.toList
168 <$> Vector.take limit
169 <$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
170 (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row))
171 (termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
172 (Just $ csv'_weight row)
173 [csv'_source row]
174 ) <$> snd <$> Csv.readWeightedCsv path
175
176
177 -- To parse a file into a list of Document
178 fileToDocs' :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
179 fileToDocs' parser path time lst = do
180 let patterns = buildPatterns lst
181 case parser of
182 Wos limit -> wosToDocs limit patterns time path
183 Csv _ -> csvToDocs parser patterns time path
184 Csv' _ -> csvToDocs parser patterns time path
185
186
187 ---------------
188 -- | Label | --
189 ---------------
190
191
192 -- Config time parameters to label
193 timeToLabel :: Config -> [Char]
194 timeToLabel config = case (timeUnit config) of
195 Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
196 Month p s f -> ("time_months"<> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
197 Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
198 Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
199
200
201 seaToLabel :: Config -> [Char]
202 seaToLabel config = case (seaElevation config) of
203 Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
204 Adaptative granularity -> ("sea_adapt" <> (show granularity))
205
206
207 sensToLabel :: Config -> [Char]
208 sensToLabel config = case (phyloProximity config) of
209 Hamming -> undefined
210 WeightedLogJaccard s -> ("WeightedLogJaccard_" <> show s)
211 WeightedLogSim s -> ( "WeightedLogSim-sens_" <> show s)
212
213
214 cliqueToLabel :: Config -> [Char]
215 cliqueToLabel config = case (clique config) of
216 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
217 MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
218
219
220 syncToLabel :: Config -> [Char]
221 syncToLabel config = case (phyloSynchrony config) of
222 ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
223 ByProximityDistribution _ _ -> undefined
224
225 qualToConfig :: Config -> [Char]
226 qualToConfig config = case (phyloQuality config) of
227 Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
228
229
230 -- To set up the export file's label from the configuration
231 configToLabel :: Config -> [Char]
232 configToLabel config = outputPath config
233 <> (unpack $ phyloName config)
234 <> "-" <> (timeToLabel config)
235 <> "-scale_" <> (show (phyloLevel config))
236 <> "-" <> (seaToLabel config)
237 <> "-" <> (sensToLabel config)
238 <> "-" <> (cliqueToLabel config)
239 <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
240 <> "-" <> (syncToLabel config)
241 <> ".dot"
242
243
244 -- To write a sha256 from a set of config's parameters
245 configToSha :: PhyloStage -> Config -> [Char]
246 configToSha stage config = unpack
247 $ replace "/" "-"
248 $ T.pack (show (hash $ C8.pack label))
249 where
250 label :: [Char]
251 label = case stage of
252 PhyloWithCliques -> (corpusPath config)
253 <> (listPath config)
254 <> (timeToLabel config)
255 <> (cliqueToLabel config)
256 PhyloWithLinks -> (corpusPath config)
257 <> (listPath config)
258 <> (timeToLabel config)
259 <> (cliqueToLabel config)
260 <> (sensToLabel config)
261 <> (seaToLabel config)
262 <> (syncToLabel config)
263 <> (qualToConfig config)
264 <> (show (phyloLevel config))
265
266
267 writePhylo :: [Char] -> Phylo -> IO ()
268 writePhylo path phylo = Lazy.writeFile path $ encode phylo
269
270
271 readPhylo :: [Char] -> IO Phylo
272 readPhylo path = do
273 phyloJson <- (eitherDecode <$> readJson path) :: IO (Either String Phylo)
274 case phyloJson of
275 Left err -> do
276 putStrLn err
277 undefined
278 Right phylo -> pure phylo
279
280
281 --------------
282 -- | Main | --
283 --------------
284
285
286 main :: IO ()
287 main = do
288
289 printIOMsg "Starting the reconstruction"
290
291 printIOMsg "Read the configuration file"
292 [args] <- getArgs
293 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
294
295 case jsonArgs of
296 Left err -> putStrLn err
297 Right config -> do
298
299 printIOMsg "Parse the corpus"
300 mapList <- csvMapTermList (listPath config)
301 corpus <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
302 printIOComment (show (length corpus) <> " parsed docs from the corpus")
303
304 printIOMsg "Reconstruct the phylo"
305
306 let phyloWithCliquesFile = (outputPath config) <> "phyloWithCliques_" <> (configToSha PhyloWithCliques config) <> ".json"
307 let phyloWithLinksFile = (outputPath config) <> "phyloWithLinks_" <> (configToSha PhyloWithLinks config) <> ".json"
308
309 phyloWithCliquesExists <- doesFileExist phyloWithCliquesFile
310 phyloWithLinksExists <- doesFileExist phyloWithLinksFile
311
312 -- phyloStep <- if phyloWithCliquesExists
313 -- then do
314 -- printIOMsg "Reconstruct the phylo step from an existing file"
315 -- readPhylo phyloWithCliquesFile
316 -- else do
317 -- printIOMsg "Reconstruct the phylo step from scratch"
318 -- pure $ toPhyloStep corpus mapList config
319
320 -- writePhylo phyloWithCliquesFile phyloStep
321
322 -- let phylo = toPhylo (setConfig config phyloStep)
323
324 -- QL: 2 files read from disk
325 phyloWithLinks <- if phyloWithLinksExists
326 then do
327 printIOMsg "Reconstruct the phylo from an existing file with intertemporal links"
328 readPhylo phyloWithLinksFile
329 else do
330 if phyloWithCliquesExists
331 then do
332 printIOMsg "Reconstruct the phylo from an existing file with cliques"
333 phyloWithCliques <- readPhylo phyloWithCliquesFile
334 writePhylo phyloWithCliquesFile phyloWithCliques
335 pure $ toPhylo (setConfig config phyloWithCliques)
336 else do
337 printIOMsg "Reconstruct the phylo from scratch"
338 phyloWithCliques <- pure $ toPhyloStep corpus mapList config
339 writePhylo phyloWithCliquesFile phyloWithCliques
340 pure $ toPhylo (setConfig config phyloWithCliques)
341
342 writePhylo phyloWithLinksFile phyloWithLinks
343
344
345 -- probes
346
347 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
348 -- $ synchronicDistance' phylo 1
349
350 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
351 -- $ inflexionPoints phylo 1
352
353 printIOMsg "End of reconstruction, start the export"
354
355 let dot = toPhyloExport (setConfig config phyloWithLinks)
356
357 let output = configToLabel config
358
359 dotToFile output dot