]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-adaptative-phylo/Main.hs
[FIX] removing printDebug
[gargantext.git] / bin / gargantext-adaptative-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adaptative Phylo binaries
11 -}
12
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
16
17 module Main where
18
19 import Data.Aeson
20 import Data.List (concat, nub, isSuffixOf)
21 import Data.String (String)
22 import Data.Text (Text, unwords, unpack, replace)
23 import Crypto.Hash.SHA256 (hash)
24
25 import Gargantext.Prelude
26 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
27 import Gargantext.Core.Text.Context (TermList)
28 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
29 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
30 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
31 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
32 import Gargantext.Core.Viz.AdaptativePhylo
33 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloStep)
34 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
35 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
36
37 import GHC.IO (FilePath)
38 import Prelude (Either(Left, Right))
39 import System.Environment
40 import System.Directory (listDirectory,doesFileExist)
41 import Control.Concurrent.Async (mapConcurrently)
42
43 import qualified Data.ByteString.Char8 as C8
44 import qualified Data.ByteString.Lazy as Lazy
45 import qualified Data.Vector as Vector
46 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
47 import qualified Data.Text as T
48
49
50 data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
51
52
53
54 ---------------
55 -- | Tools | --
56 ---------------
57
58
59 -- | To get all the files in a directory or just a file
60 getFilesFromPath :: FilePath -> IO([FilePath])
61 getFilesFromPath path = do
62 if (isSuffixOf "/" path)
63 then (listDirectory path)
64 else return [path]
65
66
67 --------------
68 -- | Json | --
69 --------------
70
71
72 -- | To read and decode a Json file
73 readJson :: FilePath -> IO Lazy.ByteString
74 readJson path = Lazy.readFile path
75
76
77 ----------------
78 -- | Parser | --
79 ----------------
80
81 -- | To filter the Ngrams of a document based on the termList
82 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
83 filterTerms patterns (y,d) = (y,termsInText patterns d)
84 where
85 --------------------------------------
86 termsInText :: Patterns -> Text -> [Text]
87 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
88 --------------------------------------
89
90
91 -- | To transform a Wos file (or [file]) into a readable corpus
92 wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
93 wosToCorpus limit path = do
94 files <- getFilesFromPath path
95 take limit
96 <$> map (\d -> let date' = fromJust $ _hd_publication_year d
97 title = fromJust $ _hd_title d
98 abstr = if (isJust $ _hd_abstract d)
99 then fromJust $ _hd_abstract d
100 else ""
101 in (date', title <> " " <> abstr))
102 <$> concat
103 <$> mapConcurrently (\file ->
104 filter (\d -> (isJust $ _hd_publication_year d)
105 && (isJust $ _hd_title d))
106 <$> parseFile WOS (path <> file) ) files
107
108
109 -- | To transform a Csv file into a readable corpus
110 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
111 csvToCorpus limit path = Vector.toList
112 <$> Vector.take limit
113 <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
114 <$> snd <$> Csv.readFile path
115
116
117 -- | To use the correct parser given a CorpusType
118 fileToCorpus :: CorpusParser -> FilePath -> IO ([(Int,Text)])
119 fileToCorpus parser path = case parser of
120 Wos limit -> wosToCorpus limit path
121 Csv limit -> csvToCorpus limit path
122
123
124 -- | To parse a file into a list of Document
125 fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
126 fileToDocs parser path lst = do
127 corpus <- fileToCorpus parser path
128 let patterns = buildPatterns lst
129 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
130
131
132 -- Config time parameters to label
133 timeToLabel :: Config -> [Char]
134 timeToLabel config = case (timeUnit config) of
135 Year p s f -> ("time"<> "_"<> (show p) <> "_" <> (show s) <> "_" <> (show f))
136
137
138 seaToLabel :: Config -> [Char]
139 seaToLabel config = case (seaElevation config) of
140 Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
141 Adaptative granularity -> ("sea_adapt" <> (show granularity))
142
143
144 sensToLabel :: Config -> [Char]
145 sensToLabel config = case (phyloProximity config) of
146 Hamming -> undefined
147 WeightedLogJaccard s -> ("WeightedLogJaccard_" <> show s)
148 WeightedLogSim s -> ( "WeightedLogSim-sens_" <> show s)
149
150
151 cliqueToLabel :: Config -> [Char]
152 cliqueToLabel config = case (clique config) of
153 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
154 MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
155
156
157 syncToLabel :: Config -> [Char]
158 syncToLabel config = case (phyloSynchrony config) of
159 ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
160 ByProximityDistribution _ _ -> undefined
161
162 qualToConfig :: Config -> [Char]
163 qualToConfig config = case (phyloQuality config) of
164 Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
165
166
167 -- To set up the export file's label from the configuration
168 configToLabel :: Config -> [Char]
169 configToLabel config = outputPath config
170 <> (unpack $ phyloName config)
171 <> "-" <> (timeToLabel config)
172 <> "-scale_" <> (show (phyloLevel config))
173 <> "-" <> (seaToLabel config)
174 <> "-" <> (sensToLabel config)
175 <> "-" <> (cliqueToLabel config)
176 <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
177 <> "-" <> (syncToLabel config)
178 <> ".dot"
179
180
181 -- To write a sha256 from a set of config's parameters
182 configToSha :: PhyloStage -> Config -> [Char]
183 configToSha stage config = unpack
184 $ replace "/" "-"
185 $ T.pack (show (hash $ C8.pack label))
186 where
187 label :: [Char]
188 label = case stage of
189 PhyloWithCliques -> (corpusPath config)
190 <> (listPath config)
191 <> (timeToLabel config)
192 <> (cliqueToLabel config)
193 PhyloWithLinks -> (corpusPath config)
194 <> (listPath config)
195 <> (timeToLabel config)
196 <> (cliqueToLabel config)
197 <> (sensToLabel config)
198 <> (seaToLabel config)
199 <> (syncToLabel config)
200 <> (qualToConfig config)
201 <> (show (phyloLevel config))
202
203
204 writePhylo :: [Char] -> Phylo -> IO ()
205 writePhylo path phylo = Lazy.writeFile path $ encode phylo
206
207
208 readPhylo :: [Char] -> IO Phylo
209 readPhylo path = do
210 phyloJson <- (eitherDecode <$> readJson path) :: IO (Either String Phylo)
211 case phyloJson of
212 Left err -> do
213 putStrLn err
214 undefined
215 Right phylo -> pure phylo
216
217
218 --------------
219 -- | Main | --
220 --------------
221
222
223 main :: IO ()
224 main = do
225
226 printIOMsg "Starting the reconstruction"
227
228 printIOMsg "Read the configuration file"
229 [args] <- getArgs
230 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
231
232 case jsonArgs of
233 Left err -> putStrLn err
234 Right config -> do
235
236 printIOMsg "Parse the corpus"
237 mapList <- csvMapTermList (listPath config)
238 corpus <- fileToDocs (corpusParser config) (corpusPath config) mapList
239 printIOComment (show (length corpus) <> " parsed docs from the corpus")
240
241 printIOMsg "Reconstruct the phylo"
242
243 let phyloWithCliquesFile = (outputPath config) <> "phyloWithCliques_" <> (configToSha PhyloWithCliques config) <> ".json"
244 let phyloWithLinksFile = (outputPath config) <> "phyloWithLinks_" <> (configToSha PhyloWithLinks config) <> ".json"
245
246 phyloWithCliquesExists <- doesFileExist phyloWithCliquesFile
247 phyloWithLinksExists <- doesFileExist phyloWithLinksFile
248
249 -- phyloStep <- if phyloWithCliquesExists
250 -- then do
251 -- printIOMsg "Reconstruct the phylo step from an existing file"
252 -- readPhylo phyloWithCliquesFile
253 -- else do
254 -- printIOMsg "Reconstruct the phylo step from scratch"
255 -- pure $ toPhyloStep corpus mapList config
256
257 -- writePhylo phyloWithCliquesFile phyloStep
258
259 -- let phylo = toPhylo (setConfig config phyloStep)
260
261 phyloWithLinks <- if phyloWithLinksExists
262 then do
263 printIOMsg "Reconstruct the phylo from an existing file with intertemporal links"
264 readPhylo phyloWithLinksFile
265 else do
266 if phyloWithCliquesExists
267 then do
268 printIOMsg "Reconstruct the phylo from an existing file with cliques"
269 phyloWithCliques <- readPhylo phyloWithCliquesFile
270 writePhylo phyloWithCliquesFile phyloWithCliques
271 pure $ toPhylo (setConfig config phyloWithCliques)
272 else do
273 printIOMsg "Reconstruct the phylo from scratch"
274 phyloWithCliques <- pure $ toPhyloStep corpus mapList config
275 writePhylo phyloWithCliquesFile phyloWithCliques
276 pure $ toPhylo (setConfig config phyloWithCliques)
277
278 writePhylo phyloWithLinksFile phyloWithLinks
279
280
281 -- | probes
282
283 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
284 -- $ synchronicDistance' phylo 1
285
286 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
287 -- $ inflexionPoints phylo 1
288
289 printIOMsg "End of reconstruction, start the export"
290
291 let dot = toPhyloExport (setConfig config phyloWithLinks)
292
293 let output = configToLabel config
294
295 dotToFile output dot