]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
add a list parser param
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adaptative Phylo binaries
11 -}
12
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
16
17 module Main where
18
19 -- import Debug.Trace (trace)
20 import Control.Concurrent.Async (mapConcurrently)
21 import Crypto.Hash.SHA256 (hash)
22 import Data.Aeson
23 import Data.Either (Either(..), fromRight)
24 import Data.List (concat, nub, isSuffixOf)
25 import Data.List.Split
26 import Data.Maybe (fromMaybe)
27 import Data.String (String)
28 import Data.Text (Text, unwords, unpack, replace, pack)
29 import GHC.IO (FilePath)
30 import Gargantext.API.Ngrams.Prelude (toTermList)
31 import Gargantext.API.Ngrams.Types
32 import Gargantext.Core.Text.Context (TermList)
33 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
34 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
35 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
36 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Core.Viz.Phylo
39 import Gargantext.Core.Viz.Phylo.API
40 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
41 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloStep)
42 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
43 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import System.Directory (listDirectory,doesFileExist)
47 import System.Environment
48 import qualified Data.ByteString.Char8 as C8
49 import qualified Data.ByteString.Lazy as Lazy
50 import qualified Data.Text as T
51 import qualified Data.Vector as Vector
52 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
53
54 data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
55
56 ---------------
57 -- | Tools | --
58 ---------------
59
60 -- | To get all the files in a directory or just a file
61 getFilesFromPath :: FilePath -> IO [FilePath]
62 getFilesFromPath path = do
63 if (isSuffixOf "/" path)
64 then (listDirectory path)
65 else return [path]
66
67 ----------------
68 -- | Parser | --
69 ----------------
70
71 -- | To filter the Ngrams of a document based on the termList
72 termsInText :: Patterns -> Text -> [Text]
73 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
74
75
76 -- | To transform a Wos file (or [file]) into a list of Docs
77 wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
78 wosToDocs limit patterns time path = do
79 files <- getFilesFromPath path
80 take limit
81 <$> map (\d -> let title = fromJust $ _hd_title d
82 abstr = if (isJust $ _hd_abstract d)
83 then fromJust $ _hd_abstract d
84 else ""
85 in Document (toPhyloDate
86 (fromIntegral $ fromJust $ _hd_publication_year d)
87 (fromJust $ _hd_publication_month d)
88 (fromJust $ _hd_publication_day d) time)
89 (toPhyloDate'
90 (fromIntegral $ fromJust $ _hd_publication_year d)
91 (fromJust $ _hd_publication_month d)
92 (fromJust $ _hd_publication_day d) time)
93 (termsInText patterns $ title <> " " <> abstr) Nothing [])
94 <$> concat
95 <$> mapConcurrently (\file ->
96 filter (\d -> (isJust $ _hd_publication_year d)
97 && (isJust $ _hd_title d))
98 <$> fromRight [] <$> parseFile WOS (path <> file) ) files
99
100
101 -- To transform a Csv file into a list of Document
102 csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
103 csvToDocs parser patterns time path =
104 case parser of
105 Wos _ -> undefined
106 Csv limit -> Vector.toList
107 <$> Vector.take limit
108 <$> Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
109 (toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
110 (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
111 Nothing
112 []
113 ) <$> snd <$> either (\err -> panic $ cs $ "CSV error" <> (show err)) identity <$> Csv.readFile path
114 Csv' limit -> Vector.toList
115 <$> Vector.take limit
116 <$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
117 (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
118 (termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
119 (Just $ csv'_weight row)
120 (map (T.strip . pack) $ splitOn ";" (unpack $ (csv'_source row)))
121 ) <$> snd <$> Csv.readWeightedCsv path
122
123
124 -- To parse a file into a list of Document
125 fileToDocs' :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
126 fileToDocs' parser path time lst = do
127 let patterns = buildPatterns lst
128 case parser of
129 Wos limit -> wosToDocs limit patterns time path
130 Csv _ -> csvToDocs parser patterns time path
131 Csv' _ -> csvToDocs parser patterns time path
132
133
134 ---------------
135 -- | Label | --
136 ---------------
137
138
139 -- Config time parameters to label
140 timeToLabel :: Config -> [Char]
141 timeToLabel config = case (timeUnit config) of
142 Epoch p s f -> ("time_epochs" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
143 Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
144 Month p s f -> ("time_months" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
145 Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
146 Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
147
148
149 seaToLabel :: Config -> [Char]
150 seaToLabel config = case (seaElevation config) of
151 Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
152 Adaptative granularity -> ("sea_adapt" <> (show granularity))
153
154
155 sensToLabel :: Config -> [Char]
156 sensToLabel config = case (phyloProximity config) of
157 Hamming _ -> undefined
158 WeightedLogJaccard s -> ("WeightedLogJaccard_" <> show s)
159 WeightedLogSim s -> ( "WeightedLogSim-sens_" <> show s)
160
161
162 cliqueToLabel :: Config -> [Char]
163 cliqueToLabel config = case (clique config) of
164 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
165 MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
166
167
168 syncToLabel :: Config -> [Char]
169 syncToLabel config = case (phyloSynchrony config) of
170 ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
171 ByProximityDistribution _ _ -> undefined
172
173 qualToConfig :: Config -> [Char]
174 qualToConfig config = case (phyloQuality config) of
175 Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
176
177
178 -- To set up the export file's label from the configuration
179 configToLabel :: Config -> [Char]
180 configToLabel config = outputPath config
181 <> (unpack $ phyloName config)
182 <> "-" <> (timeToLabel config)
183 <> "-scale_" <> (show (phyloLevel config))
184 <> "-" <> (seaToLabel config)
185 <> "-" <> (sensToLabel config)
186 <> "-" <> (cliqueToLabel config)
187 <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
188 <> "-" <> (syncToLabel config)
189 <> ".dot"
190
191
192 -- To write a sha256 from a set of config's parameters
193 configToSha :: PhyloStage -> Config -> [Char]
194 configToSha stage config = unpack
195 $ replace "/" "-"
196 $ T.pack (show (hash $ C8.pack label))
197 where
198 label :: [Char]
199 label = case stage of
200 PhyloWithCliques -> (corpusPath config)
201 <> (listPath config)
202 <> (timeToLabel config)
203 <> (cliqueToLabel config)
204 PhyloWithLinks -> (corpusPath config)
205 <> (listPath config)
206 <> (timeToLabel config)
207 <> (cliqueToLabel config)
208 <> (sensToLabel config)
209 <> (seaToLabel config)
210 <> (syncToLabel config)
211 <> (qualToConfig config)
212 <> (show (phyloLevel config))
213
214
215 readListV4 :: [Char] -> IO NgramsList
216 readListV4 path = do
217 listJson <- (eitherDecode <$> readJson path) :: IO (Either String NgramsList)
218 case listJson of
219 Left err -> do
220 putStrLn err
221 undefined
222 Right listV4 -> pure listV4
223
224
225 fileToList :: ListParser -> FilePath -> IO TermList
226 fileToList parser path =
227 case parser of
228 V3 -> csvMapTermList path
229 V4 -> fromJust
230 <$> toTermList MapTerm NgramsTerms
231 <$> readListV4 path
232
233
234 --------------
235 -- | Main | --
236 --------------
237
238
239 main :: IO ()
240 main = do
241
242 printIOMsg "Starting the reconstruction"
243
244 printIOMsg "Read the configuration file"
245 [args] <- getArgs
246 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
247
248 case jsonArgs of
249 Left err -> putStrLn err
250 Right config -> do
251
252 printIOMsg "Parse the corpus"
253 mapList <- fileToList (listParser config) (listPath config)
254 corpus <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
255 printIOComment (show (length corpus) <> " parsed docs from the corpus")
256
257 printIOMsg "Reconstruct the phylo"
258
259 let phyloWithCliquesFile = (outputPath config) <> "phyloWithCliques_" <> (configToSha PhyloWithCliques config) <> ".json"
260 let phyloWithLinksFile = (outputPath config) <> "phyloWithLinks_" <> (configToSha PhyloWithLinks config) <> ".json"
261
262 phyloWithCliquesExists <- doesFileExist phyloWithCliquesFile
263 phyloWithLinksExists <- doesFileExist phyloWithLinksFile
264
265 -- phyloStep <- if phyloWithCliquesExists
266 -- then do
267 -- printIOMsg "Reconstruct the phylo step from an existing file"
268 -- readPhylo phyloWithCliquesFile
269 -- else do
270 -- printIOMsg "Reconstruct the phylo step from scratch"
271 -- pure $ toPhyloStep corpus mapList config
272
273 -- writePhylo phyloWithCliquesFile phyloStep
274
275 -- let phylo = toPhylo (setConfig config phyloStep)
276
277 phyloWithLinks <- if phyloWithLinksExists
278 then do
279 printIOMsg "Reconstruct the phylo from an existing file with intertemporal links"
280 readPhylo phyloWithLinksFile
281 else do
282 if phyloWithCliquesExists
283 then do
284 printIOMsg "Reconstruct the phylo from an existing file with cliques"
285 phyloWithCliques <- readPhylo phyloWithCliquesFile
286 writePhylo phyloWithCliquesFile phyloWithCliques
287 pure $ toPhylo (setConfig config phyloWithCliques)
288 else do
289 printIOMsg "Reconstruct the phylo from scratch"
290 phyloWithCliques <- pure $ toPhyloStep corpus mapList config
291 writePhylo phyloWithCliquesFile phyloWithCliques
292 pure $ toPhylo (setConfig config phyloWithCliques)
293
294 writePhylo phyloWithLinksFile phyloWithLinks
295
296
297 -- probes
298
299 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
300 -- $ synchronicDistance' phylo 1
301
302 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
303 -- $ inflexionPoints phylo 1
304
305 printIOMsg "End of reconstruction, start the export"
306
307 let dot = toPhyloExport (setConfig config phyloWithLinks)
308
309 let output = configToLabel config
310
311 dotToFile output dot