]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
Merge branch 'dev' into 111-dev-refactor-text-corpus-api-with-conduit
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adaptative Phylo binaries
11 -}
12
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
16
17 module Main where
18
19 -- import Debug.Trace (trace)
20 import Control.Concurrent.Async (mapConcurrently)
21 import Crypto.Hash.SHA256 (hash)
22 import Data.Aeson
23 import Data.Either (Either(..), fromRight)
24 import Data.List (concat, nub, isSuffixOf)
25 import Data.List.Split
26 import Data.Maybe (fromMaybe)
27 import Data.String (String)
28 import Data.Text (Text, unwords, unpack, replace, pack)
29 import GHC.IO (FilePath)
30 import Gargantext.API.Ngrams.Prelude (toTermList)
31 import Gargantext.API.Ngrams.Types
32 import Gargantext.Core.Text.Context (TermList)
33 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
34 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
35 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
36 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
37 import Gargantext.Core.Types.Main (ListType(..))
38 import Gargantext.Core.Viz.Phylo
39 import Gargantext.Core.Viz.Phylo.API.Tools
40 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
41 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloStep)
42 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
43 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import System.Directory (listDirectory,doesFileExist)
47 import System.Environment
48 import qualified Data.ByteString.Char8 as C8
49 import qualified Data.Text as T
50 import qualified Data.Vector as Vector
51 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
52
53 data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
54
55 ---------------
56 -- | Tools | --
57 ---------------
58
59 -- | To get all the files in a directory or just a file
60 getFilesFromPath :: FilePath -> IO [FilePath]
61 getFilesFromPath path = do
62 if (isSuffixOf "/" path)
63 then (listDirectory path)
64 else return [path]
65
66 ----------------
67 -- | Parser | --
68 ----------------
69
70 -- | To filter the Ngrams of a document based on the termList
71 termsInText :: Patterns -> Text -> [Text]
72 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
73
74
75 -- | To transform a Wos file (or [file]) into a list of Docs
76 wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
77 wosToDocs limit patterns time path = do
78 files <- getFilesFromPath path
79 take limit
80 <$> map (\d -> let title = fromJust $ _hd_title d
81 abstr = if (isJust $ _hd_abstract d)
82 then fromJust $ _hd_abstract d
83 else ""
84 in Document (toPhyloDate
85 (fromIntegral $ fromJust $ _hd_publication_year d)
86 (fromJust $ _hd_publication_month d)
87 (fromJust $ _hd_publication_day d) time)
88 (toPhyloDate'
89 (fromIntegral $ fromJust $ _hd_publication_year d)
90 (fromJust $ _hd_publication_month d)
91 (fromJust $ _hd_publication_day d) time)
92 (termsInText patterns $ title <> " " <> abstr) Nothing [])
93 <$> concat
94 <$> mapConcurrently (\file ->
95 filter (\d -> (isJust $ _hd_publication_year d)
96 && (isJust $ _hd_title d))
97 <$> fromRight [] <$> parseFile WOS (path <> file) ) files
98
99
100 -- To transform a Csv file into a list of Document
101 csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
102 csvToDocs parser patterns time path =
103 case parser of
104 Wos _ -> undefined
105 Csv limit -> Vector.toList
106 <$> Vector.take limit
107 <$> Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
108 (toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
109 (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
110 Nothing
111 []
112 ) <$> snd <$> either (\err -> panic $ cs $ "CSV error" <> (show err)) identity <$> Csv.readFile path
113 Csv' limit -> Vector.toList
114 <$> Vector.take limit
115 <$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
116 (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
117 (termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
118 (Just $ csv'_weight row)
119 (map (T.strip . pack) $ splitOn ";" (unpack $ (csv'_source row)))
120 ) <$> snd <$> Csv.readWeightedCsv path
121
122
123 -- To parse a file into a list of Document
124 fileToDocs' :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
125 fileToDocs' parser path time lst = do
126 let patterns = buildPatterns lst
127 case parser of
128 Wos limit -> wosToDocs limit patterns time path
129 Csv _ -> csvToDocs parser patterns time path
130 Csv' _ -> csvToDocs parser patterns time path
131
132
133 ---------------
134 -- | Label | --
135 ---------------
136
137
138 -- Config time parameters to label
139 timeToLabel :: PhyloConfig -> [Char]
140 timeToLabel config = case (timeUnit config) of
141 Epoch p s f -> ("time_epochs" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
142 Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
143 Month p s f -> ("time_months" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
144 Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
145 Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
146
147
148 seaToLabel :: PhyloConfig -> [Char]
149 seaToLabel config = case (seaElevation config) of
150 Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
151 Adaptative granularity -> ("sea_adapt" <> (show granularity))
152
153
154 sensToLabel :: PhyloConfig -> [Char]
155 sensToLabel config = case (phyloProximity config) of
156 Hamming _ -> undefined
157 WeightedLogJaccard s -> ("WeightedLogJaccard_" <> show s)
158 WeightedLogSim s -> ( "WeightedLogSim-sens_" <> show s)
159
160
161 cliqueToLabel :: PhyloConfig -> [Char]
162 cliqueToLabel config = case (clique config) of
163 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
164 MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
165
166
167 syncToLabel :: PhyloConfig -> [Char]
168 syncToLabel config = case (phyloSynchrony config) of
169 ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
170 ByProximityDistribution _ _ -> undefined
171
172 qualToConfig :: PhyloConfig -> [Char]
173 qualToConfig config = case (phyloQuality config) of
174 Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
175
176
177 -- To set up the export file's label from the configuration
178 configToLabel :: PhyloConfig -> [Char]
179 configToLabel config = outputPath config
180 <> (unpack $ phyloName config)
181 <> "-" <> (timeToLabel config)
182 <> "-scale_" <> (show (phyloLevel config))
183 <> "-" <> (seaToLabel config)
184 <> "-" <> (sensToLabel config)
185 <> "-" <> (cliqueToLabel config)
186 <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
187 <> "-" <> (syncToLabel config)
188 <> ".dot"
189
190
191 -- To write a sha256 from a set of config's parameters
192 configToSha :: PhyloStage -> PhyloConfig -> [Char]
193 configToSha stage config = unpack
194 $ replace "/" "-"
195 $ T.pack (show (hash $ C8.pack label))
196 where
197 label :: [Char]
198 label = case stage of
199 PhyloWithCliques -> (corpusPath config)
200 <> (listPath config)
201 <> (timeToLabel config)
202 <> (cliqueToLabel config)
203 PhyloWithLinks -> (corpusPath config)
204 <> (listPath config)
205 <> (timeToLabel config)
206 <> (cliqueToLabel config)
207 <> (sensToLabel config)
208 <> (seaToLabel config)
209 <> (syncToLabel config)
210 <> (qualToConfig config)
211 <> (show (phyloLevel config))
212
213
214 readListV4 :: [Char] -> IO NgramsList
215 readListV4 path = do
216 listJson <- (eitherDecode <$> readJson path) :: IO (Either String NgramsList)
217 case listJson of
218 Left err -> do
219 putStrLn err
220 undefined
221 Right listV4 -> pure listV4
222
223
224 fileToList :: ListParser -> FilePath -> IO TermList
225 fileToList parser path =
226 case parser of
227 V3 -> csvMapTermList path
228 V4 -> fromJust
229 <$> toTermList MapTerm NgramsTerms
230 <$> readListV4 path
231
232
233 --------------
234 -- | Main | --
235 --------------
236
237
238 main :: IO ()
239 main = do
240
241 printIOMsg "Starting the reconstruction"
242
243 printIOMsg "Read the configuration file"
244 [args] <- getArgs
245 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String PhyloConfig)
246
247 case jsonArgs of
248 Left err -> putStrLn err
249 Right config -> do
250
251 printIOMsg "Parse the corpus"
252 mapList <- fileToList (listParser config) (listPath config)
253 corpus <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
254 printIOComment (show (length corpus) <> " parsed docs from the corpus")
255
256 printIOMsg "Reconstruct the phylo"
257
258 let phyloWithCliquesFile = (outputPath config) <> "phyloWithCliques_" <> (configToSha PhyloWithCliques config) <> ".json"
259 let phyloWithLinksFile = (outputPath config) <> "phyloWithLinks_" <> (configToSha PhyloWithLinks config) <> ".json"
260
261 phyloWithCliquesExists <- doesFileExist phyloWithCliquesFile
262 phyloWithLinksExists <- doesFileExist phyloWithLinksFile
263
264 -- phyloStep <- if phyloWithCliquesExists
265 -- then do
266 -- printIOMsg "Reconstruct the phylo step from an existing file"
267 -- readPhylo phyloWithCliquesFile
268 -- else do
269 -- printIOMsg "Reconstruct the phylo step from scratch"
270 -- pure $ toPhyloStep corpus mapList config
271
272 -- writePhylo phyloWithCliquesFile phyloStep
273
274 -- let phylo = toPhylo (setConfig config phyloStep)
275
276 phyloWithLinks <- if phyloWithLinksExists
277 then do
278 printIOMsg "Reconstruct the phylo from an existing file with intertemporal links"
279 readPhylo phyloWithLinksFile
280 else do
281 if phyloWithCliquesExists
282 then do
283 printIOMsg "Reconstruct the phylo from an existing file with cliques"
284 phyloWithCliques <- readPhylo phyloWithCliquesFile
285 writePhylo phyloWithCliquesFile phyloWithCliques
286 pure $ toPhylo (setConfig config phyloWithCliques)
287 else do
288 printIOMsg "Reconstruct the phylo from scratch"
289 phyloWithCliques <- pure $ toPhyloStep corpus mapList config
290 writePhylo phyloWithCliquesFile phyloWithCliques
291 pure $ toPhylo (setConfig config phyloWithCliques)
292
293 writePhylo phyloWithLinksFile phyloWithLinks
294
295
296 -- probes
297
298 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
299 -- $ synchronicDistance' phylo 1
300
301 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
302 -- $ inflexionPoints phylo 1
303
304 printIOMsg "End of reconstruction, start the export"
305
306 let dot = toPhyloExport (setConfig config phyloWithLinks)
307
308 let output = configToLabel config
309
310 dotToFile output dot