]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-adaptative-phylo/Main.hs
[searx] fetching searx json page works now
[gargantext.git] / bin / gargantext-adaptative-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adaptative Phylo binaries
11 -}
12
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
16
17 module Main where
18
19 import Data.Aeson
20 import Data.List (concat, nub, isSuffixOf)
21 import Data.String (String)
22 import Data.Text (Text, unwords, unpack, replace, pack)
23 import Crypto.Hash.SHA256 (hash)
24
25 import Gargantext.Prelude
26 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
27 import Gargantext.Core.Text.Context (TermList)
28 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day,
29 csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
30 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
31 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
32 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
33 import Gargantext.Core.Viz.AdaptativePhylo
34 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloStep)
35 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
36 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
37 -- import Gargantext.API.Ngrams.Prelude (toTermList)
38
39 import GHC.IO (FilePath)
40 import Prelude (Either(Left, Right),toInteger)
41 import System.Environment
42 import System.Directory (listDirectory,doesFileExist)
43 import Control.Concurrent.Async (mapConcurrently)
44
45 import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
46
47 import qualified Data.ByteString.Char8 as C8
48 import qualified Data.ByteString.Lazy as Lazy
49 import qualified Data.Vector as Vector
50 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
51 import qualified Data.Text as T
52
53 -- import Debug.Trace (trace)
54
55 data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
56
57 ---------------
58 -- | Tools | --
59 ---------------
60
61
62 -- | To get all the files in a directory or just a file
63 getFilesFromPath :: FilePath -> IO([FilePath])
64 getFilesFromPath path = do
65 if (isSuffixOf "/" path)
66 then (listDirectory path)
67 else return [path]
68
69 ---------------
70 -- | Dates | --
71 ---------------
72
73
74 toMonths :: Integer -> Int -> Int -> Date
75 toMonths y m d = fromIntegral $ cdMonths
76 $ diffGregorianDurationClip (fromGregorian y m d) (fromGregorian 0000 0 0)
77
78
79 toDays :: Integer -> Int -> Int -> Date
80 toDays y m d = fromIntegral
81 $ diffDays (fromGregorian y m d) (fromGregorian 0000 0 0)
82
83
84 toPhyloDate :: Int -> Int -> Int -> TimeUnit -> Date
85 toPhyloDate y m d tu = case tu of
86 Year _ _ _ -> y
87 Month _ _ _ -> toMonths (toInteger y) m d
88 Week _ _ _ -> div (toDays (toInteger y) m d) 7
89 Day _ _ _ -> toDays (toInteger y) m d
90
91
92 toPhyloDate' :: Int -> Int -> Int -> Text
93 toPhyloDate' y m d = pack $ showGregorian $ fromGregorian (toInteger y) m d
94
95
96 --------------
97 -- | Json | --
98 --------------
99
100
101 -- | To read and decode a Json file
102 readJson :: FilePath -> IO Lazy.ByteString
103 readJson path = Lazy.readFile path
104
105
106 ----------------
107 -- | Parser | --
108 ----------------
109
110 -- | To filter the Ngrams of a document based on the termList
111 termsInText :: Patterns -> Text -> [Text]
112 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
113
114
115 -- | To transform a Wos file (or [file]) into a list of Docs
116 wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO ([Document])
117 wosToDocs limit patterns time path = do
118 files <- getFilesFromPath path
119 take limit
120 <$> map (\d -> let title = fromJust $ _hd_title d
121 abstr = if (isJust $ _hd_abstract d)
122 then fromJust $ _hd_abstract d
123 else ""
124 in Document (toPhyloDate
125 (fromIntegral $ fromJust $ _hd_publication_year d)
126 (fromJust $ _hd_publication_month d)
127 (fromJust $ _hd_publication_day d) time)
128 (toPhyloDate'
129 (fromIntegral $ fromJust $ _hd_publication_year d)
130 (fromJust $ _hd_publication_month d)
131 (fromJust $ _hd_publication_day d))
132 (termsInText patterns $ title <> " " <> abstr) Nothing [])
133 <$> concat
134 <$> mapConcurrently (\file ->
135 filter (\d -> (isJust $ _hd_publication_year d)
136 && (isJust $ _hd_title d))
137 <$> parseFile WOS (path <> file) ) files
138
139
140 -- To transform a Csv file into a list of Document
141 csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO ([Document])
142 csvToDocs parser patterns time path =
143 case parser of
144 Wos _ -> undefined
145 Csv limit -> Vector.toList
146 <$> Vector.take limit
147 <$> Vector.map (\row -> Document (toPhyloDate (csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time)
148 (toPhyloDate' (csv_publication_year row) (csv_publication_month row) (csv_publication_day row))
149 (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
150 Nothing
151 []
152 ) <$> snd <$> Csv.readFile path
153 Csv' limit -> Vector.toList
154 <$> Vector.take limit
155 <$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
156 (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row))
157 (termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
158 (Just $ csv'_weight row)
159 [csv'_source row]
160 ) <$> snd <$> Csv.readWeightedCsv path
161
162
163 -- To parse a file into a list of Document
164 fileToDocs' :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
165 fileToDocs' parser path time lst = do
166 let patterns = buildPatterns lst
167 case parser of
168 Wos limit -> wosToDocs limit patterns time path
169 Csv _ -> csvToDocs parser patterns time path
170 Csv' _ -> csvToDocs parser patterns time path
171
172
173 ---------------
174 -- | Label | --
175 ---------------
176
177
178 -- Config time parameters to label
179 timeToLabel :: Config -> [Char]
180 timeToLabel config = case (timeUnit config) of
181 Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
182 Month p s f -> ("time_months"<> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
183 Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
184 Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
185
186
187 seaToLabel :: Config -> [Char]
188 seaToLabel config = case (seaElevation config) of
189 Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
190 Adaptative granularity -> ("sea_adapt" <> (show granularity))
191
192
193 sensToLabel :: Config -> [Char]
194 sensToLabel config = case (phyloProximity config) of
195 Hamming -> undefined
196 WeightedLogJaccard s -> ("WeightedLogJaccard_" <> show s)
197 WeightedLogSim s -> ( "WeightedLogSim-sens_" <> show s)
198
199
200 cliqueToLabel :: Config -> [Char]
201 cliqueToLabel config = case (clique config) of
202 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
203 MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
204
205
206 syncToLabel :: Config -> [Char]
207 syncToLabel config = case (phyloSynchrony config) of
208 ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
209 ByProximityDistribution _ _ -> undefined
210
211 qualToConfig :: Config -> [Char]
212 qualToConfig config = case (phyloQuality config) of
213 Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
214
215
216 -- To set up the export file's label from the configuration
217 configToLabel :: Config -> [Char]
218 configToLabel config = outputPath config
219 <> (unpack $ phyloName config)
220 <> "-" <> (timeToLabel config)
221 <> "-scale_" <> (show (phyloLevel config))
222 <> "-" <> (seaToLabel config)
223 <> "-" <> (sensToLabel config)
224 <> "-" <> (cliqueToLabel config)
225 <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
226 <> "-" <> (syncToLabel config)
227 <> ".dot"
228
229
230 -- To write a sha256 from a set of config's parameters
231 configToSha :: PhyloStage -> Config -> [Char]
232 configToSha stage config = unpack
233 $ replace "/" "-"
234 $ T.pack (show (hash $ C8.pack label))
235 where
236 label :: [Char]
237 label = case stage of
238 PhyloWithCliques -> (corpusPath config)
239 <> (listPath config)
240 <> (timeToLabel config)
241 <> (cliqueToLabel config)
242 PhyloWithLinks -> (corpusPath config)
243 <> (listPath config)
244 <> (timeToLabel config)
245 <> (cliqueToLabel config)
246 <> (sensToLabel config)
247 <> (seaToLabel config)
248 <> (syncToLabel config)
249 <> (qualToConfig config)
250 <> (show (phyloLevel config))
251
252
253 writePhylo :: [Char] -> Phylo -> IO ()
254 writePhylo path phylo = Lazy.writeFile path $ encode phylo
255
256
257 readPhylo :: [Char] -> IO Phylo
258 readPhylo path = do
259 phyloJson <- (eitherDecode <$> readJson path) :: IO (Either String Phylo)
260 case phyloJson of
261 Left err -> do
262 putStrLn err
263 undefined
264 Right phylo -> pure phylo
265
266
267 --------------
268 -- | Main | --
269 --------------
270
271
272 main :: IO ()
273 main = do
274
275 printIOMsg "Starting the reconstruction"
276
277 printIOMsg "Read the configuration file"
278 [args] <- getArgs
279 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
280
281 case jsonArgs of
282 Left err -> putStrLn err
283 Right config -> do
284
285 printIOMsg "Parse the corpus"
286 mapList <- csvMapTermList (listPath config)
287 corpus <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
288 printIOComment (show (length corpus) <> " parsed docs from the corpus")
289
290 printIOMsg "Reconstruct the phylo"
291
292 let phyloWithCliquesFile = (outputPath config) <> "phyloWithCliques_" <> (configToSha PhyloWithCliques config) <> ".json"
293 let phyloWithLinksFile = (outputPath config) <> "phyloWithLinks_" <> (configToSha PhyloWithLinks config) <> ".json"
294
295 phyloWithCliquesExists <- doesFileExist phyloWithCliquesFile
296 phyloWithLinksExists <- doesFileExist phyloWithLinksFile
297
298 -- phyloStep <- if phyloWithCliquesExists
299 -- then do
300 -- printIOMsg "Reconstruct the phylo step from an existing file"
301 -- readPhylo phyloWithCliquesFile
302 -- else do
303 -- printIOMsg "Reconstruct the phylo step from scratch"
304 -- pure $ toPhyloStep corpus mapList config
305
306 -- writePhylo phyloWithCliquesFile phyloStep
307
308 -- let phylo = toPhylo (setConfig config phyloStep)
309
310 phyloWithLinks <- if phyloWithLinksExists
311 then do
312 printIOMsg "Reconstruct the phylo from an existing file with intertemporal links"
313 readPhylo phyloWithLinksFile
314 else do
315 if phyloWithCliquesExists
316 then do
317 printIOMsg "Reconstruct the phylo from an existing file with cliques"
318 phyloWithCliques <- readPhylo phyloWithCliquesFile
319 writePhylo phyloWithCliquesFile phyloWithCliques
320 pure $ toPhylo (setConfig config phyloWithCliques)
321 else do
322 printIOMsg "Reconstruct the phylo from scratch"
323 phyloWithCliques <- pure $ toPhyloStep corpus mapList config
324 writePhylo phyloWithCliquesFile phyloWithCliques
325 pure $ toPhylo (setConfig config phyloWithCliques)
326
327 writePhylo phyloWithLinksFile phyloWithLinks
328
329
330 -- probes
331
332 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
333 -- $ synchronicDistance' phylo 1
334
335 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
336 -- $ inflexionPoints phylo 1
337
338 printIOMsg "End of reconstruction, start the export"
339
340 let dot = toPhyloExport (setConfig config phyloWithLinks)
341
342 let output = configToLabel config
343
344 dotToFile output dot