]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-adaptative-phylo/Main.hs
[FIX] Unary document upload
[gargantext.git] / bin / gargantext-adaptative-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adaptative Phylo binaries
11 -}
12
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
16
17 module Main where
18
19 import Control.Concurrent.Async (mapConcurrently)
20 import Crypto.Hash.SHA256 (hash)
21 import Data.Aeson
22 import Data.Either (Either(..))
23 import Data.List (concat, nub, isSuffixOf)
24 import Data.Maybe (fromMaybe)
25 import Data.String (String)
26 import GHC.IO (FilePath)
27 import qualified Prelude as Prelude
28 import System.Environment
29 import System.Directory (listDirectory,doesFileExist)
30 import Data.Text (Text, unwords, unpack, replace, pack)
31 import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
32
33 import qualified Data.ByteString.Char8 as C8
34 import qualified Data.ByteString.Lazy as Lazy
35 import qualified Data.Vector as Vector
36 import qualified Data.Text as T
37
38 import Gargantext.Prelude
39 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
40 import Gargantext.Core.Text.Context (TermList)
41 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day,
42 csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
43 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
44 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
45 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
46 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
47 import Gargantext.Core.Viz.AdaptativePhylo
48 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloStep)
49 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
50 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
51 -- import Gargantext.API.Ngrams.Prelude (toTermList)
52
53 -- import Debug.Trace (trace)
54
55 data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
56
57 ---------------
58 -- | Tools | --
59 ---------------
60
61
62 -- | To get all the files in a directory or just a file
63 getFilesFromPath :: FilePath -> IO([FilePath])
64 getFilesFromPath path = do
65 if (isSuffixOf "/" path)
66 then (listDirectory path)
67 else return [path]
68
69 ---------------
70 -- | Dates | --
71 ---------------
72
73
74 toMonths :: Integer -> Int -> Int -> Date
75 toMonths y m d = fromIntegral $ cdMonths
76 $ diffGregorianDurationClip (fromGregorian y m d) (fromGregorian 0000 0 0)
77
78
79 toDays :: Integer -> Int -> Int -> Date
80 toDays y m d = fromIntegral
81 $ diffDays (fromGregorian y m d) (fromGregorian 0000 0 0)
82
83
84 toPhyloDate :: Int -> Int -> Int -> TimeUnit -> Date
85 toPhyloDate y m d tu = case tu of
86 Year _ _ _ -> y
87 Month _ _ _ -> toMonths (Prelude.toInteger y) m d
88 Week _ _ _ -> div (toDays (Prelude.toInteger y) m d) 7
89 Day _ _ _ -> toDays (Prelude.toInteger y) m d
90
91
92 toPhyloDate' :: Int -> Int -> Int -> Text
93 toPhyloDate' y m d = pack $ showGregorian $ fromGregorian (Prelude.toInteger y) m d
94
95
96 --------------
97 -- | Json | --
98 --------------
99
100
101 -- | To read and decode a Json file
102 readJson :: FilePath -> IO Lazy.ByteString
103 readJson path = Lazy.readFile path
104
105
106 ----------------
107 -- | Parser | --
108 ----------------
109
110 -- | To filter the Ngrams of a document based on the termList
111 termsInText :: Patterns -> Text -> [Text]
112 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
113
114
115 -- | To transform a Wos file (or [file]) into a list of Docs
116 wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
117 wosToDocs limit patterns time path = do
118 files <- getFilesFromPath path
119 let parseFile' file = do
120 eParsed <- parseFile WOS (path <> file)
121 case eParsed of
122 Right ps -> pure ps
123 Left e -> panic $ "Error: " <> (pack e)
124 take limit
125 <$> map (\d -> let title = fromJust $ _hd_title d
126 abstr = if (isJust $ _hd_abstract d)
127 then fromJust $ _hd_abstract d
128 else ""
129 in Document (toPhyloDate
130 (fromIntegral $ fromJust $ _hd_publication_year d)
131 (fromJust $ _hd_publication_month d)
132 (fromJust $ _hd_publication_day d) time)
133 (toPhyloDate'
134 (fromIntegral $ fromJust $ _hd_publication_year d)
135 (fromJust $ _hd_publication_month d)
136 (fromJust $ _hd_publication_day d))
137 (termsInText patterns $ title <> " " <> abstr) Nothing [])
138 <$> concat
139 <$> mapConcurrently (\file ->
140 filter (\d -> (isJust $ _hd_publication_year d)
141 && (isJust $ _hd_title d))
142 <$> parseFile' file) files
143
144
145 -- To transform a Csv file into a list of Document
146 csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
147 csvToDocs parser patterns time path =
148 case parser of
149 Wos _ -> undefined
150 Csv limit -> do
151 eR <- Csv.readFile path
152 case eR of
153 Right r ->
154 pure $ Vector.toList
155 $ Vector.take limit
156 $ Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row)
157 (fromMaybe Csv.defaultMonth $ csv_publication_month row)
158 (fromMaybe Csv.defaultDay $ csv_publication_day row)
159 time)
160 (toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row)
161 (fromMaybe Csv.defaultMonth $ csv_publication_month row)
162 (fromMaybe Csv.defaultDay $ csv_publication_day row))
163 (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
164 Nothing
165 []
166 ) $ snd r
167 Left e -> panic $ "Error: " <> (pack e)
168 Csv' limit -> Vector.toList
169 <$> Vector.take limit
170 <$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
171 (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row))
172 (termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
173 (Just $ csv'_weight row)
174 [csv'_source row]
175 ) <$> snd <$> Csv.readWeightedCsv path
176
177
178 -- To parse a file into a list of Document
179 fileToDocs' :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
180 fileToDocs' parser path time lst = do
181 let patterns = buildPatterns lst
182 case parser of
183 Wos limit -> wosToDocs limit patterns time path
184 Csv _ -> csvToDocs parser patterns time path
185 Csv' _ -> csvToDocs parser patterns time path
186
187
188 ---------------
189 -- | Label | --
190 ---------------
191
192
193 -- Config time parameters to label
194 timeToLabel :: Config -> [Char]
195 timeToLabel config = case (timeUnit config) of
196 Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
197 Month p s f -> ("time_months"<> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
198 Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
199 Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
200
201
202 seaToLabel :: Config -> [Char]
203 seaToLabel config = case (seaElevation config) of
204 Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
205 Adaptative granularity -> ("sea_adapt" <> (show granularity))
206
207
208 sensToLabel :: Config -> [Char]
209 sensToLabel config = case (phyloProximity config) of
210 Hamming -> undefined
211 WeightedLogJaccard s -> ("WeightedLogJaccard_" <> show s)
212 WeightedLogSim s -> ( "WeightedLogSim-sens_" <> show s)
213
214
215 cliqueToLabel :: Config -> [Char]
216 cliqueToLabel config = case (clique config) of
217 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
218 MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
219
220
221 syncToLabel :: Config -> [Char]
222 syncToLabel config = case (phyloSynchrony config) of
223 ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
224 ByProximityDistribution _ _ -> undefined
225
226 qualToConfig :: Config -> [Char]
227 qualToConfig config = case (phyloQuality config) of
228 Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
229
230
231 -- To set up the export file's label from the configuration
232 configToLabel :: Config -> [Char]
233 configToLabel config = outputPath config
234 <> (unpack $ phyloName config)
235 <> "-" <> (timeToLabel config)
236 <> "-scale_" <> (show (phyloLevel config))
237 <> "-" <> (seaToLabel config)
238 <> "-" <> (sensToLabel config)
239 <> "-" <> (cliqueToLabel config)
240 <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
241 <> "-" <> (syncToLabel config)
242 <> ".dot"
243
244
245 -- To write a sha256 from a set of config's parameters
246 configToSha :: PhyloStage -> Config -> [Char]
247 configToSha stage config = unpack
248 $ replace "/" "-"
249 $ T.pack (show (hash $ C8.pack label))
250 where
251 label :: [Char]
252 label = case stage of
253 PhyloWithCliques -> (corpusPath config)
254 <> (listPath config)
255 <> (timeToLabel config)
256 <> (cliqueToLabel config)
257 PhyloWithLinks -> (corpusPath config)
258 <> (listPath config)
259 <> (timeToLabel config)
260 <> (cliqueToLabel config)
261 <> (sensToLabel config)
262 <> (seaToLabel config)
263 <> (syncToLabel config)
264 <> (qualToConfig config)
265 <> (show (phyloLevel config))
266
267
268 writePhylo :: [Char] -> Phylo -> IO ()
269 writePhylo path phylo = Lazy.writeFile path $ encode phylo
270
271
272 readPhylo :: [Char] -> IO Phylo
273 readPhylo path = do
274 phyloJson <- (eitherDecode <$> readJson path) :: IO (Either String Phylo)
275 case phyloJson of
276 Left err -> do
277 putStrLn err
278 undefined
279 Right phylo -> pure phylo
280
281
282 --------------
283 -- | Main | --
284 --------------
285
286
287 main :: IO ()
288 main = do
289
290 printIOMsg "Starting the reconstruction"
291
292 printIOMsg "Read the configuration file"
293 [args] <- getArgs
294 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
295
296 case jsonArgs of
297 Left err -> putStrLn err
298 Right config -> do
299
300 printIOMsg "Parse the corpus"
301 mapList <- csvMapTermList (listPath config)
302 corpus <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
303 printIOComment (show (length corpus) <> " parsed docs from the corpus")
304
305 printIOMsg "Reconstruct the phylo"
306
307 let phyloWithCliquesFile = (outputPath config) <> "phyloWithCliques_" <> (configToSha PhyloWithCliques config) <> ".json"
308 let phyloWithLinksFile = (outputPath config) <> "phyloWithLinks_" <> (configToSha PhyloWithLinks config) <> ".json"
309
310 phyloWithCliquesExists <- doesFileExist phyloWithCliquesFile
311 phyloWithLinksExists <- doesFileExist phyloWithLinksFile
312
313 -- phyloStep <- if phyloWithCliquesExists
314 -- then do
315 -- printIOMsg "Reconstruct the phylo step from an existing file"
316 -- readPhylo phyloWithCliquesFile
317 -- else do
318 -- printIOMsg "Reconstruct the phylo step from scratch"
319 -- pure $ toPhyloStep corpus mapList config
320
321 -- writePhylo phyloWithCliquesFile phyloStep
322
323 -- let phylo = toPhylo (setConfig config phyloStep)
324
325 phyloWithLinks <- if phyloWithLinksExists
326 then do
327 printIOMsg "Reconstruct the phylo from an existing file with intertemporal links"
328 readPhylo phyloWithLinksFile
329 else do
330 if phyloWithCliquesExists
331 then do
332 printIOMsg "Reconstruct the phylo from an existing file with cliques"
333 phyloWithCliques <- readPhylo phyloWithCliquesFile
334 writePhylo phyloWithCliquesFile phyloWithCliques
335 pure $ toPhylo (setConfig config phyloWithCliques)
336 else do
337 printIOMsg "Reconstruct the phylo from scratch"
338 phyloWithCliques <- pure $ toPhyloStep corpus mapList config
339 writePhylo phyloWithCliquesFile phyloWithCliques
340 pure $ toPhylo (setConfig config phyloWithCliques)
341
342 writePhylo phyloWithLinksFile phyloWithLinks
343
344
345 -- probes
346
347 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
348 -- $ synchronicDistance' phylo 1
349
350 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
351 -- $ inflexionPoints phylo 1
352
353 printIOMsg "End of reconstruction, start the export"
354
355 let dot = toPhyloExport (setConfig config phyloWithLinks)
356
357 let output = configToLabel config
358
359 dotToFile output dot