3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Adaptative Phylo binaries
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
20 import Data.ByteString.Lazy (ByteString)
21 import Data.Maybe (isJust, fromJust)
22 import Data.List (concat, nub, isSuffixOf, take)
23 import Data.String (String)
24 import Data.Text (Text, unwords, unpack)
26 import Gargantext.Prelude
27 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
28 import Gargantext.Core.Text.Context (TermList)
29 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
30 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
31 import Gargantext.Core.Text.List.CSV (csvMapTermList)
32 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
33 import Gargantext.Core.Viz.AdaptativePhylo
34 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo)
35 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment)
36 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
37 -- import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicDistance')
39 import GHC.IO (FilePath)
40 import Prelude (Either(..))
41 import System.Environment
42 import System.Directory (listDirectory)
43 import Control.Concurrent.Async (mapConcurrently)
45 import qualified Data.ByteString.Lazy as Lazy
46 import qualified Data.Vector as Vector
47 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
55 -- | To get all the files in a directory or just a file
56 getFilesFromPath :: FilePath -> IO([FilePath])
57 getFilesFromPath path = do
58 if (isSuffixOf "/" path)
59 then (listDirectory path)
68 -- | To read and decode a Json file
69 readJson :: FilePath -> IO ByteString
70 readJson path = Lazy.readFile path
77 -- | To filter the Ngrams of a document based on the termList
78 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
79 filterTerms patterns (y,d) = (y,termsInText patterns d)
81 --------------------------------------
82 termsInText :: Patterns -> Text -> [Text]
83 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
84 --------------------------------------
87 -- | To transform a Wos file (or [file]) into a readable corpus
88 wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
89 wosToCorpus limit path = do
90 files <- getFilesFromPath path
92 <$> map (\d -> let date' = fromJust $ _hd_publication_year d
93 title = fromJust $ _hd_title d
94 abstr = if (isJust $ _hd_abstract d)
95 then fromJust $ _hd_abstract d
97 in (date', title <> " " <> abstr))
99 <$> mapConcurrently (\file ->
100 filter (\d -> (isJust $ _hd_publication_year d)
101 && (isJust $ _hd_title d))
102 <$> parseFile WOS (path <> file) ) files
105 -- | To transform a Csv file into a readable corpus
106 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
107 csvToCorpus limit path = Vector.toList
108 <$> Vector.take limit
109 <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
110 <$> snd <$> Csv.readFile path
113 -- | To use the correct parser given a CorpusType
114 fileToCorpus :: CorpusParser -> FilePath -> IO ([(Int,Text)])
115 fileToCorpus parser path = case parser of
116 Wos limit -> wosToCorpus limit path
117 Csv limit -> csvToCorpus limit path
120 -- | To parse a file into a list of Document
121 fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
122 fileToDocs parser path lst = do
123 corpus <- fileToCorpus parser path
124 let patterns = buildPatterns lst
125 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
136 printIOMsg "Starting the reconstruction"
138 printIOMsg "Read the configuration file"
140 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
143 Left err -> putStrLn err
146 printIOMsg "Parse the corpus"
147 mapList <- csvMapTermList (listPath config)
148 corpus <- fileToDocs (corpusParser config) (corpusPath config) mapList
149 printIOComment (show (length corpus) <> " parsed docs from the corpus")
151 printIOMsg "Reconstruct the Phylo"
153 let phylo = toPhylo corpus mapList config
157 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
158 -- $ synchronicDistance' phylo 1
160 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
161 -- $ inflexionPoints phylo 1
163 printIOMsg "End of reconstruction, start the export"
165 let dot = toPhyloExport phylo
167 let clq = case (clique config) of
168 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
169 MaxClique s -> "clique_" <> (show s)
171 let sensibility = case (phyloProximity config) of
173 WeightedLogJaccard s -> (show s)
175 let sync = case (phyloSynchrony config) of
176 ByProximityThreshold t _ _ _ -> (show t)
177 ByProximityDistribution _ _ -> undefined
180 -- let br_length = case (take 1 $ exportFilter config) of
181 -- ByBranchSize t -> (show t)
184 let output = (outputPath config)
185 <> (unpack $ phyloName config)
187 <> "-level_" <> (show (phyloLevel config))
188 <> "-sens_" <> sensibility
189 -- <> "-lenght_" <> br_length
190 <> "-scale_" <> (show (_qua_granularity $ phyloQuality config))