3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
14 {-# LANGUAGE StandaloneDeriving #-}
15 {-# LANGUAGE TypeOperators #-}
16 {-# LANGUAGE Strict #-}
20 import Control.Concurrent.Async as CCA (mapConcurrently)
22 import Data.List ((++),concat)
24 import Data.Text (Text, unwords)
26 import GHC.IO (FilePath)
27 import Gargantext.Database.Admin.Types.Hyperdata
28 import Gargantext.Prelude
29 import Gargantext.Core.Text.Context (TermList)
30 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
31 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
32 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
33 import Gargantext.Core.Text.Terms.WithList
34 import Gargantext.Core.Viz.Phylo
35 import Gargantext.Core.Viz.Phylo.LevelMaker
36 import Gargantext.Core.Viz.Phylo.Tools
37 import Gargantext.Core.Viz.Phylo.View.Export
38 import Gargantext.Core.Viz.Phylo.View.ViewMaker
39 import System.Directory (doesFileExist)
40 import System.Environment
41 import qualified Data.ByteString.Lazy as L
42 import qualified Data.List as DL
43 import qualified Data.Map as DM
44 import qualified Data.Text as DT
45 import qualified Data.Vector as DV
46 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
47 import qualified Prelude as P
55 type ListPath = FilePath
56 type FisPath = FilePath
57 type CorpusPath = FilePath
58 data CorpusType = Wos | Csv deriving (Show,Generic)
62 Conf { corpusPath :: CorpusPath
63 , corpusType :: CorpusType
64 , listPath :: ListPath
66 , outputPath :: FilePath
72 , timeFrameTh :: Double
75 , reBranchThr :: Double
78 , clusterSens :: Double
83 , minSizeBranch :: Int
84 } deriving (Show,Generic)
86 instance FromJSON Conf
89 instance FromJSON CorpusType
90 instance ToJSON CorpusType
93 decoder :: P.Either a b -> b
94 decoder (P.Left _) = P.error "Error"
95 decoder (P.Right x) = x
97 -- | Get the conf from a Json file
98 getJson :: FilePath -> IO L.ByteString
99 getJson path = L.readFile path
106 -- | To filter the Ngrams of a document based on the termList
107 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
108 filterTerms patterns (y,d) = (y,termsInText patterns d)
111 -- | To transform a Csv nfile into a readable corpus
112 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
113 csvToCorpus limit csv = DV.toList
117 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
118 . snd <$> CSV.readFile csv
121 -- | To transform a Wos nfile into a readable corpus
122 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
123 wosToCorpus limit path = DL.take limit
124 . map (\d -> ((fromJust $_hd_publication_year d)
125 ,(fromJust $_hd_title d) <> " " <> (fromJust $_hd_abstract d)))
126 . filter (\d -> (isJust $_hd_publication_year d)
127 && (isJust $_hd_title d)
128 && (isJust $_hd_abstract d))
130 <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
133 -- | To use the correct parser given a CorpusType
134 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
135 fileToCorpus format limit path = case format of
136 Wos -> wosToCorpus limit path
137 Csv -> csvToCorpus limit path
140 -- | To parse a file into a list of Document
141 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
142 parse format limit path l = do
143 corpus <- fileToCorpus format limit path
144 let patterns = buildPatterns l
145 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
148 -- | To parse an existing Fis file
149 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
150 parseFis path name grain step support clique = do
151 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
154 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
159 P.Right fis -> pure fis
162 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
163 writeFis path name grain step support clique fis = do
164 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
165 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
175 [jsonPath] <- getArgs
177 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
180 P.Left err -> putStrLn err
183 termList <- csvMapTermList (listPath conf)
185 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
187 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
189 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
191 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
193 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
195 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
196 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
197 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
198 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
200 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
202 let phylo = toPhylo query corpus termList fis'
204 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
206 let view = toPhyloView queryView phylo
208 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
210 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
211 <> "_" <> show (limit conf) <> "_"
212 <> "_" <> show (timeTh conf) <> "_"
213 <> "_" <> show (timeSens conf) <> "_"
214 <> "_" <> show (clusterTh conf) <> "_"
215 <> "_" <> show (clusterSens conf)
218 P.writeFile outputFile $ dotToString $ viewToDot view