3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
14 {-# LANGUAGE StandaloneDeriving #-}
15 {-# LANGUAGE TypeOperators #-}
16 {-# LANGUAGE Strict #-}
20 import Control.Concurrent.Async as CCA (mapConcurrently)
21 import Control.Monad (mapM)
23 import Data.List ((++),concat)
25 import Data.Text (Text, unwords, unlines)
27 import GHC.IO (FilePath)
28 import Gargantext.Database.Admin.Types.Hyperdata
29 import Gargantext.Database.Admin.Types.Node
30 import Gargantext.Prelude
31 import Gargantext.Text.Context (TermList)
32 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
33 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
34 import Gargantext.Text.List.CSV (csvMapTermList)
35 import Gargantext.Text.Terms.WithList
36 import Gargantext.Viz.Phylo
37 import Gargantext.Viz.Phylo.LevelMaker
38 import Gargantext.Viz.Phylo.Tools
39 import Gargantext.Viz.Phylo.View.Export
40 import Gargantext.Viz.Phylo.View.ViewMaker
41 import System.Directory (doesFileExist)
42 import System.Environment
43 import qualified Data.ByteString.Lazy as L
44 import qualified Data.List as DL
45 import qualified Data.Map as DM
46 import qualified Data.Text as DT
47 import qualified Data.Vector as DV
48 import qualified Gargantext.Text.Corpus.Parsers.CSV as CSV
49 import qualified Prelude as P
57 type ListPath = FilePath
58 type FisPath = FilePath
59 type CorpusPath = FilePath
60 data CorpusType = Wos | Csv deriving (Show,Generic)
64 Conf { corpusPath :: CorpusPath
65 , corpusType :: CorpusType
66 , listPath :: ListPath
68 , outputPath :: FilePath
74 , timeFrameTh :: Double
77 , reBranchThr :: Double
80 , clusterSens :: Double
85 , minSizeBranch :: Int
86 } deriving (Show,Generic)
88 instance FromJSON Conf
91 instance FromJSON CorpusType
92 instance ToJSON CorpusType
95 decoder :: P.Either a b -> b
96 decoder (P.Left _) = P.error "Error"
97 decoder (P.Right x) = x
99 -- | Get the conf from a Json file
100 getJson :: FilePath -> IO L.ByteString
101 getJson path = L.readFile path
109 -- | To filter the Ngrams of a document based on the termList
110 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
111 filterTerms patterns (y,d) = (y,termsInText patterns d)
113 --------------------------------------
114 termsInText :: Patterns -> Text -> [Text]
115 termsInText pats txt = DL.nub
118 $ extractTermsWithList pats txt
119 --------------------------------------
122 -- | To transform a Csv nfile into a readable corpus
123 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
124 csvToCorpus limit csv = DV.toList
128 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
129 . snd <$> CSV.readFile csv
132 -- | To transform a Wos nfile into a readable corpus
133 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
134 wosToCorpus limit path = DL.take limit
135 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
136 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
137 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
138 && (isJust $_hyperdataDocument_title d)
139 && (isJust $_hyperdataDocument_abstract d))
141 <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
144 -- | To use the correct parser given a CorpusType
145 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
146 fileToCorpus format limit path = case format of
147 Wos -> wosToCorpus limit path
148 Csv -> csvToCorpus limit path
151 -- | To parse a file into a list of Document
152 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
153 parse format limit path l = do
154 corpus <- fileToCorpus format limit path
155 let patterns = buildPatterns l
156 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
159 -- | To parse an existing Fis file
160 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
161 parseFis path name grain step support clique = do
162 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
165 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
170 P.Right fis -> pure fis
173 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
174 writeFis path name grain step support clique fis = do
175 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
176 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
186 [jsonPath] <- getArgs
188 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
191 P.Left err -> putStrLn err
194 termList <- csvMapTermList (listPath conf)
196 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
198 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
200 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
202 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
204 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
206 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
207 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
208 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
209 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
211 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
213 let phylo = toPhylo query corpus termList fis'
215 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
217 let view = toPhyloView queryView phylo
219 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
221 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
222 <> "_" <> show (limit conf) <> "_"
223 <> "_" <> show (timeTh conf) <> "_"
224 <> "_" <> show (timeSens conf) <> "_"
225 <> "_" <> show (clusterTh conf) <> "_"
226 <> "_" <> show (clusterSens conf)
229 P.writeFile outputFile $ dotToString $ viewToDot view