3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
14 {-# LANGUAGE StandaloneDeriving #-}
15 {-# LANGUAGE TypeOperators #-}
16 {-# LANGUAGE Strict #-}
20 import Control.Concurrent.Async as CCA (mapConcurrently)
21 import Control.Monad (mapM)
23 import Data.List ((++),concat)
25 import Data.Text (Text, unwords, unlines)
27 import GHC.IO (FilePath)
28 import Gargantext.Database.Admin.Types.Node
29 import Gargantext.Prelude
30 import Gargantext.Text.Context (TermList)
31 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
32 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
33 import Gargantext.Text.List.CSV (csvGraphTermList)
34 import Gargantext.Text.Terms.WithList
35 import Gargantext.Viz.Phylo
36 import Gargantext.Viz.Phylo.LevelMaker
37 import Gargantext.Viz.Phylo.Tools
38 import Gargantext.Viz.Phylo.View.Export
39 import Gargantext.Viz.Phylo.View.ViewMaker
40 import System.Directory (doesFileExist)
41 import System.Environment
42 import qualified Data.ByteString.Lazy as L
43 import qualified Data.List as DL
44 import qualified Data.Map as DM
45 import qualified Data.Text as DT
46 import qualified Data.Vector as DV
47 import qualified Gargantext.Text.Corpus.Parsers.CSV as CSV
48 import qualified Prelude as P
56 type ListPath = FilePath
57 type FisPath = FilePath
58 type CorpusPath = FilePath
59 data CorpusType = Wos | Csv deriving (Show,Generic)
63 Conf { corpusPath :: CorpusPath
64 , corpusType :: CorpusType
65 , listPath :: ListPath
67 , outputPath :: FilePath
73 , timeFrameTh :: Double
76 , reBranchThr :: Double
79 , clusterSens :: Double
84 , minSizeBranch :: Int
85 } deriving (Show,Generic)
87 instance FromJSON Conf
90 instance FromJSON CorpusType
91 instance ToJSON CorpusType
94 decoder :: P.Either a b -> b
95 decoder (P.Left _) = P.error "Error"
96 decoder (P.Right x) = x
98 -- | Get the conf from a Json file
99 getJson :: FilePath -> IO L.ByteString
100 getJson path = L.readFile path
108 -- | To filter the Ngrams of a document based on the termList
109 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
110 filterTerms patterns (y,d) = (y,termsInText patterns d)
112 --------------------------------------
113 termsInText :: Patterns -> Text -> [Text]
114 termsInText pats txt = DL.nub
117 $ extractTermsWithList pats txt
118 --------------------------------------
121 -- | To transform a Csv nfile into a readable corpus
122 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
123 csvToCorpus limit csv = DV.toList
127 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
128 . snd <$> CSV.readFile csv
131 -- | To transform a Wos nfile into a readable corpus
132 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
133 wosToCorpus limit path = DL.take limit
134 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
135 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
136 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
137 && (isJust $_hyperdataDocument_title d)
138 && (isJust $_hyperdataDocument_abstract d))
140 <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
143 -- | To use the correct parser given a CorpusType
144 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
145 fileToCorpus format limit path = case format of
146 Wos -> wosToCorpus limit path
147 Csv -> csvToCorpus limit path
150 -- | To parse a file into a list of Document
151 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
152 parse format limit path l = do
153 corpus <- fileToCorpus format limit path
154 let patterns = buildPatterns l
155 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
158 -- | To parse an existing Fis file
159 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
160 parseFis path name grain step support clique = do
161 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
164 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
169 P.Right fis -> pure fis
172 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
173 writeFis path name grain step support clique fis = do
174 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
175 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
185 [jsonPath] <- getArgs
187 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
190 P.Left err -> putStrLn err
193 termList <- csvGraphTermList (listPath conf)
195 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
197 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
199 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
201 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
203 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
205 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
206 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
207 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
208 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
210 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
212 let phylo = toPhylo query corpus termList fis'
214 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
216 let view = toPhyloView queryView phylo
218 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
220 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
221 <> "_" <> show (limit conf) <> "_"
222 <> "_" <> show (timeTh conf) <> "_"
223 <> "_" <> show (timeSens conf) <> "_"
224 <> "_" <> show (clusterTh conf) <> "_"
225 <> "_" <> show (clusterSens conf)
228 P.writeFile outputFile $ dotToString $ viewToDot view