3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
25 import Control.Concurrent.Async as CCA (mapConcurrently)
26 import Control.Monad (mapM)
28 import Data.List ((++),concat)
30 import Data.Text (Text, unwords, unlines)
32 import GHC.IO (FilePath)
33 import Gargantext.Database.Admin.Types.Node
34 import Gargantext.Prelude
35 import Gargantext.Text.Context (TermList)
36 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
37 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
38 import Gargantext.Text.List.CSV (csvGraphTermList)
39 import Gargantext.Text.Terms.WithList
40 import Gargantext.Viz.Phylo
41 import Gargantext.Viz.Phylo.LevelMaker
42 import Gargantext.Viz.Phylo.Tools
43 import Gargantext.Viz.Phylo.View.Export
44 import Gargantext.Viz.Phylo.View.ViewMaker
45 import System.Directory (doesFileExist)
46 import System.Environment
47 import qualified Data.ByteString.Lazy as L
48 import qualified Data.List as DL
49 import qualified Data.Map as DM
50 import qualified Data.Text as DT
51 import qualified Data.Vector as DV
52 import qualified Gargantext.Text.Corpus.Parsers.CSV as CSV
53 import qualified Prelude as P
61 type ListPath = FilePath
62 type FisPath = FilePath
63 type CorpusPath = FilePath
64 data CorpusType = Wos | Csv deriving (Show,Generic)
68 Conf { corpusPath :: CorpusPath
69 , corpusType :: CorpusType
70 , listPath :: ListPath
72 , outputPath :: FilePath
78 , timeFrameTh :: Double
81 , reBranchThr :: Double
84 , clusterSens :: Double
89 , minSizeBranch :: Int
90 } deriving (Show,Generic)
92 instance FromJSON Conf
95 instance FromJSON CorpusType
96 instance ToJSON CorpusType
99 decoder :: P.Either a b -> b
100 decoder (P.Left _) = P.error "Error"
101 decoder (P.Right x) = x
103 -- | Get the conf from a Json file
104 getJson :: FilePath -> IO L.ByteString
105 getJson path = L.readFile path
113 -- | To filter the Ngrams of a document based on the termList
114 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
115 filterTerms patterns (y,d) = (y,termsInText patterns d)
117 --------------------------------------
118 termsInText :: Patterns -> Text -> [Text]
119 termsInText pats txt = DL.nub
122 $ extractTermsWithList pats txt
123 --------------------------------------
126 -- | To transform a Csv nfile into a readable corpus
127 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
128 csvToCorpus limit csv = DV.toList
132 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
133 . snd <$> CSV.readFile csv
136 -- | To transform a Wos nfile into a readable corpus
137 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
138 wosToCorpus limit path = DL.take limit
139 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
140 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
141 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
142 && (isJust $_hyperdataDocument_title d)
143 && (isJust $_hyperdataDocument_abstract d))
145 <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
148 -- | To use the correct parser given a CorpusType
149 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
150 fileToCorpus format limit path = case format of
151 Wos -> wosToCorpus limit path
152 Csv -> csvToCorpus limit path
155 -- | To parse a file into a list of Document
156 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
157 parse format limit path l = do
158 corpus <- fileToCorpus format limit path
159 let patterns = buildPatterns l
160 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
163 -- | To parse an existing Fis file
164 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
165 parseFis path name grain step support clique = do
166 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
169 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
174 P.Right fis -> pure fis
177 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
178 writeFis path name grain step support clique fis = do
179 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
180 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
190 [jsonPath] <- getArgs
192 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
195 P.Left err -> putStrLn err
198 termList <- csvGraphTermList (listPath conf)
200 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
202 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
204 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
206 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
208 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
210 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
211 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
212 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
213 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
215 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
217 let phylo = toPhylo query corpus termList fis'
219 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
221 let view = toPhyloView queryView phylo
223 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
225 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
226 <> "_" <> show (limit conf) <> "_"
227 <> "_" <> show (timeTh conf) <> "_"
228 <> "_" <> show (timeSens conf) <> "_"
229 <> "_" <> show (clusterTh conf) <> "_"
230 <> "_" <> show (clusterSens conf)
233 P.writeFile outputFile $ dotToString $ viewToDot view