3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
25 import System.Directory (doesFileExist)
28 import Data.Text (Text, unwords, unlines)
29 import Data.List ((++),concat)
31 import GHC.IO (FilePath)
32 import Gargantext.Prelude
33 import Gargantext.Text.List.CSV (csvGraphTermList)
34 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
35 import qualified Gargantext.Text.Corpus.Parsers.CSV as CSV
36 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
37 import Gargantext.Text.Terms.WithList
38 import Gargantext.Text.Context (TermList)
40 import Control.Monad (mapM)
42 import System.Environment
44 import Gargantext.Viz.Phylo
45 import Gargantext.Viz.Phylo.Tools
46 import Gargantext.Viz.Phylo.LevelMaker
47 import Gargantext.Viz.Phylo.View.Export
48 import Gargantext.Viz.Phylo.View.ViewMaker
50 import Gargantext.Database.Types.Node
53 import Control.Concurrent.Async as CCA (mapConcurrently)
55 import qualified Data.Map as DM
56 import qualified Data.Vector as DV
57 import qualified Data.List as DL
58 import qualified Data.Text as DT
59 import qualified Prelude as P
60 import qualified Data.ByteString.Lazy as L
68 type ListPath = FilePath
69 type FisPath = FilePath
70 type CorpusPath = FilePath
71 data CorpusType = Wos | Csv deriving (Show,Generic)
75 Conf { corpusPath :: CorpusPath
76 , corpusType :: CorpusType
77 , listPath :: ListPath
79 , outputPath :: FilePath
85 , timeFrameTh :: Double
88 , reBranchThr :: Double
91 , clusterSens :: Double
96 , minSizeBranch :: Int
97 } deriving (Show,Generic)
99 instance FromJSON Conf
102 instance FromJSON CorpusType
103 instance ToJSON CorpusType
106 decoder :: P.Either a b -> b
107 decoder (P.Left _) = P.error "Error"
108 decoder (P.Right x) = x
110 -- | Get the conf from a Json file
111 getJson :: FilePath -> IO L.ByteString
112 getJson path = L.readFile path
120 -- | To filter the Ngrams of a document based on the termList
121 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
122 filterTerms patterns (y,d) = (y,termsInText patterns d)
124 --------------------------------------
125 termsInText :: Patterns -> Text -> [Text]
126 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
127 --------------------------------------
130 -- | To transform a Csv nfile into a readable corpus
131 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
132 csvToCorpus limit csv = DV.toList
136 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
137 . snd <$> CSV.readFile csv
140 -- | To transform a Wos nfile into a readable corpus
141 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
142 wosToCorpus limit path = DL.take limit
143 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
144 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
145 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
146 && (isJust $_hyperdataDocument_title d)
147 && (isJust $_hyperdataDocument_abstract d))
149 <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
152 -- | To use the correct parser given a CorpusType
153 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
154 fileToCorpus format limit path = case format of
155 Wos -> wosToCorpus limit path
156 Csv -> csvToCorpus limit path
159 -- | To parse a file into a list of Document
160 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
161 parse format limit path l = do
162 corpus <- fileToCorpus format limit path
163 let patterns = buildPatterns l
164 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
167 -- | To parse an existing Fis file
168 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
169 parseFis path name grain step support clique = do
170 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
173 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
178 P.Right fis -> pure fis
181 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
182 writeFis path name grain step support clique fis = do
183 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
184 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
194 [jsonPath] <- getArgs
196 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
199 P.Left err -> putStrLn err
202 termList <- csvGraphTermList (listPath conf)
204 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
206 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
208 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
210 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
212 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
214 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
215 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
216 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
217 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
219 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
221 let phylo = toPhylo query corpus termList fis'
223 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
225 let view = toPhyloView queryView phylo
227 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
229 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
230 <> "_" <> show (limit conf) <> "_"
231 <> "_" <> show (timeTh conf) <> "_"
232 <> "_" <> show (timeSens conf) <> "_"
233 <> "_" <> show (clusterTh conf) <> "_"
234 <> "_" <> show (clusterSens conf)
237 P.writeFile outputFile $ dotToString $ viewToDot view