3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
26 import Data.Text (Text, unwords)
28 import GHC.IO (FilePath)
29 import Gargantext.Prelude
30 import Gargantext.Text.List.CSV (csvGraphTermList)
31 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
32 import Gargantext.Text.Parsers (FileFormat(..),parseDocs)
33 import Gargantext.Text.Terms.WithList
34 import Gargantext.Text.Context (TermList)
36 import System.Environment
38 import Gargantext.Viz.Phylo
39 import Gargantext.Viz.Phylo.Tools
40 import Gargantext.Viz.Phylo.LevelMaker
41 import Gargantext.Viz.Phylo.View.Export
42 import Gargantext.Viz.Phylo.View.ViewMaker
44 import qualified Data.Map as DM
45 import qualified Data.Vector as DV
46 import qualified Data.List as DL
47 import qualified Data.Text as DT
48 import qualified Prelude as P
49 import qualified Data.ByteString.Lazy as L
57 type ListPath = FilePath
58 type CorpusPath = FilePath
59 data CorpusType = Wos | Csv deriving (Show,Generic)
63 Conf { corpusPath :: CorpusPath
64 , corpusType :: CorpusType
65 , listPath :: ListPath
66 , outputPath :: FilePath
69 } deriving (Show,Generic)
71 instance FromJSON Conf
74 instance FromJSON CorpusType
75 instance ToJSON CorpusType
77 -- | Get the conf from a Json file
78 getJson :: FilePath -> IO L.ByteString
79 getJson path = L.readFile path
87 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
88 filterTerms patterns (year', doc) = (year',termsInText patterns doc)
90 termsInText :: Patterns -> Text -> [Text]
91 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
94 csvToCorpus :: Int -> CorpusPath -> IO ([(Int,Text)])
95 csvToCorpus limit csv = DV.toList
97 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
101 wosToCorpus :: Int -> CorpusPath -> IO ([(Int,Text)])
102 wosToCorpus limit path = undefined
105 fileToCorpus :: CorpusType -> Int -> CorpusPath -> IO ([(Int,Text)])
106 fileToCorpus format limit path = case format of
107 Wos -> wosToCorpus limit path
108 Csv -> csvToCorpus limit path
111 parse :: Limit -> CorpusPath -> TermList -> IO [Document]
112 parse limit corpus lst = do
113 corpus' <- csvToCorpus limit corpus
114 let patterns = buildPatterns lst
115 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus'
126 putStrLn $ show "--| Read the conf |--"
128 [jsonPath] <- getArgs
130 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
133 P.Left err -> putStrLn err
136 putStrLn $ show "--| Parse the corpus |--"
138 termList <- csvGraphTermList (listPath conf)
140 corpus <- parse (limit conf) (corpusPath conf) termList
142 let roots = DL.nub $ DL.concat $ map text corpus
144 putStrLn $ show "--| Build the phylo |--"
146 let query = PhyloQueryBuild (phyloName conf) "" 5 3 defaultFis [] [] (WeightedLogJaccard $ WLJParams 0.00001 10) 2 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.5 10)
148 let queryView = PhyloQueryView 2 Merge False 1 [BranchAge] [defaultSmallBranch] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
150 let phylo = toPhylo query corpus roots termList
152 let view = toPhyloView queryView phylo
154 putStrLn $ show "--| Export the phylo as a dot graph |--"
156 let outputFile = (outputPath conf) P.++ (DT.unpack $ phyloName conf) P.++ ".dot"
158 P.writeFile outputFile $ dotToString $ viewToDot view