3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
26 import Data.Text (Text, unwords)
28 import GHC.IO (FilePath)
29 import Gargantext.Prelude
30 import Gargantext.Text.List.CSV (csvGraphTermList)
31 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
32 import Gargantext.Text.Parsers (FileFormat(..),parseDocs)
33 import Gargantext.Text.Terms.WithList
34 import Gargantext.Text.Context (TermList)
36 import System.Environment
38 import Gargantext.Viz.Phylo
39 import Gargantext.Viz.Phylo.Tools
40 import Gargantext.Viz.Phylo.LevelMaker
41 import Gargantext.Viz.Phylo.View.Export
42 import Gargantext.Viz.Phylo.View.ViewMaker
45 import Gargantext.Database.Types.Node
50 import qualified Data.Map as DM
51 import qualified Data.Vector as DV
52 import qualified Data.List as DL
53 import qualified Data.Text as DT
54 import qualified Prelude as P
55 import qualified Data.ByteString.Lazy as L
63 type ListPath = FilePath
64 type CorpusPath = FilePath
65 data CorpusType = Wos | Csv deriving (Show,Generic)
69 Conf { corpusPath :: CorpusPath
70 , corpusType :: CorpusType
71 , listPath :: ListPath
72 , outputPath :: FilePath
80 , clusterSens :: Double
81 } deriving (Show,Generic)
83 instance FromJSON Conf
86 instance FromJSON CorpusType
87 instance ToJSON CorpusType
89 -- | Get the conf from a Json file
90 getJson :: FilePath -> IO L.ByteString
91 getJson path = L.readFile path
99 -- | To filter the Ngrams of a document based on the termList
100 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
101 filterTerms patterns (y,d) = (y,termsInText patterns d)
103 --------------------------------------
104 termsInText :: Patterns -> Text -> [Text]
105 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
106 --------------------------------------
109 -- | To transform a Csv nfile into a readable corpus
110 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
111 csvToCorpus limit csv = DV.toList
113 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
114 . snd <$> readCsv csv
117 -- | To transform a Wos nfile into a readable corpus
118 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
119 wosToCorpus limit path = DL.take limit
120 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
121 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
122 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
123 && (isJust $_hyperdataDocument_title d)
124 && (isJust $_hyperdataDocument_abstract d))
125 <$> parseDocs WOS path
128 -- | To use the correct parser given a CorpusType
129 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
130 fileToCorpus format limit path = case format of
131 Wos -> wosToCorpus limit path
132 Csv -> csvToCorpus limit path
135 -- | To parse a file into a list of Document
136 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
137 parse format limit path l = do
138 corpus <- fileToCorpus format limit path
139 let patterns = buildPatterns l
140 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
151 putStrLn $ show ("--| Read the conf |--")
153 [jsonPath] <- getArgs
155 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
158 P.Left err -> putStrLn err
161 putStrLn $ show ("--| Parse the corpus |--")
163 termList <- csvGraphTermList (listPath conf)
165 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
167 let roots = DL.nub $ DL.concat $ map text corpus
169 putStrLn $ ("-- | parsed docs : " <> show (length corpus) <> " |--")
171 putStrLn $ show ("--| Build the phylo |--")
173 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
174 defaultFis [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) 2
175 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
177 let queryView = PhyloQueryView 2 Merge False 1 [BranchAge] [defaultSmallBranch] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
179 let phylo = toPhylo query corpus roots termList
181 let view = toPhyloView queryView phylo
183 putStrLn $ show ("--| Export the phylo as a dot graph |--")
185 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
186 <> "_" <> show (limit conf) <> "_"
187 <> "_" <> show (timeTh conf) <> "_"
188 <> "_" <> show (timeSens conf) <> "_"
189 <> "_" <> show (clusterTh conf) <> "_"
190 <> "_" <> show (clusterSens conf)
193 P.writeFile outputFile $ dotToString $ viewToDot view