3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
26 import Data.Text (Text, unwords)
28 import GHC.IO (FilePath)
29 import Gargantext.Prelude
30 import Gargantext.Text.List.CSV (csvGraphTermList)
31 import Gargantext.Text.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
32 import qualified Gargantext.Text.Parsers.CSV as CSV
33 import Gargantext.Text.Parsers (FileFormat(..),parseFile)
34 import Gargantext.Text.Terms.WithList
35 import Gargantext.Text.Context (TermList)
37 import System.Environment
39 import Gargantext.Viz.Phylo
40 import Gargantext.Viz.Phylo.Tools
41 import Gargantext.Viz.Phylo.LevelMaker
42 import Gargantext.Viz.Phylo.View.Export
43 import Gargantext.Viz.Phylo.View.ViewMaker
46 import Gargantext.Database.Types.Node
51 import qualified Data.Map as DM
52 import qualified Data.Vector as DV
53 import qualified Data.List as DL
54 import qualified Data.Text as DT
55 import qualified Prelude as P
56 import qualified Data.ByteString.Lazy as L
64 type ListPath = FilePath
65 type CorpusPath = FilePath
66 data CorpusType = Wos | Csv deriving (Show,Generic)
70 Conf { corpusPath :: CorpusPath
71 , corpusType :: CorpusType
72 , listPath :: ListPath
73 , outputPath :: FilePath
81 , clusterSens :: Double
86 , minSizeBranch :: Int
87 } deriving (Show,Generic)
89 instance FromJSON Conf
92 instance FromJSON CorpusType
93 instance ToJSON CorpusType
95 -- | Get the conf from a Json file
96 getJson :: FilePath -> IO L.ByteString
97 getJson path = L.readFile path
105 -- | To filter the Ngrams of a document based on the termList
106 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
107 filterTerms patterns (y,d) = (y,termsInText patterns d)
109 --------------------------------------
110 termsInText :: Patterns -> Text -> [Text]
111 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
112 --------------------------------------
115 -- | To transform a Csv nfile into a readable corpus
116 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
117 csvToCorpus limit csv = DV.toList
119 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
120 . snd <$> CSV.readFile csv
123 -- | To transform a Wos nfile into a readable corpus
124 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
125 wosToCorpus limit path = DL.take limit
126 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
127 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
128 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
129 && (isJust $_hyperdataDocument_title d)
130 && (isJust $_hyperdataDocument_abstract d))
131 <$> parseFile WOS path
134 -- | To use the correct parser given a CorpusType
135 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
136 fileToCorpus format limit path = case format of
137 Wos -> wosToCorpus limit path
138 Csv -> csvToCorpus limit path
141 -- | To parse a file into a list of Document
142 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
143 parse format limit path l = do
144 corpus <- fileToCorpus format limit path
145 let patterns = buildPatterns l
146 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
157 [jsonPath] <- getArgs
159 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
162 P.Left err -> putStrLn err
165 termList <- csvGraphTermList (listPath conf)
167 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
169 let roots = DL.nub $ DL.concat $ map text corpus
171 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
173 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
174 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (phyloLevel conf)
175 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
177 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge] [SizeBranch $ SBParams (minSizeBranch conf)] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
179 let phylo = toPhylo query corpus roots termList
181 let view = toPhyloView queryView phylo
183 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
185 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
186 <> "_" <> show (limit conf) <> "_"
187 <> "_" <> show (timeTh conf) <> "_"
188 <> "_" <> show (timeSens conf) <> "_"
189 <> "_" <> show (clusterTh conf) <> "_"
190 <> "_" <> show (clusterSens conf)
193 P.writeFile outputFile $ dotToString $ viewToDot view