3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
25 import System.Directory (doesFileExist)
28 import Data.Text (Text, unwords)
29 import Data.List ((++))
31 import GHC.IO (FilePath)
32 import Gargantext.Prelude
33 import Gargantext.Text.List.CSV (csvGraphTermList)
34 import Gargantext.Text.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
35 import qualified Gargantext.Text.Parsers.CSV as CSV
36 import Gargantext.Text.Parsers (FileFormat(..),parseFile)
37 import Gargantext.Text.Terms.WithList
38 import Gargantext.Text.Context (TermList)
40 import System.Environment
42 import Gargantext.Viz.Phylo
43 import Gargantext.Viz.Phylo.Tools
44 import Gargantext.Viz.Phylo.LevelMaker
45 import Gargantext.Viz.Phylo.View.Export
46 import Gargantext.Viz.Phylo.View.ViewMaker
48 import Gargantext.Database.Types.Node
51 import qualified Data.Map as DM
52 import qualified Data.Vector as DV
53 import qualified Data.List as DL
54 import qualified Data.Text as DT
55 import qualified Prelude as P
56 import qualified Data.ByteString.Lazy as L
64 type ListPath = FilePath
65 type FisPath = FilePath
66 type CorpusPath = FilePath
67 data CorpusType = Wos | Csv deriving (Show,Generic)
71 Conf { corpusPath :: CorpusPath
72 , corpusType :: CorpusType
73 , listPath :: ListPath
75 , outputPath :: FilePath
84 , clusterSens :: Double
89 , minSizeBranch :: Int
90 } deriving (Show,Generic)
92 instance FromJSON Conf
95 instance FromJSON CorpusType
96 instance ToJSON CorpusType
99 decoder :: P.Either a b -> b
100 decoder (P.Left _) = P.error "Error"
101 decoder (P.Right x) = x
103 -- | Get the conf from a Json file
104 getJson :: FilePath -> IO L.ByteString
105 getJson path = L.readFile path
113 -- | To filter the Ngrams of a document based on the termList
114 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
115 filterTerms patterns (y,d) = (y,termsInText patterns d)
117 --------------------------------------
118 termsInText :: Patterns -> Text -> [Text]
119 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
120 --------------------------------------
123 -- | To transform a Csv nfile into a readable corpus
124 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
125 csvToCorpus limit csv = DV.toList
129 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
130 . snd <$> CSV.readFile csv
133 -- | To transform a Wos nfile into a readable corpus
134 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
135 wosToCorpus limit path = DL.take limit
136 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
137 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
138 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
139 && (isJust $_hyperdataDocument_title d)
140 && (isJust $_hyperdataDocument_abstract d))
141 <$> parseFile WOS path
144 -- | To use the correct parser given a CorpusType
145 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
146 fileToCorpus format limit path = case format of
147 Wos -> wosToCorpus limit path
148 Csv -> csvToCorpus limit path
151 -- | To parse a file into a list of Document
152 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
153 parse format limit path l = do
154 corpus <- fileToCorpus format limit path
155 let patterns = buildPatterns l
156 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
159 -- | To parse an existing Fis file
160 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
161 parseFis path name grain step support clique = do
162 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
165 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
170 P.Right fis -> pure fis
173 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
174 writeFis path name grain step support clique fis = do
175 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
176 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
186 [jsonPath] <- getArgs
188 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
191 P.Left err -> putStrLn err
194 termList <- csvGraphTermList (listPath conf)
196 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
198 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
200 let roots = DL.nub $ DL.concat $ map text corpus
202 putStrLn $ ("\n" <> show (length roots) <> " parsed foundation roots")
204 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
206 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
208 let mFis = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
210 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
211 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (phyloLevel conf)
212 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
214 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge] [SizeBranch $ SBParams (minSizeBranch conf)] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
216 let phylo = toPhylo query corpus roots termList mFis
218 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
220 let view = toPhyloView queryView phylo
222 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
224 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
225 <> "_" <> show (limit conf) <> "_"
226 <> "_" <> show (timeTh conf) <> "_"
227 <> "_" <> show (timeSens conf) <> "_"
228 <> "_" <> show (clusterTh conf) <> "_"
229 <> "_" <> show (clusterSens conf)
232 P.writeFile outputFile $ dotToString $ viewToDot view