3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
25 import System.Directory (doesFileExist)
28 import Data.Text (Text, unwords, unlines)
29 import Data.List ((++),concat)
31 import GHC.IO (FilePath)
32 import Gargantext.Prelude
33 import Gargantext.Text.List.CSV (csvGraphTermList)
34 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
35 import qualified Gargantext.Text.Corpus.Parsers.CSV as CSV
36 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
37 import Gargantext.Text.Terms.WithList
38 import Gargantext.Text.Context (TermList)
40 import Control.Monad (mapM)
42 import System.Environment
44 import Gargantext.Viz.Phylo
45 import Gargantext.Viz.Phylo.Tools
46 import Gargantext.Viz.Phylo.LevelMaker
47 import Gargantext.Viz.Phylo.View.Export
48 import Gargantext.Viz.Phylo.View.ViewMaker
50 import Gargantext.Database.Types.Node
53 import Control.Concurrent.Async as CCA (mapConcurrently)
55 import qualified Data.Map as DM
56 import qualified Data.Vector as DV
57 import qualified Data.List as DL
58 import qualified Data.Text as DT
59 import qualified Prelude as P
60 import qualified Data.ByteString.Lazy as L
68 type ListPath = FilePath
69 type FisPath = FilePath
70 type CorpusPath = FilePath
71 data CorpusType = Wos | Csv deriving (Show,Generic)
75 Conf { corpusPath :: CorpusPath
76 , corpusType :: CorpusType
77 , listPath :: ListPath
79 , outputPath :: FilePath
85 , timeFrameTh :: Double
88 , reBranchThr :: Double
91 , clusterSens :: Double
96 , minSizeBranch :: Int
97 } deriving (Show,Generic)
99 instance FromJSON Conf
102 instance FromJSON CorpusType
103 instance ToJSON CorpusType
106 decoder :: P.Either a b -> b
107 decoder (P.Left _) = P.error "Error"
108 decoder (P.Right x) = x
110 -- | Get the conf from a Json file
111 getJson :: FilePath -> IO L.ByteString
112 getJson path = L.readFile path
120 -- | To filter the Ngrams of a document based on the termList
121 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
122 filterTerms patterns (y,d) = (y,termsInText patterns d)
124 --------------------------------------
125 termsInText :: Patterns -> Text -> [Text]
126 termsInText pats txt = DL.nub
129 $ extractTermsWithList pats txt
130 --------------------------------------
133 -- | To transform a Csv nfile into a readable corpus
134 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
135 csvToCorpus limit csv = DV.toList
139 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
140 . snd <$> CSV.readFile csv
143 -- | To transform a Wos nfile into a readable corpus
144 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
145 wosToCorpus limit path = DL.take limit
146 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
147 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
148 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
149 && (isJust $_hyperdataDocument_title d)
150 && (isJust $_hyperdataDocument_abstract d))
152 <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
155 -- | To use the correct parser given a CorpusType
156 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
157 fileToCorpus format limit path = case format of
158 Wos -> wosToCorpus limit path
159 Csv -> csvToCorpus limit path
162 -- | To parse a file into a list of Document
163 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
164 parse format limit path l = do
165 corpus <- fileToCorpus format limit path
166 let patterns = buildPatterns l
167 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
170 -- | To parse an existing Fis file
171 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
172 parseFis path name grain step support clique = do
173 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
176 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
181 P.Right fis -> pure fis
184 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
185 writeFis path name grain step support clique fis = do
186 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
187 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
197 [jsonPath] <- getArgs
199 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
202 P.Left err -> putStrLn err
205 termList <- csvGraphTermList (listPath conf)
207 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
209 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
211 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
213 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
215 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
217 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
218 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
219 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
220 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
222 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
224 let phylo = toPhylo query corpus termList fis'
226 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
228 let view = toPhyloView queryView phylo
230 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
232 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
233 <> "_" <> show (limit conf) <> "_"
234 <> "_" <> show (timeTh conf) <> "_"
235 <> "_" <> show (timeSens conf) <> "_"
236 <> "_" <> show (clusterTh conf) <> "_"
237 <> "_" <> show (clusterSens conf)
240 P.writeFile outputFile $ dotToString $ viewToDot view