3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Adaptative Phylo binaries
13 {-# LANGUAGE DataKinds #-}
14 {-# LANGUAGE DeriveGeneric #-}
15 {-# LANGUAGE FlexibleInstances #-}
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE StandaloneDeriving #-}
18 {-# LANGUAGE TypeOperators #-}
19 {-# LANGUAGE OverloadedStrings #-}
20 {-# LANGUAGE Strict #-}
25 import Data.ByteString.Lazy (ByteString)
26 import Data.Maybe (isJust, fromJust)
27 import Data.List (concat, nub, isSuffixOf, take)
28 import Data.String (String)
29 import Data.Text (Text, unwords, unpack)
31 import Gargantext.Prelude
32 import Gargantext.Database.Types.Node (HyperdataDocument(..))
33 import Gargantext.Text.Context (TermList)
34 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
35 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
36 import Gargantext.Text.List.CSV (csvGraphTermList)
37 import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
38 import Gargantext.Viz.AdaptativePhylo
39 import Gargantext.Viz.Phylo.PhyloMaker (toPhylo)
40 import Gargantext.Viz.Phylo.PhyloTools (printIOMsg, printIOComment)
41 import Gargantext.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
43 import GHC.IO (FilePath)
44 import Prelude (Either(..))
45 import System.Environment
46 import System.Directory (listDirectory)
47 import Control.Concurrent.Async (mapConcurrently)
49 import qualified Data.ByteString.Lazy as Lazy
50 import qualified Data.Vector as Vector
51 import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv
59 -- | To get all the files in a directory or just a file
60 getFilesFromPath :: FilePath -> IO([FilePath])
61 getFilesFromPath path = do
62 if (isSuffixOf "/" path)
63 then (listDirectory path)
72 -- | To read and decode a Json file
73 readJson :: FilePath -> IO ByteString
74 readJson path = Lazy.readFile path
81 -- | To filter the Ngrams of a document based on the termList
82 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
83 filterTerms patterns (y,d) = (y,termsInText patterns d)
85 --------------------------------------
86 termsInText :: Patterns -> Text -> [Text]
87 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
88 --------------------------------------
91 -- | To transform a Wos file (or [file]) into a readable corpus
92 wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
93 wosToCorpus limit path = do
94 files <- getFilesFromPath path
96 <$> map (\d -> let date' = fromJust $ _hyperdataDocument_publication_year d
97 title = fromJust $ _hyperdataDocument_title d
98 abstr = if (isJust $ _hyperdataDocument_abstract d)
99 then fromJust $ _hyperdataDocument_abstract d
101 in (date', title <> " " <> abstr))
103 <$> mapConcurrently (\file ->
104 filter (\d -> (isJust $ _hyperdataDocument_publication_year d)
105 && (isJust $ _hyperdataDocument_title d))
106 <$> parseFile WOS (path <> file) ) files
109 -- | To transform a Csv file into a readable corpus
110 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
111 csvToCorpus limit path = Vector.toList
112 <$> Vector.take limit
113 <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
114 <$> snd <$> Csv.readFile path
117 -- | To use the correct parser given a CorpusType
118 fileToCorpus :: CorpusParser -> FilePath -> IO ([(Int,Text)])
119 fileToCorpus parser path = case parser of
120 Wos limit -> wosToCorpus limit path
121 Csv limit -> csvToCorpus limit path
124 -- | To parse a file into a list of Document
125 fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
126 fileToDocs parser path lst = do
127 corpus <- fileToCorpus parser path
128 let patterns = buildPatterns lst
129 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
140 printIOMsg "Starting the reconstruction"
142 printIOMsg "Read the configuration file"
144 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
147 Left err -> putStrLn err
150 printIOMsg "Parse the corpus"
151 mapList <- csvGraphTermList (listPath config)
152 corpus <- fileToDocs (corpusParser config) (corpusPath config) mapList
153 printIOComment (show (length corpus) <> " parsed docs from the corpus")
155 printIOMsg "Reconstruct the Phylo"
157 let phylo = toPhylo corpus mapList config
159 printIOMsg "End of reconstruction, start the export"
161 let dot = toPhyloExport phylo
163 let output = (outputPath config)
164 <> (unpack $ phyloName config)