3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Adaptative Phylo binaries
13 {-# LANGUAGE DataKinds #-}
14 {-# LANGUAGE DeriveGeneric #-}
15 {-# LANGUAGE FlexibleInstances #-}
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE StandaloneDeriving #-}
18 {-# LANGUAGE TypeOperators #-}
19 {-# LANGUAGE OverloadedStrings #-}
20 {-# LANGUAGE Strict #-}
25 import Data.ByteString.Lazy (ByteString)
27 import Data.List (concat, nub)
28 import Data.String (String)
29 import Data.Text (Text, unwords)
31 import Gargantext.Prelude
32 import Gargantext.Text.Context (TermList)
33 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
34 import Gargantext.Text.List.CSV (csvGraphTermList)
35 import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
36 import Gargantext.Viz.AdaptativePhylo
38 import GHC.IO (FilePath)
39 import Prelude (Either(..))
40 import System.Environment
42 import qualified Data.ByteString.Lazy as Lazy
43 import qualified Data.Vector as Vector
44 import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv
52 -- | To print an important message as an IO()
53 printIOMsg :: String -> IO ()
58 <> "-- | " <> msg <> "\n" )
61 -- | To print a comment as an IO()
62 printIOComment :: String -> IO ()
64 putStrLn ( "\n" <> cmt <> "\n" )
67 -- | To read and decode a Json file
68 readJson :: FilePath -> IO ByteString
69 readJson path = Lazy.readFile path
71 -- | To filter the Ngrams of a document based on the termList
72 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
73 filterTerms patterns (y,d) = (y,termsInText patterns d)
75 --------------------------------------
76 termsInText :: Patterns -> Text -> [Text]
77 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
78 --------------------------------------
81 -- | To transform a Csv nfile into a readable corpus
82 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
83 csvToCorpus limit path = Vector.toList
85 <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
86 <$> snd <$> Csv.readFile path
89 -- | To use the correct parser given a CorpusType
90 fileToCorpus :: CorpusParser -> Int -> FilePath -> IO ([(Int,Text)])
91 fileToCorpus parser limit path = case parser of
92 -- To do Wos from legacy Main.hs
94 Csv -> csvToCorpus limit path
97 -- | To parse a file into a list of Document
98 fileToDocs :: CorpusParser -> Int -> FilePath -> TermList -> IO [Document]
99 fileToDocs parser limit path lst = do
100 corpus <- fileToCorpus parser limit path
101 let patterns = buildPatterns lst
102 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
113 printIOMsg "Starting the reconstruction"
115 printIOMsg "Read the configuration file"
117 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
120 Left err -> putStrLn err
123 printIOMsg "Parse the corpus"
124 mapList <- csvGraphTermList (listPath config)
125 corpus <- fileToDocs (corpusParser config) (corpusLimit config) (corpusPath config) mapList
126 printIOComment (show (length corpus) <> " parsed docs from the corpus")