3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Adaptative Phylo binaries
13 {-# LANGUAGE DataKinds #-}
14 {-# LANGUAGE DeriveGeneric #-}
15 {-# LANGUAGE FlexibleInstances #-}
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE StandaloneDeriving #-}
18 {-# LANGUAGE TypeOperators #-}
19 {-# LANGUAGE OverloadedStrings #-}
20 {-# LANGUAGE Strict #-}
25 import Data.ByteString.Lazy (ByteString)
26 import Data.Maybe (isJust, fromJust)
27 import Data.List (concat, nub, isSuffixOf, take)
28 import Data.String (String)
29 import Data.Text (Text, unwords)
31 import Gargantext.Prelude
32 import Gargantext.Database.Types.Node (HyperdataDocument(..))
33 import Gargantext.Text.Context (TermList)
34 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
35 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
36 import Gargantext.Text.List.CSV (csvGraphTermList)
37 import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
38 import Gargantext.Viz.AdaptativePhylo
39 import Gargantext.Viz.Phylo.PhyloMaker (toPhylo)
41 import GHC.IO (FilePath)
42 import Prelude (Either(..))
43 import System.Environment
44 import System.Directory (listDirectory)
45 import Control.Concurrent.Async (mapConcurrently)
47 import qualified Data.ByteString.Lazy as Lazy
48 import qualified Data.Vector as Vector
49 import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv
57 -- | To print an important message as an IO()
58 printIOMsg :: String -> IO ()
63 <> "-- | " <> msg <> "\n" )
66 -- | To print a comment as an IO()
67 printIOComment :: String -> IO ()
69 putStrLn ( "\n" <> cmt <> "\n" )
72 -- | To get all the files in a directory or just a file
73 getFilesFromPath :: FilePath -> IO([FilePath])
74 getFilesFromPath path = do
75 if (isSuffixOf "/" path)
76 then (listDirectory path)
85 -- | To read and decode a Json file
86 readJson :: FilePath -> IO ByteString
87 readJson path = Lazy.readFile path
94 -- | To filter the Ngrams of a document based on the termList
95 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
96 filterTerms patterns (y,d) = (y,termsInText patterns d)
98 --------------------------------------
99 termsInText :: Patterns -> Text -> [Text]
100 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
101 --------------------------------------
104 -- | To transform a Wos file (or [file]) into a readable corpus
105 wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
106 wosToCorpus limit path = do
107 files <- getFilesFromPath path
109 <$> map (\d -> let date' = fromJust $ _hyperdataDocument_publication_year d
110 title = fromJust $ _hyperdataDocument_title d
111 abstr = if (isJust $ _hyperdataDocument_abstract d)
112 then fromJust $ _hyperdataDocument_abstract d
114 in (date', title <> " " <> abstr))
116 <$> mapConcurrently (\file ->
117 filter (\d -> (isJust $ _hyperdataDocument_publication_year d)
118 && (isJust $ _hyperdataDocument_title d))
119 <$> parseFile WOS (path <> file) ) files
122 -- | To transform a Csv file into a readable corpus
123 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
124 csvToCorpus limit path = Vector.toList
125 <$> Vector.take limit
126 <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
127 <$> snd <$> Csv.readFile path
130 -- | To use the correct parser given a CorpusType
131 fileToCorpus :: CorpusParser -> FilePath -> IO ([(Int,Text)])
132 fileToCorpus parser path = case parser of
133 Wos limit -> wosToCorpus limit path
134 Csv limit -> csvToCorpus limit path
137 -- | To parse a file into a list of Document
138 fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
139 fileToDocs parser path lst = do
140 corpus <- fileToCorpus parser path
141 let patterns = buildPatterns lst
142 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
153 printIOMsg "Starting the reconstruction"
155 printIOMsg "Read the configuration file"
157 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
160 Left err -> putStrLn err
163 printIOMsg "Parse the corpus"
164 mapList <- csvGraphTermList (listPath config)
165 corpus <- fileToDocs (corpusParser config) (corpusPath config) mapList
166 printIOComment (show (length corpus) <> " parsed docs from the corpus")
168 printIOMsg "Reconstruct the Phylo"
169 let phylo = toPhylo corpus mapList config
171 printIOMsg "End of reconstruction"