3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Adaptative Phylo binaries
13 {-# LANGUAGE DataKinds #-}
14 {-# LANGUAGE DeriveGeneric #-}
15 {-# LANGUAGE FlexibleInstances #-}
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE StandaloneDeriving #-}
18 {-# LANGUAGE TypeOperators #-}
19 {-# LANGUAGE OverloadedStrings #-}
20 {-# LANGUAGE Strict #-}
25 import Data.ByteString.Lazy (ByteString)
26 import Data.Maybe (isJust, fromJust)
27 import Data.List (concat, nub, isSuffixOf, take)
28 import Data.String (String)
29 import Data.Text (Text, unwords, unpack)
31 import Gargantext.Prelude
32 import Gargantext.Database.Admin.Types.Node (HyperdataDocument(..))
33 import Gargantext.Text.Context (TermList)
34 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
35 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
36 import Gargantext.Text.List.CSV (csvGraphTermList)
37 import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
38 import Gargantext.Viz.AdaptativePhylo
39 import Gargantext.Viz.Phylo.PhyloMaker (toPhylo)
40 import Gargantext.Viz.Phylo.PhyloTools (printIOMsg, printIOComment)
41 import Gargantext.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
42 -- import Gargantext.Viz.Phylo.SynchronicClustering (synchronicDistance')
44 import GHC.IO (FilePath)
45 import Prelude (Either(..))
46 import System.Environment
47 import System.Directory (listDirectory)
48 import Control.Concurrent.Async (mapConcurrently)
50 import qualified Data.ByteString.Lazy as Lazy
51 import qualified Data.Vector as Vector
52 import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv
60 -- | To get all the files in a directory or just a file
61 getFilesFromPath :: FilePath -> IO([FilePath])
62 getFilesFromPath path = do
63 if (isSuffixOf "/" path)
64 then (listDirectory path)
73 -- | To read and decode a Json file
74 readJson :: FilePath -> IO ByteString
75 readJson path = Lazy.readFile path
82 -- | To filter the Ngrams of a document based on the termList
83 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
84 filterTerms patterns (y,d) = (y,termsInText patterns d)
86 --------------------------------------
87 termsInText :: Patterns -> Text -> [Text]
88 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
89 --------------------------------------
92 -- | To transform a Wos file (or [file]) into a readable corpus
93 wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
94 wosToCorpus limit path = do
95 files <- getFilesFromPath path
97 <$> map (\d -> let date' = fromJust $ _hyperdataDocument_publication_year d
98 title = fromJust $ _hyperdataDocument_title d
99 abstr = if (isJust $ _hyperdataDocument_abstract d)
100 then fromJust $ _hyperdataDocument_abstract d
102 in (date', title <> " " <> abstr))
104 <$> mapConcurrently (\file ->
105 filter (\d -> (isJust $ _hyperdataDocument_publication_year d)
106 && (isJust $ _hyperdataDocument_title d))
107 <$> parseFile WOS (path <> file) ) files
110 -- | To transform a Csv file into a readable corpus
111 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
112 csvToCorpus limit path = Vector.toList
113 <$> Vector.take limit
114 <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
115 <$> snd <$> Csv.readFile path
118 -- | To use the correct parser given a CorpusType
119 fileToCorpus :: CorpusParser -> FilePath -> IO ([(Int,Text)])
120 fileToCorpus parser path = case parser of
121 Wos limit -> wosToCorpus limit path
122 Csv limit -> csvToCorpus limit path
125 -- | To parse a file into a list of Document
126 fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
127 fileToDocs parser path lst = do
128 corpus <- fileToCorpus parser path
129 let patterns = buildPatterns lst
130 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
141 printIOMsg "Starting the reconstruction"
143 printIOMsg "Read the configuration file"
145 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
148 Left err -> putStrLn err
151 printIOMsg "Parse the corpus"
152 mapList <- csvGraphTermList (listPath config)
153 corpus <- fileToDocs (corpusParser config) (corpusPath config) mapList
154 printIOComment (show (length corpus) <> " parsed docs from the corpus")
156 printIOMsg "Reconstruct the Phylo"
158 let phylo = toPhylo corpus mapList config
162 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
163 -- $ synchronicDistance' phylo 1
165 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
166 -- $ inflexionPoints phylo 1
168 printIOMsg "End of reconstruction, start the export"
170 let dot = toPhyloExport phylo
172 let output = (outputPath config)
173 <> (unpack $ phyloName config)