]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-adaptative-phylo/Main.hs
end of phylo clustering
[gargantext.git] / bin / gargantext-adaptative-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adaptative Phylo binaries
11 -}
12
13 {-# LANGUAGE DataKinds #-}
14 {-# LANGUAGE DeriveGeneric #-}
15 {-# LANGUAGE FlexibleInstances #-}
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE StandaloneDeriving #-}
18 {-# LANGUAGE TypeOperators #-}
19 {-# LANGUAGE OverloadedStrings #-}
20 {-# LANGUAGE Strict #-}
21
22 module Main where
23
24 import Data.Aeson
25 import Data.ByteString.Lazy (ByteString)
26 import Data.Maybe (isJust, fromJust)
27 import Data.List (concat, nub, isSuffixOf, take)
28 import Data.String (String)
29 import Data.Text (Text, unwords, unpack)
30
31 import Gargantext.Prelude
32 import Gargantext.Database.Types.Node (HyperdataDocument(..))
33 import Gargantext.Text.Context (TermList)
34 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
35 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
36 import Gargantext.Text.List.CSV (csvGraphTermList)
37 import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
38 import Gargantext.Viz.AdaptativePhylo
39 import Gargantext.Viz.Phylo.PhyloMaker (toPhylo)
40 import Gargantext.Viz.Phylo.PhyloTools (printIOMsg, printIOComment)
41 import Gargantext.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
42
43 import GHC.IO (FilePath)
44 import Prelude (Either(..))
45 import System.Environment
46 import System.Directory (listDirectory)
47 import Control.Concurrent.Async (mapConcurrently)
48
49 import qualified Data.ByteString.Lazy as Lazy
50 import qualified Data.Vector as Vector
51 import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv
52
53
54 ---------------
55 -- | Tools | --
56 ---------------
57
58
59 -- | To get all the files in a directory or just a file
60 getFilesFromPath :: FilePath -> IO([FilePath])
61 getFilesFromPath path = do
62 if (isSuffixOf "/" path)
63 then (listDirectory path)
64 else return [path]
65
66
67 --------------
68 -- | Json | --
69 --------------
70
71
72 -- | To read and decode a Json file
73 readJson :: FilePath -> IO ByteString
74 readJson path = Lazy.readFile path
75
76
77 ----------------
78 -- | Parser | --
79 ----------------
80
81 -- | To filter the Ngrams of a document based on the termList
82 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
83 filterTerms patterns (y,d) = (y,termsInText patterns d)
84 where
85 --------------------------------------
86 termsInText :: Patterns -> Text -> [Text]
87 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
88 --------------------------------------
89
90
91 -- | To transform a Wos file (or [file]) into a readable corpus
92 wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
93 wosToCorpus limit path = do
94 files <- getFilesFromPath path
95 take limit
96 <$> map (\d -> let date' = fromJust $ _hyperdataDocument_publication_year d
97 title = fromJust $ _hyperdataDocument_title d
98 abstr = if (isJust $ _hyperdataDocument_abstract d)
99 then fromJust $ _hyperdataDocument_abstract d
100 else ""
101 in (date', title <> " " <> abstr))
102 <$> concat
103 <$> mapConcurrently (\file ->
104 filter (\d -> (isJust $ _hyperdataDocument_publication_year d)
105 && (isJust $ _hyperdataDocument_title d))
106 <$> parseFile WOS (path <> file) ) files
107
108
109 -- | To transform a Csv file into a readable corpus
110 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
111 csvToCorpus limit path = Vector.toList
112 <$> Vector.take limit
113 <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
114 <$> snd <$> Csv.readFile path
115
116
117 -- | To use the correct parser given a CorpusType
118 fileToCorpus :: CorpusParser -> FilePath -> IO ([(Int,Text)])
119 fileToCorpus parser path = case parser of
120 Wos limit -> wosToCorpus limit path
121 Csv limit -> csvToCorpus limit path
122
123
124 -- | To parse a file into a list of Document
125 fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
126 fileToDocs parser path lst = do
127 corpus <- fileToCorpus parser path
128 let patterns = buildPatterns lst
129 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
130
131
132 --------------
133 -- | Main | --
134 --------------
135
136
137 main :: IO ()
138 main = do
139
140 printIOMsg "Starting the reconstruction"
141
142 printIOMsg "Read the configuration file"
143 [args] <- getArgs
144 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
145
146 case jsonArgs of
147 Left err -> putStrLn err
148 Right config -> do
149
150 printIOMsg "Parse the corpus"
151 mapList <- csvGraphTermList (listPath config)
152 corpus <- fileToDocs (corpusParser config) (corpusPath config) mapList
153 printIOComment (show (length corpus) <> " parsed docs from the corpus")
154
155 printIOMsg "Reconstruct the Phylo"
156
157 let phylo = toPhylo corpus mapList config
158
159 printIOMsg "End of reconstruction, start the export"
160
161 let dot = toPhyloExport phylo
162
163 printIOMsg "End of export to dot"
164
165 let output = (outputPath config)
166 <> (unpack $ phyloName config)
167 <> "_V2.dot"
168
169 dotToFile output dot