{-|
Module      : Main.hs
Description : Gargantext starter binary with Adaptative Phylo
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Adaptative Phylo binaries
-}

{-# LANGUAGE DataKinds          #-}
{-# LANGUAGE DeriveGeneric      #-}
{-# LANGUAGE FlexibleInstances  #-}
{-# LANGUAGE NoImplicitPrelude  #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators      #-}
{-# LANGUAGE OverloadedStrings  #-}
{-# LANGUAGE Strict             #-}

module Main where

import Data.Aeson
import Data.ByteString.Lazy (ByteString)
import Data.Maybe (isJust, fromJust)
import Data.List (concat, nub, isSuffixOf, take)
import Data.String (String)
import Data.Text (Text, unwords, unpack)

import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Text.Context (TermList)
import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
import Gargantext.Text.Corpus.Parsers (FileFormat(..), parseFile)
import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloMaker  (toPhylo)
import Gargantext.Viz.Phylo.PhyloTools  (printIOMsg, printIOComment)
import Gargantext.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
-- import Gargantext.Viz.Phylo.SynchronicClustering (synchronicDistance')

import GHC.IO (FilePath)
import Prelude (Either(..))
import System.Environment
import System.Directory (listDirectory)
import Control.Concurrent.Async (mapConcurrently)

import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv


---------------
-- | Tools | --
---------------


-- | To get all the files in a directory or just a file
getFilesFromPath :: FilePath -> IO [FilePath]
getFilesFromPath path = do
  if (isSuffixOf "/" path)
    then (listDirectory path)
    else return [path]
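-- For example, "corpus/" (with a trailing slash) lists every file inside the
-- directory, while a plain file path such as "corpus/docs.csv" is returned as
-- a singleton list (the paths here are only illustrative).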


--------------
-- | Json | --
--------------


-- | To read a Json file as a lazy ByteString (it is decoded later, in 'main')
readJson :: FilePath -> IO ByteString
readJson path = Lazy.readFile path


----------------
-- | Parser | --
----------------

-- | To filter the Ngrams of a document based on the termList
filterTerms :: Patterns -> (a, Text) -> (a, [Text])
filterTerms patterns (y,d) = (y, termsInText patterns d)
  where
    --------------------------------------
    termsInText :: Patterns -> Text -> [Text]
    termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
    --------------------------------------
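-- A purely illustrative example of what 'filterTerms' yields (the actual
-- output depends on the patterns built from the TermList):
--
--   filterTerms patterns (2019, "dynamic graph clustering of citation networks")
--     == (2019, ["dynamic graph clustering", "citation networks"])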


-- | To transform a Wos file (or a directory of Wos files) into a readable corpus
wosToCorpus :: Int -> FilePath -> IO [(Int,Text)]
wosToCorpus limit path = do
  files <- getFilesFromPath path
  take limit
    <$> map (\d -> let date' = fromJust $ _hyperdataDocument_publication_year d
                       title = fromJust $ _hyperdataDocument_title d
                       abstr = if (isJust $ _hyperdataDocument_abstract d)
                               then fromJust $ _hyperdataDocument_abstract d
                               else ""
                    in (date', title <> " " <> abstr))
    <$> concat
    <$> mapConcurrently (\file ->
          filter (\d -> (isJust $ _hyperdataDocument_publication_year d)
                     && (isJust $ _hyperdataDocument_title d))
            -- listDirectory yields bare file names, so prefix them with the
            -- directory path; a single file is already a full path
            <$> parseFile WOS (if isSuffixOf "/" path then path <> file else file)) files


-- | To transform a Csv file into a readable corpus
csvToCorpus :: Int -> FilePath -> IO [(Int,Text)]
csvToCorpus limit path = Vector.toList
    <$> Vector.take limit
    <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
    <$> snd <$> Csv.readFile path


-- | To use the correct parser given a CorpusParser
fileToCorpus :: CorpusParser -> FilePath -> IO [(Int,Text)]
fileToCorpus parser path = case parser of
  Wos limit -> wosToCorpus limit path
  Csv limit -> csvToCorpus limit path
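-- Illustrative calls (the file names are hypothetical):
--
--   fileToCorpus (Csv 10000) "corpus.csv"  -- keep at most 10000 CSV rows
--   fileToCorpus (Wos 10000) "wos/"        -- parse every Wos file in wos/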


-- | To parse a file into a list of Document
fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
fileToDocs parser path lst = do
  corpus <- fileToCorpus parser path
  let patterns = buildPatterns lst
  pure $ map ((\(y,t) -> Document y t) . filterTerms patterns) corpus


--------------
-- | Main | --
--------------

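{- Usage sketch: the binary takes a single argument, the path to a JSON file
   that is decoded into a Config, e.g.

     gargantext-adaptative-phylo ./config.json

   The exact JSON field names depend on the Aeson instances of Config in
   Gargantext.Viz.AdaptativePhylo; a hypothetical minimal file could look like:

     { "corpusPath"   : "corpus/"
     , "listPath"     : "list.csv"
     , "outputPath"   : "output/"
     , "phyloName"    : "MyPhylo"
     , "corpusParser" : { "tag": "Csv", "contents": 10000 }
     }

   Only corpusPath, listPath, outputPath, phyloName and corpusParser are read
   below; the encoding of CorpusParser shown above is an assumption.
-}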
main :: IO ()
main = do

  printIOMsg "Starting the reconstruction"

  printIOMsg "Read the configuration file"
  [args]   <- getArgs
  jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)

  case jsonArgs of
    Left err     -> putStrLn err
    Right config -> do

      printIOMsg "Parse the corpus"
      mapList <- csvGraphTermList (listPath config)
      corpus  <- fileToDocs (corpusParser config) (corpusPath config) mapList
      printIOComment (show (length corpus) <> " parsed docs from the corpus")

      printIOMsg "Reconstruct the Phylo"

      let phylo = toPhylo corpus mapList config

      -- probes

      -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
      --   $ synchronicDistance' phylo 1

      -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
      --   $ inflexionPoints phylo 1

      printIOMsg "End of reconstruction, start the export"

      let dot = toPhyloExport phylo

      let output = (outputPath config)
                <> (unpack $ phyloName config)
                <> "_V2.dot"

      dotToFile output dot