]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
add rebranching to link distante branches
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import System.Directory (doesFileExist)
26
27 import Data.Aeson
28 import Data.Text (Text, unwords)
29 import Data.List ((++))
30 import GHC.Generics
31 import GHC.IO (FilePath)
32 import Gargantext.Prelude
33 import Gargantext.Text.List.CSV (csvGraphTermList)
34 import Gargantext.Text.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
35 import qualified Gargantext.Text.Parsers.CSV as CSV
36 import Gargantext.Text.Parsers (FileFormat(..),parseFile)
37 import Gargantext.Text.Terms.WithList
38 import Gargantext.Text.Context (TermList)
39
40 import System.Environment
41
42 import Gargantext.Viz.Phylo
43 import Gargantext.Viz.Phylo.Tools
44 import Gargantext.Viz.Phylo.LevelMaker
45 import Gargantext.Viz.Phylo.View.Export
46 import Gargantext.Viz.Phylo.View.ViewMaker
47
48 import Gargantext.Database.Types.Node
49 import Data.Maybe
50
51 import qualified Data.Map as DM
52 import qualified Data.Vector as DV
53 import qualified Data.List as DL
54 import qualified Data.Text as DT
55 import qualified Prelude as P
56 import qualified Data.ByteString.Lazy as L
57
58
59 --------------
60 -- | Conf | --
61 --------------
62
63
64 type ListPath = FilePath
65 type FisPath = FilePath
66 type CorpusPath = FilePath
67 data CorpusType = Wos | Csv deriving (Show,Generic)
68 type Limit = Int
69
70 data Conf =
71 Conf { corpusPath :: CorpusPath
72 , corpusType :: CorpusType
73 , listPath :: ListPath
74 , fisPath :: FilePath
75 , outputPath :: FilePath
76 , phyloName :: Text
77 , limit :: Limit
78 , timeGrain :: Int
79 , timeStep :: Int
80 , timeFrame :: Int
81 , timeTh :: Double
82 , timeSens :: Double
83 , clusterTh :: Double
84 , clusterSens :: Double
85 , phyloLevel :: Int
86 , viewLevel :: Int
87 , fisSupport :: Int
88 , fisClique :: Int
89 , minSizeBranch :: Int
90 } deriving (Show,Generic)
91
92 instance FromJSON Conf
93 instance ToJSON Conf
94
95 instance FromJSON CorpusType
96 instance ToJSON CorpusType
97
98
99 decoder :: P.Either a b -> b
100 decoder (P.Left _) = P.error "Error"
101 decoder (P.Right x) = x
102
103 -- | Get the conf from a Json file
104 getJson :: FilePath -> IO L.ByteString
105 getJson path = L.readFile path
106
107
108 ---------------
109 -- | Parse | --
110 ---------------
111
112
113 -- | To filter the Ngrams of a document based on the termList
114 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
115 filterTerms patterns (y,d) = (y,termsInText patterns d)
116 where
117 --------------------------------------
118 termsInText :: Patterns -> Text -> [Text]
119 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
120 --------------------------------------
121
122
123 -- | To transform a Csv nfile into a readable corpus
124 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
125 csvToCorpus limit csv = DV.toList
126 -- . DV.reverse
127 . DV.take limit
128 -- . DV.reverse
129 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
130 . snd <$> CSV.readFile csv
131
132
133 -- | To transform a Wos nfile into a readable corpus
134 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
135 wosToCorpus limit path = DL.take limit
136 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
137 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
138 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
139 && (isJust $_hyperdataDocument_title d)
140 && (isJust $_hyperdataDocument_abstract d))
141 <$> parseFile WOS path
142
143
144 -- | To use the correct parser given a CorpusType
145 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
146 fileToCorpus format limit path = case format of
147 Wos -> wosToCorpus limit path
148 Csv -> csvToCorpus limit path
149
150
151 -- | To parse a file into a list of Document
152 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
153 parse format limit path l = do
154 corpus <- fileToCorpus format limit path
155 let patterns = buildPatterns l
156 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
157
158
159 -- | To parse an existing Fis file
160 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
161 parseFis path name grain step support clique = do
162 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
163 if fisExists
164 then do
165 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
166 case fisJson of
167 P.Left err -> do
168 putStrLn err
169 pure []
170 P.Right fis -> pure fis
171 else pure []
172
173 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
174 writeFis path name grain step support clique fis = do
175 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
176 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
177
178 --------------
179 -- | Main | --
180 --------------
181
182
183 main :: IO ()
184 main = do
185
186 [jsonPath] <- getArgs
187
188 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
189
190 case confJson of
191 P.Left err -> putStrLn err
192 P.Right conf -> do
193
194 termList <- csvGraphTermList (listPath conf)
195
196 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
197
198 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
199
200 let roots = DL.nub $ DL.concat $ map text corpus
201
202 putStrLn $ ("\n" <> show (length roots) <> " parsed foundation roots")
203
204 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
205
206 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
207
208 let mFis = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
209
210 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
211 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (phyloLevel conf)
212 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
213
214 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge] [SizeBranch $ SBParams (minSizeBranch conf)] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
215
216 let phylo = toPhylo query corpus roots termList mFis
217
218 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
219
220 let view = toPhyloView queryView phylo
221
222 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
223
224 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
225 <> "_" <> show (limit conf) <> "_"
226 <> "_" <> show (timeTh conf) <> "_"
227 <> "_" <> show (timeSens conf) <> "_"
228 <> "_" <> show (clusterTh conf) <> "_"
229 <> "_" <> show (clusterSens conf)
230 <> ".dot"
231
232 P.writeFile outputFile $ dotToString $ viewToDot view