]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
some refactoring & adding generality, specifivity, etc for the ngrams
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import System.Directory (doesFileExist)
26
27 import Data.Aeson
28 import Data.Text (Text, unwords, unlines)
29 import Data.List ((++))
30 import GHC.Generics
31 import GHC.IO (FilePath)
32 import Gargantext.Prelude
33 import Gargantext.Text.List.CSV (csvGraphTermList)
34 import Gargantext.Text.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
35 import qualified Gargantext.Text.Parsers.CSV as CSV
36 import Gargantext.Text.Parsers (FileFormat(..),parseFile)
37 import Gargantext.Text.Terms.WithList
38 import Gargantext.Text.Context (TermList)
39
40 import System.Environment
41
42 import Gargantext.Viz.Phylo
43 import Gargantext.Viz.Phylo.Tools
44 import Gargantext.Viz.Phylo.LevelMaker
45 import Gargantext.Viz.Phylo.View.Export
46 import Gargantext.Viz.Phylo.View.ViewMaker
47
48 import Gargantext.Database.Types.Node
49 import Data.Maybe
50
51 import qualified Data.Map as DM
52 import qualified Data.Vector as DV
53 import qualified Data.List as DL
54 import qualified Data.Text as DT
55 import qualified Prelude as P
56 import qualified Data.ByteString.Lazy as L
57
58
59 --------------
60 -- | Conf | --
61 --------------
62
63
64 type ListPath = FilePath
65 type FisPath = FilePath
66 type CorpusPath = FilePath
67 data CorpusType = Wos | Csv deriving (Show,Generic)
68 type Limit = Int
69
70 data Conf =
71 Conf { corpusPath :: CorpusPath
72 , corpusType :: CorpusType
73 , listPath :: ListPath
74 , fisPath :: FilePath
75 , outputPath :: FilePath
76 , phyloName :: Text
77 , limit :: Limit
78 , timeGrain :: Int
79 , timeStep :: Int
80 , timeFrame :: Int
81 , timeFrameTh :: Double
82 , timeTh :: Double
83 , timeSens :: Double
84 , reBranchThr :: Double
85 , reBranchNth :: Int
86 , clusterTh :: Double
87 , clusterSens :: Double
88 , phyloLevel :: Int
89 , viewLevel :: Int
90 , fisSupport :: Int
91 , fisClique :: Int
92 , minSizeBranch :: Int
93 } deriving (Show,Generic)
94
95 instance FromJSON Conf
96 instance ToJSON Conf
97
98 instance FromJSON CorpusType
99 instance ToJSON CorpusType
100
101
102 decoder :: P.Either a b -> b
103 decoder (P.Left _) = P.error "Error"
104 decoder (P.Right x) = x
105
106 -- | Get the conf from a Json file
107 getJson :: FilePath -> IO L.ByteString
108 getJson path = L.readFile path
109
110
111 ---------------
112 -- | Parse | --
113 ---------------
114
115
116 -- | To filter the Ngrams of a document based on the termList
117 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
118 filterTerms patterns (y,d) = (y,termsInText patterns d)
119 where
120 --------------------------------------
121 termsInText :: Patterns -> Text -> [Text]
122 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
123 --------------------------------------
124
125
126 -- | To transform a Csv nfile into a readable corpus
127 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
128 csvToCorpus limit csv = DV.toList
129 -- . DV.reverse
130 . DV.take limit
131 -- . DV.reverse
132 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
133 . snd <$> CSV.readFile csv
134
135
136 -- | To transform a Wos nfile into a readable corpus
137 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
138 wosToCorpus limit path = DL.take limit
139 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
140 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
141 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
142 && (isJust $_hyperdataDocument_title d)
143 && (isJust $_hyperdataDocument_abstract d))
144 <$> parseFile WOS path
145
146
147 -- | To use the correct parser given a CorpusType
148 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
149 fileToCorpus format limit path = case format of
150 Wos -> wosToCorpus limit path
151 Csv -> csvToCorpus limit path
152
153
154 -- | To parse a file into a list of Document
155 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
156 parse format limit path l = do
157 corpus <- fileToCorpus format limit path
158 let patterns = buildPatterns l
159 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
160
161
162 -- | To parse an existing Fis file
163 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
164 parseFis path name grain step support clique = do
165 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
166 if fisExists
167 then do
168 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
169 case fisJson of
170 P.Left err -> do
171 putStrLn err
172 pure []
173 P.Right fis -> pure fis
174 else pure []
175
176 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
177 writeFis path name grain step support clique fis = do
178 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
179 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
180
181 --------------
182 -- | Main | --
183 --------------
184
185
186 main :: IO ()
187 main = do
188
189 [jsonPath] <- getArgs
190
191 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
192
193 case confJson of
194 P.Left err -> putStrLn err
195 P.Right conf -> do
196
197 termList <- csvGraphTermList (listPath conf)
198
199 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
200
201 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
202
203 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
204
205 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
206
207 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
208
209 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
210 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
211 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
212 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
213
214 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge] [SizeBranch $ SBParams (minSizeBranch conf)] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
215
216 let phylo = toPhylo query corpus termList fis'
217
218 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
219
220 let view = toPhyloView queryView phylo
221
222 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
223
224 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
225 <> "_" <> show (limit conf) <> "_"
226 <> "_" <> show (timeTh conf) <> "_"
227 <> "_" <> show (timeSens conf) <> "_"
228 <> "_" <> show (clusterTh conf) <> "_"
229 <> "_" <> show (clusterSens conf)
230 <> ".dot"
231
232 P.writeFile outputFile $ dotToString $ viewToDot view