]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
[refactoring] add some default extensions to package.yaml
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE StandaloneDeriving #-}
15 {-# LANGUAGE TypeOperators #-}
16 {-# LANGUAGE Strict #-}
17
18 module Main where
19
20 import Control.Concurrent.Async as CCA (mapConcurrently)
21 import Control.Monad (mapM)
22 import Data.Aeson
23 import Data.List ((++),concat)
24 import Data.Maybe
25 import Data.Text (Text, unwords, unlines)
26 import GHC.Generics
27 import GHC.IO (FilePath)
28 import Gargantext.Database.Admin.Types.Node
29 import Gargantext.Prelude
30 import Gargantext.Text.Context (TermList)
31 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
32 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
33 import Gargantext.Text.List.CSV (csvGraphTermList)
34 import Gargantext.Text.Terms.WithList
35 import Gargantext.Viz.Phylo
36 import Gargantext.Viz.Phylo.LevelMaker
37 import Gargantext.Viz.Phylo.Tools
38 import Gargantext.Viz.Phylo.View.Export
39 import Gargantext.Viz.Phylo.View.ViewMaker
40 import System.Directory (doesFileExist)
41 import System.Environment
42 import qualified Data.ByteString.Lazy as L
43 import qualified Data.List as DL
44 import qualified Data.Map as DM
45 import qualified Data.Text as DT
46 import qualified Data.Vector as DV
47 import qualified Gargantext.Text.Corpus.Parsers.CSV as CSV
48 import qualified Prelude as P
49
50
51 --------------
52 -- | Conf | --
53 --------------
54
55
56 type ListPath = FilePath
57 type FisPath = FilePath
58 type CorpusPath = FilePath
59 data CorpusType = Wos | Csv deriving (Show,Generic)
60 type Limit = Int
61
62 data Conf =
63 Conf { corpusPath :: CorpusPath
64 , corpusType :: CorpusType
65 , listPath :: ListPath
66 , fisPath :: FilePath
67 , outputPath :: FilePath
68 , phyloName :: Text
69 , limit :: Limit
70 , timeGrain :: Int
71 , timeStep :: Int
72 , timeFrame :: Int
73 , timeFrameTh :: Double
74 , timeTh :: Double
75 , timeSens :: Double
76 , reBranchThr :: Double
77 , reBranchNth :: Int
78 , clusterTh :: Double
79 , clusterSens :: Double
80 , phyloLevel :: Int
81 , viewLevel :: Int
82 , fisSupport :: Int
83 , fisClique :: Int
84 , minSizeBranch :: Int
85 } deriving (Show,Generic)
86
87 instance FromJSON Conf
88 instance ToJSON Conf
89
90 instance FromJSON CorpusType
91 instance ToJSON CorpusType
92
93
94 decoder :: P.Either a b -> b
95 decoder (P.Left _) = P.error "Error"
96 decoder (P.Right x) = x
97
98 -- | Get the conf from a Json file
99 getJson :: FilePath -> IO L.ByteString
100 getJson path = L.readFile path
101
102
103 ---------------
104 -- | Parse | --
105 ---------------
106
107
108 -- | To filter the Ngrams of a document based on the termList
109 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
110 filterTerms patterns (y,d) = (y,termsInText patterns d)
111 where
112 --------------------------------------
113 termsInText :: Patterns -> Text -> [Text]
114 termsInText pats txt = DL.nub
115 $ DL.concat
116 $ map (map unwords)
117 $ extractTermsWithList pats txt
118 --------------------------------------
119
120
121 -- | To transform a Csv nfile into a readable corpus
122 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
123 csvToCorpus limit csv = DV.toList
124 -- . DV.reverse
125 . DV.take limit
126 -- . DV.reverse
127 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
128 . snd <$> CSV.readFile csv
129
130
131 -- | To transform a Wos nfile into a readable corpus
132 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
133 wosToCorpus limit path = DL.take limit
134 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
135 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
136 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
137 && (isJust $_hyperdataDocument_title d)
138 && (isJust $_hyperdataDocument_abstract d))
139 . concat
140 <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
141
142
143 -- | To use the correct parser given a CorpusType
144 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
145 fileToCorpus format limit path = case format of
146 Wos -> wosToCorpus limit path
147 Csv -> csvToCorpus limit path
148
149
150 -- | To parse a file into a list of Document
151 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
152 parse format limit path l = do
153 corpus <- fileToCorpus format limit path
154 let patterns = buildPatterns l
155 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
156
157
158 -- | To parse an existing Fis file
159 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
160 parseFis path name grain step support clique = do
161 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
162 if fisExists
163 then do
164 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
165 case fisJson of
166 P.Left err -> do
167 putStrLn err
168 pure []
169 P.Right fis -> pure fis
170 else pure []
171
172 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
173 writeFis path name grain step support clique fis = do
174 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
175 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
176
177 --------------
178 -- | Main | --
179 --------------
180
181
182 main :: IO ()
183 main = do
184
185 [jsonPath] <- getArgs
186
187 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
188
189 case confJson of
190 P.Left err -> putStrLn err
191 P.Right conf -> do
192
193 termList <- csvGraphTermList (listPath conf)
194
195 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
196
197 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
198
199 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
200
201 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
202
203 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
204
205 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
206 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
207 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
208 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
209
210 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
211
212 let phylo = toPhylo query corpus termList fis'
213
214 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
215
216 let view = toPhyloView queryView phylo
217
218 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
219
220 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
221 <> "_" <> show (limit conf) <> "_"
222 <> "_" <> show (timeTh conf) <> "_"
223 <> "_" <> show (timeSens conf) <> "_"
224 <> "_" <> show (clusterTh conf) <> "_"
225 <> "_" <> show (clusterSens conf)
226 <> ".dot"
227
228 P.writeFile outputFile $ dotToString $ viewToDot view