]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
Merge branch 'dev-phylo' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell-gargantex...
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import System.Directory (doesFileExist)
26
27 import Data.Aeson
28 import Data.Text (Text, unwords, unlines)
29 import Data.List ((++),concat)
30 import GHC.Generics
31 import GHC.IO (FilePath)
32 import Gargantext.Prelude
33 import Gargantext.Text.List.CSV (csvGraphTermList)
34 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
35 import qualified Gargantext.Text.Corpus.Parsers.CSV as CSV
36 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
37 import Gargantext.Text.Terms.WithList
38 import Gargantext.Text.Context (TermList)
39
40 import Control.Monad (mapM)
41
42 import System.Environment
43
44 import Gargantext.Viz.Phylo
45 import Gargantext.Viz.Phylo.Tools
46 import Gargantext.Viz.Phylo.LevelMaker
47 import Gargantext.Viz.Phylo.View.Export
48 import Gargantext.Viz.Phylo.View.ViewMaker
49
50 import Gargantext.Database.Types.Node
51 import Data.Maybe
52
53 import Control.Concurrent.Async as CCA (mapConcurrently)
54
55 import qualified Data.Map as DM
56 import qualified Data.Vector as DV
57 import qualified Data.List as DL
58 import qualified Data.Text as DT
59 import qualified Prelude as P
60 import qualified Data.ByteString.Lazy as L
61
62
63 --------------
64 -- | Conf | --
65 --------------
66
67
68 type ListPath = FilePath
69 type FisPath = FilePath
70 type CorpusPath = FilePath
71 data CorpusType = Wos | Csv deriving (Show,Generic)
72 type Limit = Int
73
74 data Conf =
75 Conf { corpusPath :: CorpusPath
76 , corpusType :: CorpusType
77 , listPath :: ListPath
78 , fisPath :: FilePath
79 , outputPath :: FilePath
80 , phyloName :: Text
81 , limit :: Limit
82 , timeGrain :: Int
83 , timeStep :: Int
84 , timeFrame :: Int
85 , timeFrameTh :: Double
86 , timeTh :: Double
87 , timeSens :: Double
88 , reBranchThr :: Double
89 , reBranchNth :: Int
90 , clusterTh :: Double
91 , clusterSens :: Double
92 , phyloLevel :: Int
93 , viewLevel :: Int
94 , fisSupport :: Int
95 , fisClique :: Int
96 , minSizeBranch :: Int
97 } deriving (Show,Generic)
98
99 instance FromJSON Conf
100 instance ToJSON Conf
101
102 instance FromJSON CorpusType
103 instance ToJSON CorpusType
104
105
106 decoder :: P.Either a b -> b
107 decoder (P.Left _) = P.error "Error"
108 decoder (P.Right x) = x
109
110 -- | Get the conf from a Json file
111 getJson :: FilePath -> IO L.ByteString
112 getJson path = L.readFile path
113
114
115 ---------------
116 -- | Parse | --
117 ---------------
118
119
120 -- | To filter the Ngrams of a document based on the termList
121 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
122 filterTerms patterns (y,d) = (y,termsInText patterns d)
123 where
124 --------------------------------------
125 termsInText :: Patterns -> Text -> [Text]
126 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
127 --------------------------------------
128
129
130 -- | To transform a Csv nfile into a readable corpus
131 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
132 csvToCorpus limit csv = DV.toList
133 -- . DV.reverse
134 . DV.take limit
135 -- . DV.reverse
136 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
137 . snd <$> CSV.readFile csv
138
139
140 -- | To transform a Wos nfile into a readable corpus
141 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
142 wosToCorpus limit path = DL.take limit
143 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
144 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
145 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
146 && (isJust $_hyperdataDocument_title d)
147 && (isJust $_hyperdataDocument_abstract d))
148 . concat
149 <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
150
151
152 -- | To use the correct parser given a CorpusType
153 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
154 fileToCorpus format limit path = case format of
155 Wos -> wosToCorpus limit path
156 Csv -> csvToCorpus limit path
157
158
159 -- | To parse a file into a list of Document
160 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
161 parse format limit path l = do
162 corpus <- fileToCorpus format limit path
163 let patterns = buildPatterns l
164 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
165
166
167 -- | To parse an existing Fis file
168 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
169 parseFis path name grain step support clique = do
170 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
171 if fisExists
172 then do
173 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
174 case fisJson of
175 P.Left err -> do
176 putStrLn err
177 pure []
178 P.Right fis -> pure fis
179 else pure []
180
181 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
182 writeFis path name grain step support clique fis = do
183 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
184 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
185
186 --------------
187 -- | Main | --
188 --------------
189
190
191 main :: IO ()
192 main = do
193
194 [jsonPath] <- getArgs
195
196 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
197
198 case confJson of
199 P.Left err -> putStrLn err
200 P.Right conf -> do
201
202 termList <- csvGraphTermList (listPath conf)
203
204 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
205
206 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
207
208 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
209
210 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
211
212 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
213
214 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
215 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
216 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
217 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
218
219 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
220
221 let phylo = toPhylo query corpus termList fis'
222
223 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
224
225 let view = toPhyloView queryView phylo
226
227 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
228
229 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
230 <> "_" <> show (limit conf) <> "_"
231 <> "_" <> show (timeTh conf) <> "_"
232 <> "_" <> show (timeSens conf) <> "_"
233 <> "_" <> show (clusterTh conf) <> "_"
234 <> "_" <> show (clusterSens conf)
235 <> ".dot"
236
237 P.writeFile outputFile $ dotToString $ viewToDot view