]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
[FIX] merge dev-phylo
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE StandaloneDeriving #-}
15 {-# LANGUAGE TypeOperators #-}
16 {-# LANGUAGE Strict #-}
17
18 module Main where
19
20 import Control.Concurrent.Async as CCA (mapConcurrently)
21 import Data.Aeson
22 import Data.List ((++),concat)
23 import Data.Maybe
24 import Data.Text (Text, unwords)
25 import GHC.Generics
26 import GHC.IO (FilePath)
27 import Gargantext.Database.Admin.Types.Hyperdata
28 import Gargantext.Prelude
29 import Gargantext.Core.Text.Context (TermList)
30 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
31 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
32 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
33 import Gargantext.Core.Text.Terms.WithList
34 import Gargantext.Core.Viz.Phylo
35 import Gargantext.Core.Viz.Phylo.LevelMaker
36 import Gargantext.Core.Viz.Phylo.Tools
37 import Gargantext.Core.Viz.Phylo.View.Export
38 import Gargantext.Core.Viz.Phylo.View.ViewMaker
39 import System.Directory (doesFileExist)
40 import System.Environment
41 import qualified Data.ByteString.Lazy as L
42 import qualified Data.List as DL
43 import qualified Data.Map as DM
44 import qualified Data.Text as DT
45 import qualified Data.Vector as DV
46 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
47 import qualified Prelude as P
48
49
50 --------------
51 -- | Conf | --
52 --------------
53
54
55 type ListPath = FilePath
56 type FisPath = FilePath
57 type CorpusPath = FilePath
58 data CorpusType = Wos | Csv deriving (Show,Generic)
59 type Limit = Int
60
61 data Conf =
62 Conf { corpusPath :: CorpusPath
63 , corpusType :: CorpusType
64 , listPath :: ListPath
65 , fisPath :: FilePath
66 , outputPath :: FilePath
67 , phyloName :: Text
68 , limit :: Limit
69 , timeGrain :: Int
70 , timeStep :: Int
71 , timeFrame :: Int
72 , timeFrameTh :: Double
73 , timeTh :: Double
74 , timeSens :: Double
75 , reBranchThr :: Double
76 , reBranchNth :: Int
77 , clusterTh :: Double
78 , clusterSens :: Double
79 , phyloLevel :: Int
80 , viewLevel :: Int
81 , fisSupport :: Int
82 , fisClique :: Int
83 , minSizeBranch :: Int
84 } deriving (Show,Generic)
85
86 instance FromJSON Conf
87 instance ToJSON Conf
88
89 instance FromJSON CorpusType
90 instance ToJSON CorpusType
91
92
93 decoder :: P.Either a b -> b
94 decoder (P.Left _) = P.error "Error"
95 decoder (P.Right x) = x
96
97 -- | Get the conf from a Json file
98 getJson :: FilePath -> IO L.ByteString
99 getJson path = L.readFile path
100
101
102 ---------------
103 -- | Parse | --
104 ---------------
105
106
107 -- | To filter the Ngrams of a document based on the termList
108 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
109 filterTerms patterns (y,d) = (y,termsInText patterns d)
110 where
111 --------------------------------------
112 termsInText :: Patterns -> Text -> [Text]
113 termsInText pats txt = DL.nub
114 $ DL.concat
115 $ map (map unwords)
116 $ extractTermsWithList pats txt
117 --------------------------------------
118
119
120 -- | To transform a Csv nfile into a readable corpus
121 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
122 csvToCorpus limit csv = DV.toList
123 -- . DV.reverse
124 . DV.take limit
125 -- . DV.reverse
126 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
127 . snd <$> CSV.readFile csv
128
129
130 -- | To transform a Wos nfile into a readable corpus
131 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
132 wosToCorpus limit path = DL.take limit
133 . map (\d -> ((fromJust $_hd_publication_year d)
134 ,(fromJust $_hd_title d) <> " " <> (fromJust $_hd_abstract d)))
135 . filter (\d -> (isJust $_hd_publication_year d)
136 && (isJust $_hd_title d)
137 && (isJust $_hd_abstract d))
138 . concat
139 <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
140
141
142 -- | To use the correct parser given a CorpusType
143 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
144 fileToCorpus format limit path = case format of
145 Wos -> wosToCorpus limit path
146 Csv -> csvToCorpus limit path
147
148
149 -- | To parse a file into a list of Document
150 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
151 parse format limit path l = do
152 corpus <- fileToCorpus format limit path
153 let patterns = buildPatterns l
154 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
155
156
157 -- | To parse an existing Fis file
158 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
159 parseFis path name grain step support clique = do
160 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
161 if fisExists
162 then do
163 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
164 case fisJson of
165 P.Left err -> do
166 putStrLn err
167 pure []
168 P.Right fis -> pure fis
169 else pure []
170
171 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
172 writeFis path name grain step support clique fis = do
173 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
174 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
175
176 --------------
177 -- | Main | --
178 --------------
179
180
181 main :: IO ()
182 main = do
183
184 [jsonPath] <- getArgs
185
186 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
187
188 case confJson of
189 P.Left err -> putStrLn err
190 P.Right conf -> do
191
192 termList <- csvMapTermList (listPath conf)
193
194 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
195
196 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
197
198 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
199
200 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
201
202 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
203
204 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
205 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
206 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
207 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
208
209 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
210
211 let phylo = toPhylo query corpus termList fis'
212
213 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
214
215 let view = toPhyloView queryView phylo
216
217 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
218
219 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
220 <> "_" <> show (limit conf) <> "_"
221 <> "_" <> show (timeTh conf) <> "_"
222 <> "_" <> show (timeSens conf) <> "_"
223 <> "_" <> show (clusterTh conf) <> "_"
224 <> "_" <> show (clusterSens conf)
225 <> ".dot"
226
227 P.writeFile outputFile $ dotToString $ viewToDot view