]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
[Jobs] Instances Show ScraperStatus
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import System.Directory (doesFileExist)
26
27 import Data.Aeson
28 import Data.Text (Text, unwords, unlines)
29 import Data.List ((++),concat)
30 import GHC.Generics
31 import GHC.IO (FilePath)
32 import Gargantext.Prelude
33 import Gargantext.Text.List.CSV (csvGraphTermList)
34 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
35 import qualified Gargantext.Text.Corpus.Parsers.CSV as CSV
36 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
37 import Gargantext.Text.Terms.WithList
38 import Gargantext.Text.Context (TermList)
39
40 import Control.Monad (mapM)
41
42 import System.Environment
43
44 import Gargantext.Viz.Phylo
45 import Gargantext.Viz.Phylo.Tools
46 import Gargantext.Viz.Phylo.LevelMaker
47 import Gargantext.Viz.Phylo.View.Export
48 import Gargantext.Viz.Phylo.View.ViewMaker
49
50 import Gargantext.Database.Types.Node
51 import Data.Maybe
52
53 import Control.Concurrent.Async as CCA (mapConcurrently)
54
55 import qualified Data.Map as DM
56 import qualified Data.Vector as DV
57 import qualified Data.List as DL
58 import qualified Data.Text as DT
59 import qualified Prelude as P
60 import qualified Data.ByteString.Lazy as L
61
62
63 --------------
64 -- | Conf | --
65 --------------
66
67
68 type ListPath = FilePath
69 type FisPath = FilePath
70 type CorpusPath = FilePath
71 data CorpusType = Wos | Csv deriving (Show,Generic)
72 type Limit = Int
73
74 data Conf =
75 Conf { corpusPath :: CorpusPath
76 , corpusType :: CorpusType
77 , listPath :: ListPath
78 , fisPath :: FilePath
79 , outputPath :: FilePath
80 , phyloName :: Text
81 , limit :: Limit
82 , timeGrain :: Int
83 , timeStep :: Int
84 , timeFrame :: Int
85 , timeFrameTh :: Double
86 , timeTh :: Double
87 , timeSens :: Double
88 , reBranchThr :: Double
89 , reBranchNth :: Int
90 , clusterTh :: Double
91 , clusterSens :: Double
92 , phyloLevel :: Int
93 , viewLevel :: Int
94 , fisSupport :: Int
95 , fisClique :: Int
96 , minSizeBranch :: Int
97 } deriving (Show,Generic)
98
99 instance FromJSON Conf
100 instance ToJSON Conf
101
102 instance FromJSON CorpusType
103 instance ToJSON CorpusType
104
105
106 decoder :: P.Either a b -> b
107 decoder (P.Left _) = P.error "Error"
108 decoder (P.Right x) = x
109
110 -- | Get the conf from a Json file
111 getJson :: FilePath -> IO L.ByteString
112 getJson path = L.readFile path
113
114
115 ---------------
116 -- | Parse | --
117 ---------------
118
119
120 -- | To filter the Ngrams of a document based on the termList
121 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
122 filterTerms patterns (y,d) = (y,termsInText patterns d)
123 where
124 --------------------------------------
125 termsInText :: Patterns -> Text -> [Text]
126 termsInText pats txt = DL.nub
127 $ DL.concat
128 $ map (map unwords)
129 $ extractTermsWithList pats txt
130 --------------------------------------
131
132
133 -- | To transform a Csv nfile into a readable corpus
134 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
135 csvToCorpus limit csv = DV.toList
136 -- . DV.reverse
137 . DV.take limit
138 -- . DV.reverse
139 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
140 . snd <$> CSV.readFile csv
141
142
143 -- | To transform a Wos nfile into a readable corpus
144 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
145 wosToCorpus limit path = DL.take limit
146 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
147 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
148 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
149 && (isJust $_hyperdataDocument_title d)
150 && (isJust $_hyperdataDocument_abstract d))
151 . concat
152 <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
153
154
155 -- | To use the correct parser given a CorpusType
156 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
157 fileToCorpus format limit path = case format of
158 Wos -> wosToCorpus limit path
159 Csv -> csvToCorpus limit path
160
161
162 -- | To parse a file into a list of Document
163 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
164 parse format limit path l = do
165 corpus <- fileToCorpus format limit path
166 let patterns = buildPatterns l
167 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
168
169
170 -- | To parse an existing Fis file
171 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
172 parseFis path name grain step support clique = do
173 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
174 if fisExists
175 then do
176 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
177 case fisJson of
178 P.Left err -> do
179 putStrLn err
180 pure []
181 P.Right fis -> pure fis
182 else pure []
183
184 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
185 writeFis path name grain step support clique fis = do
186 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
187 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
188
189 --------------
190 -- | Main | --
191 --------------
192
193
194 main :: IO ()
195 main = do
196
197 [jsonPath] <- getArgs
198
199 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
200
201 case confJson of
202 P.Left err -> putStrLn err
203 P.Right conf -> do
204
205 termList <- csvGraphTermList (listPath conf)
206
207 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
208
209 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
210
211 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
212
213 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
214
215 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
216
217 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
218 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
219 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
220 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
221
222 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
223
224 let phylo = toPhylo query corpus termList fis'
225
226 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
227
228 let view = toPhyloView queryView phylo
229
230 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
231
232 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
233 <> "_" <> show (limit conf) <> "_"
234 <> "_" <> show (timeTh conf) <> "_"
235 <> "_" <> show (timeSens conf) <> "_"
236 <> "_" <> show (clusterTh conf) <> "_"
237 <> "_" <> show (clusterSens conf)
238 <> ".dot"
239
240 P.writeFile outputFile $ dotToString $ viewToDot view