]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
[TOOLING] sql queries to remove unused documents and ngrams from database. Usage...
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE StandaloneDeriving #-}
15 {-# LANGUAGE TypeOperators #-}
16 {-# LANGUAGE Strict #-}
17
18 module Main where
19
20 import Control.Concurrent.Async as CCA (mapConcurrently)
21 import Data.Aeson
22 import Data.List ((++),concat)
23 import Data.Maybe
24 import Data.Text (Text, unwords)
25 import GHC.Generics
26 import GHC.IO (FilePath)
27 import System.Directory (doesFileExist)
28 import System.Environment
29 import qualified Data.ByteString.Lazy as L
30 import qualified Data.List as DL
31 import qualified Data.Map as DM
32 import qualified Data.Text as DT
33 import qualified Data.Vector as DV
34 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
35 import qualified Prelude as P
36
37 import Gargantext.Database.Admin.Types.Hyperdata
38 import Gargantext.Prelude
39 import Gargantext.Core.Text.Context (TermList)
40 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
41 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
42 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
43 import Gargantext.Core.Text.Terms.WithList
44 import Gargantext.Core.Viz.Phylo
45 import Gargantext.Core.Viz.Phylo.LevelMaker
46 import Gargantext.Core.Viz.Phylo.Tools
47 import Gargantext.Core.Viz.Phylo.View.Export
48 import Gargantext.Core.Viz.Phylo.View.ViewMaker
49
50
51 --------------
52 -- | Conf | --
53 --------------
54
55
56 type ListPath = FilePath
57 type FisPath = FilePath
58 type CorpusPath = FilePath
59 data CorpusType = Wos | Csv deriving (Show,Generic)
60 type Limit = Int
61
62 data Conf =
63 Conf { corpusPath :: CorpusPath
64 , corpusType :: CorpusType
65 , listPath :: ListPath
66 , fisPath :: FilePath
67 , outputPath :: FilePath
68 , phyloName :: Text
69 , limit :: Limit
70 , timeGrain :: Int
71 , timeStep :: Int
72 , timeFrame :: Int
73 , timeFrameTh :: Double
74 , timeTh :: Double
75 , timeSens :: Double
76 , reBranchThr :: Double
77 , reBranchNth :: Int
78 , clusterTh :: Double
79 , clusterSens :: Double
80 , phyloLevel :: Int
81 , viewLevel :: Int
82 , fisSupport :: Int
83 , fisClique :: Int
84 , minSizeBranch :: Int
85 } deriving (Show,Generic)
86
87 instance FromJSON Conf
88 instance ToJSON Conf
89
90 instance FromJSON CorpusType
91 instance ToJSON CorpusType
92
93
94 decoder :: P.Either a b -> b
95 decoder (P.Left _) = P.error "Error"
96 decoder (P.Right x) = x
97
98 -- | Get the conf from a Json file
99 getJson :: FilePath -> IO L.ByteString
100 getJson path = L.readFile path
101
102
103 ---------------
104 -- | Parse | --
105 ---------------
106
107 -- | To filter the Ngrams of a document based on the termList
108 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
109 filterTerms patterns (y,d) = (y,termsInText patterns d)
110
111
112 -- | To transform a Csv nfile into a readable corpus
113 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
114 csvToCorpus limit csv = DV.toList
115 -- . DV.reverse
116 . DV.take limit
117 -- . DV.reverse
118 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
119 . snd <$> CSV.readFile csv
120
121
122 -- | To transform a Wos nfile into a readable corpus
123 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
124 wosToCorpus limit path = DL.take limit
125 . map (\d -> ((fromJust $_hd_publication_year d)
126 ,(fromJust $_hd_title d) <> " " <> (fromJust $_hd_abstract d)))
127 . filter (\d -> (isJust $_hd_publication_year d)
128 && (isJust $_hd_title d)
129 && (isJust $_hd_abstract d))
130 . concat
131 <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
132
133
134 -- | To use the correct parser given a CorpusType
135 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
136 fileToCorpus format limit path = case format of
137 Wos -> wosToCorpus limit path
138 Csv -> csvToCorpus limit path
139
140
141 -- | To parse a file into a list of Document
142 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
143 parse format limit path l = do
144 corpus <- fileToCorpus format limit path
145 let patterns = buildPatterns l
146 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
147
148
149 -- | To parse an existing Fis file
150 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
151 parseFis path name grain step support clique = do
152 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
153 if fisExists
154 then do
155 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
156 case fisJson of
157 P.Left err -> do
158 putStrLn err
159 pure []
160 P.Right fis -> pure fis
161 else pure []
162
163 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
164 writeFis path name grain step support clique fis = do
165 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
166 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
167
168 --------------
169 -- | Main | --
170 --------------
171
172
173 main :: IO ()
174 main = do
175
176 [jsonPath] <- getArgs
177
178 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
179
180 case confJson of
181 P.Left err -> putStrLn err
182 P.Right conf -> do
183
184 termList <- csvMapTermList (listPath conf)
185
186 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
187
188 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
189
190 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
191
192 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
193
194 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
195
196 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
197 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
198 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
199 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
200
201 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
202
203 let phylo = toPhylo query corpus termList fis'
204
205 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
206
207 let view = toPhyloView queryView phylo
208
209 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
210
211 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
212 <> "_" <> show (limit conf) <> "_"
213 <> "_" <> show (timeTh conf) <> "_"
214 <> "_" <> show (timeSens conf) <> "_"
215 <> "_" <> show (clusterTh conf) <> "_"
216 <> "_" <> show (clusterSens conf)
217 <> ".dot"
218
219 P.writeFile outputFile $ dotToString $ viewToDot view