]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
remove bad tracers
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import Data.Aeson
26 import Data.Text (Text, unwords)
27 import GHC.Generics
28 import GHC.IO (FilePath)
29 import Gargantext.Prelude
30 import Gargantext.Text.List.CSV (csvGraphTermList)
31 import Gargantext.Text.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
32 import qualified Gargantext.Text.Parsers.CSV as CSV
33 import Gargantext.Text.Parsers (FileFormat(..),parseFile)
34 import Gargantext.Text.Terms.WithList
35 import Gargantext.Text.Context (TermList)
36
37 import System.Environment
38
39 import Gargantext.Viz.Phylo
40 import Gargantext.Viz.Phylo.Tools
41 import Gargantext.Viz.Phylo.LevelMaker
42 import Gargantext.Viz.Phylo.View.Export
43 import Gargantext.Viz.Phylo.View.ViewMaker
44
45
46 import Gargantext.Database.Types.Node
47
48 import Data.Maybe
49
50
51 import qualified Data.Map as DM
52 import qualified Data.Vector as DV
53 import qualified Data.List as DL
54 import qualified Data.Text as DT
55 import qualified Prelude as P
56 import qualified Data.ByteString.Lazy as L
57
58
59 --------------
60 -- | Conf | --
61 --------------
62
63
64 type ListPath = FilePath
65 type CorpusPath = FilePath
66 data CorpusType = Wos | Csv deriving (Show,Generic)
67 type Limit = Int
68
69 data Conf =
70 Conf { corpusPath :: CorpusPath
71 , corpusType :: CorpusType
72 , listPath :: ListPath
73 , outputPath :: FilePath
74 , phyloName :: Text
75 , limit :: Limit
76 , timeGrain :: Int
77 , timeStep :: Int
78 , timeTh :: Double
79 , timeSens :: Double
80 , clusterTh :: Double
81 , clusterSens :: Double
82 , phyloLevel :: Int
83 , viewLevel :: Int
84 , fisSupport :: Int
85 , fisClique :: Int
86 , minSizeBranch :: Int
87 } deriving (Show,Generic)
88
89 instance FromJSON Conf
90 instance ToJSON Conf
91
92 instance FromJSON CorpusType
93 instance ToJSON CorpusType
94
95 -- | Get the conf from a Json file
96 getJson :: FilePath -> IO L.ByteString
97 getJson path = L.readFile path
98
99
100 ---------------
101 -- | Parse | --
102 ---------------
103
104
105 -- | To filter the Ngrams of a document based on the termList
106 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
107 filterTerms patterns (y,d) = (y,termsInText patterns d)
108 where
109 --------------------------------------
110 termsInText :: Patterns -> Text -> [Text]
111 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
112 --------------------------------------
113
114
115 -- | To transform a Csv nfile into a readable corpus
116 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
117 csvToCorpus limit csv = DV.toList
118 -- . DV.reverse
119 . DV.take limit
120 -- . DV.reverse
121 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
122 . snd <$> CSV.readFile csv
123
124
125 -- | To transform a Wos nfile into a readable corpus
126 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
127 wosToCorpus limit path = DL.take limit
128 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
129 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
130 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
131 && (isJust $_hyperdataDocument_title d)
132 && (isJust $_hyperdataDocument_abstract d))
133 <$> parseFile WOS path
134
135
136 -- | To use the correct parser given a CorpusType
137 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
138 fileToCorpus format limit path = case format of
139 Wos -> wosToCorpus limit path
140 Csv -> csvToCorpus limit path
141
142
143 -- | To parse a file into a list of Document
144 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
145 parse format limit path l = do
146 corpus <- fileToCorpus format limit path
147 let patterns = buildPatterns l
148 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
149
150
151 --------------
152 -- | Main | --
153 --------------
154
155
156 main :: IO ()
157 main = do
158
159 [jsonPath] <- getArgs
160
161 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
162
163 case confJson of
164 P.Left err -> putStrLn err
165 P.Right conf -> do
166
167 termList <- csvGraphTermList (listPath conf)
168
169 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
170
171 let roots = DL.nub $ DL.concat $ map text corpus
172
173 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
174
175 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
176 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (phyloLevel conf)
177 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
178
179 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge] [SizeBranch $ SBParams (minSizeBranch conf)] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
180
181 let phylo = toPhylo query corpus roots termList
182
183 let view = toPhyloView queryView phylo
184
185 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
186
187 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
188 <> "_" <> show (limit conf) <> "_"
189 <> "_" <> show (timeTh conf) <> "_"
190 <> "_" <> show (timeSens conf) <> "_"
191 <> "_" <> show (clusterTh conf) <> "_"
192 <> "_" <> show (clusterSens conf)
193 <> ".dot"
194
195 P.writeFile outputFile $ dotToString $ viewToDot view