]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
test
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import Data.Aeson
26 import Data.Text (Text, unwords)
27 import GHC.Generics
28 import GHC.IO (FilePath)
29 import Gargantext.Prelude
30 import Gargantext.Text.List.CSV (csvGraphTermList)
31 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
32 import Gargantext.Text.Parsers (FileFormat(..),parseDocs)
33 import Gargantext.Text.Terms.WithList
34 import Gargantext.Text.Context (TermList)
35
36 import System.Environment
37
38 import Gargantext.Viz.Phylo
39 import Gargantext.Viz.Phylo.Tools
40 import Gargantext.Viz.Phylo.LevelMaker
41 import Gargantext.Viz.Phylo.View.Export
42 import Gargantext.Viz.Phylo.View.ViewMaker
43
44
45 import Gargantext.Database.Types.Node
46
47 import Data.Maybe
48
49
50 import qualified Data.Map as DM
51 import qualified Data.Vector as DV
52 import qualified Data.List as DL
53 import qualified Data.Text as DT
54 import qualified Prelude as P
55 import qualified Data.ByteString.Lazy as L
56
57
58 --------------
59 -- | Conf | --
60 --------------
61
62
63 type ListPath = FilePath
64 type CorpusPath = FilePath
65 data CorpusType = Wos | Csv deriving (Show,Generic)
66 type Limit = Int
67
68 data Conf =
69 Conf { corpusPath :: CorpusPath
70 , corpusType :: CorpusType
71 , listPath :: ListPath
72 , outputPath :: FilePath
73 , phyloName :: Text
74 , limit :: Limit
75 , timeGrain :: Int
76 , timeStep :: Int
77 , timeTh :: Double
78 , timeSens :: Double
79 , clusterTh :: Double
80 , clusterSens :: Double
81 , phyloLevel :: Int
82 , viewLevel :: Int
83 , fisSupport :: Int
84 , fisClique :: Int
85 , minSizeBranch :: Int
86 } deriving (Show,Generic)
87
88 instance FromJSON Conf
89 instance ToJSON Conf
90
91 instance FromJSON CorpusType
92 instance ToJSON CorpusType
93
94 -- | Get the conf from a Json file
95 getJson :: FilePath -> IO L.ByteString
96 getJson path = L.readFile path
97
98
99 ---------------
100 -- | Parse | --
101 ---------------
102
103
104 -- | To filter the Ngrams of a document based on the termList
105 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
106 filterTerms patterns (y,d) = (y,termsInText patterns d)
107 where
108 --------------------------------------
109 termsInText :: Patterns -> Text -> [Text]
110 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
111 --------------------------------------
112
113
114 -- | To transform a Csv nfile into a readable corpus
115 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
116 csvToCorpus limit csv = DV.toList
117 . DV.take limit
118 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
119 . snd <$> readCsv csv
120
121
122 -- | To transform a Wos nfile into a readable corpus
123 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
124 wosToCorpus limit path = DL.take limit
125 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
126 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
127 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
128 && (isJust $_hyperdataDocument_title d)
129 && (isJust $_hyperdataDocument_abstract d))
130 <$> parseDocs WOS path
131
132
133 -- | To use the correct parser given a CorpusType
134 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
135 fileToCorpus format limit path = case format of
136 Wos -> wosToCorpus limit path
137 Csv -> csvToCorpus limit path
138
139
140 -- | To parse a file into a list of Document
141 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
142 parse format limit path l = do
143 corpus <- fileToCorpus format limit path
144 let patterns = buildPatterns l
145 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
146
147
148 --------------
149 -- | Main | --
150 --------------
151
152
153 main :: IO ()
154 main = do
155
156 [jsonPath] <- getArgs
157
158 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
159
160 case confJson of
161 P.Left err -> putStrLn err
162 P.Right conf -> do
163
164 termList <- csvGraphTermList (listPath conf)
165
166 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
167
168 let roots = DL.nub $ DL.concat $ map text corpus
169
170 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
171
172 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
173 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (phyloLevel conf)
174 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
175
176 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge] [SizeBranch $ SBParams (minSizeBranch conf)] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
177
178 let phylo = toPhylo query corpus roots termList
179
180 let view = toPhyloView queryView phylo
181
182 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
183
184 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
185 <> "_" <> show (limit conf) <> "_"
186 <> "_" <> show (timeTh conf) <> "_"
187 <> "_" <> show (timeSens conf) <> "_"
188 <> "_" <> show (clusterTh conf) <> "_"
189 <> "_" <> show (clusterSens conf)
190 <> ".dot"
191
192 P.writeFile outputFile $ dotToString $ viewToDot view