]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
Merge branch 'dev-phylo' of https://gitlab.iscpif.fr/gargantext/haskell-gargantext...
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import Data.Aeson
26 import Data.Text (Text, unwords)
27 import GHC.Generics
28 import GHC.IO (FilePath)
29 import Gargantext.Prelude
30 import Gargantext.Text.List.CSV (csvGraphTermList)
31 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
32 import Gargantext.Text.Parsers (FileFormat(..),parseDocs)
33 import Gargantext.Text.Terms.WithList
34 import Gargantext.Text.Context (TermList)
35
36 import System.Environment
37
38 import Gargantext.Viz.Phylo
39 import Gargantext.Viz.Phylo.Tools
40 import Gargantext.Viz.Phylo.LevelMaker
41 import Gargantext.Viz.Phylo.View.Export
42 import Gargantext.Viz.Phylo.View.ViewMaker
43
44
45 import Gargantext.Database.Types.Node
46
47 import Data.Maybe
48
49
50 import qualified Data.Map as DM
51 import qualified Data.Vector as DV
52 import qualified Data.List as DL
53 import qualified Data.Text as DT
54 import qualified Prelude as P
55 import qualified Data.ByteString.Lazy as L
56
57
58 --------------
59 -- | Conf | --
60 --------------
61
62
63 type ListPath = FilePath
64 type CorpusPath = FilePath
65 data CorpusType = Wos | Csv deriving (Show,Generic)
66 type Limit = Int
67
68 data Conf =
69 Conf { corpusPath :: CorpusPath
70 , corpusType :: CorpusType
71 , listPath :: ListPath
72 , outputPath :: FilePath
73 , phyloName :: Text
74 , limit :: Limit
75 , timeGrain :: Int
76 , timeStep :: Int
77 , timeTh :: Double
78 , timeSens :: Double
79 , clusterTh :: Double
80 , clusterSens :: Double
81 } deriving (Show,Generic)
82
83 instance FromJSON Conf
84 instance ToJSON Conf
85
86 instance FromJSON CorpusType
87 instance ToJSON CorpusType
88
89 -- | Get the conf from a Json file
90 getJson :: FilePath -> IO L.ByteString
91 getJson path = L.readFile path
92
93
94 ---------------
95 -- | Parse | --
96 ---------------
97
98
99 -- | To filter the Ngrams of a document based on the termList
100 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
101 filterTerms patterns (y,d) = (y,termsInText patterns d)
102 where
103 --------------------------------------
104 termsInText :: Patterns -> Text -> [Text]
105 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
106 --------------------------------------
107
108
109 -- | To transform a Csv nfile into a readable corpus
110 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
111 csvToCorpus limit csv = DV.toList
112 . DV.take limit
113 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
114 . snd <$> readCsv csv
115
116
117 -- | To transform a Wos nfile into a readable corpus
118 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
119 wosToCorpus limit path = DL.take limit
120 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
121 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
122 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
123 && (isJust $_hyperdataDocument_title d)
124 && (isJust $_hyperdataDocument_abstract d))
125 <$> parseDocs WOS path
126
127
128 -- | To use the correct parser given a CorpusType
129 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
130 fileToCorpus format limit path = case format of
131 Wos -> wosToCorpus limit path
132 Csv -> csvToCorpus limit path
133
134
135 -- | To parse a file into a list of Document
136 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
137 parse format limit path l = do
138 corpus <- fileToCorpus format limit path
139 let patterns = buildPatterns l
140 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
141
142
143 --------------
144 -- | Main | --
145 --------------
146
147
148 main :: IO ()
149 main = do
150
151 putStrLn $ show ("--| Read the conf |--")
152
153 [jsonPath] <- getArgs
154
155 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
156
157 case confJson of
158 P.Left err -> putStrLn err
159 P.Right conf -> do
160
161 putStrLn $ show ("--| Parse the corpus |--")
162
163 termList <- csvGraphTermList (listPath conf)
164
165 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
166
167 let roots = DL.nub $ DL.concat $ map text corpus
168
169 putStrLn $ ("-- | parsed docs : " <> show (length corpus) <> " |--")
170
171 putStrLn $ show ("--| Build the phylo |--")
172
173 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
174 defaultFis [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) 2
175 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
176
177 let queryView = PhyloQueryView 2 Merge False 1 [BranchAge] [defaultSmallBranch] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
178
179 let phylo = toPhylo query corpus roots termList
180
181 let view = toPhyloView queryView phylo
182
183 putStrLn $ show ("--| Export the phylo as a dot graph |--")
184
185 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
186 <> "_" <> show (limit conf) <> "_"
187 <> "_" <> show (timeTh conf) <> "_"
188 <> "_" <> show (timeSens conf) <> "_"
189 <> "_" <> show (clusterTh conf) <> "_"
190 <> "_" <> show (clusterSens conf)
191 <> ".dot"
192
193 P.writeFile outputFile $ dotToString $ viewToDot view