]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
add trace to cluster and optimisation to find candidates
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import Data.Aeson
26 import Data.Text (Text, unwords)
27 import GHC.Generics
28 import GHC.IO (FilePath)
29 import Gargantext.Prelude
30 import Gargantext.Text.List.CSV (csvGraphTermList)
31 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
32 import Gargantext.Text.Parsers (FileFormat(..),parseDocs)
33 import Gargantext.Text.Terms.WithList
34 import Gargantext.Text.Context (TermList)
35
36 import System.Environment
37
38 import Gargantext.Viz.Phylo
39 import Gargantext.Viz.Phylo.Tools
40 import Gargantext.Viz.Phylo.LevelMaker
41 import Gargantext.Viz.Phylo.View.Export
42 import Gargantext.Viz.Phylo.View.ViewMaker
43
44
45 import Gargantext.Database.Types.Node
46
47 import Data.Maybe
48
49
50 import qualified Data.Map as DM
51 import qualified Data.Vector as DV
52 import qualified Data.List as DL
53 import qualified Data.Text as DT
54 import qualified Prelude as P
55 import qualified Data.ByteString.Lazy as L
56
57
58 --------------
59 -- | Conf | --
60 --------------
61
62
63 type ListPath = FilePath
64 type CorpusPath = FilePath
65 data CorpusType = Wos | Csv deriving (Show,Generic)
66 type Limit = Int
67
68 data Conf =
69 Conf { corpusPath :: CorpusPath
70 , corpusType :: CorpusType
71 , listPath :: ListPath
72 , outputPath :: FilePath
73 , phyloName :: Text
74 , limit :: Limit
75 , timeGrain :: Int
76 , timeStep :: Int
77 , timeTh :: Double
78 , timeSens :: Double
79 , clusterTh :: Double
80 , clusterSens :: Double
81 , phyloLevel :: Int
82 , viewLevel :: Int
83 , fisSupport :: Int
84 , fisClique :: Int
85 } deriving (Show,Generic)
86
87 instance FromJSON Conf
88 instance ToJSON Conf
89
90 instance FromJSON CorpusType
91 instance ToJSON CorpusType
92
93 -- | Get the conf from a Json file
94 getJson :: FilePath -> IO L.ByteString
95 getJson path = L.readFile path
96
97
98 ---------------
99 -- | Parse | --
100 ---------------
101
102
103 -- | To filter the Ngrams of a document based on the termList
104 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
105 filterTerms patterns (y,d) = (y,termsInText patterns d)
106 where
107 --------------------------------------
108 termsInText :: Patterns -> Text -> [Text]
109 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
110 --------------------------------------
111
112
113 -- | To transform a Csv nfile into a readable corpus
114 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
115 csvToCorpus limit csv = DV.toList
116 . DV.take limit
117 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
118 . snd <$> readCsv csv
119
120
121 -- | To transform a Wos nfile into a readable corpus
122 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
123 wosToCorpus limit path = DL.take limit
124 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
125 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
126 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
127 && (isJust $_hyperdataDocument_title d)
128 && (isJust $_hyperdataDocument_abstract d))
129 <$> parseDocs WOS path
130
131
132 -- | To use the correct parser given a CorpusType
133 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
134 fileToCorpus format limit path = case format of
135 Wos -> wosToCorpus limit path
136 Csv -> csvToCorpus limit path
137
138
139 -- | To parse a file into a list of Document
140 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
141 parse format limit path l = do
142 corpus <- fileToCorpus format limit path
143 let patterns = buildPatterns l
144 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
145
146
147 --------------
148 -- | Main | --
149 --------------
150
151
152 main :: IO ()
153 main = do
154
155 [jsonPath] <- getArgs
156
157 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
158
159 case confJson of
160 P.Left err -> putStrLn err
161 P.Right conf -> do
162
163 termList <- csvGraphTermList (listPath conf)
164
165 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
166
167 let roots = DL.nub $ DL.concat $ map text corpus
168
169 putStrLn $ (show (length corpus) <> " parsed docs")
170
171 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
172 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (phyloLevel conf)
173 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
174
175 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge] [defaultSmallBranch] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
176
177 let phylo = toPhylo query corpus roots termList
178
179 let view = toPhyloView queryView phylo
180
181 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
182
183 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
184 <> "_" <> show (limit conf) <> "_"
185 <> "_" <> show (timeTh conf) <> "_"
186 <> "_" <> show (timeSens conf) <> "_"
187 <> "_" <> show (clusterTh conf) <> "_"
188 <> "_" <> show (clusterSens conf)
189 <> ".dot"
190
191 P.writeFile outputFile $ dotToString $ viewToDot view