]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
phylo from wos in progress
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import Data.Aeson
26 import Data.Text (Text, unwords)
27 import GHC.Generics
28 import GHC.IO (FilePath)
29 import Gargantext.Prelude
30 import Gargantext.Text.List.CSV (csvGraphTermList)
31 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
32 import Gargantext.Text.Parsers (FileFormat(..),parseDocs)
33 import Gargantext.Text.Terms.WithList
34 import Gargantext.Text.Context (TermList)
35
36 import System.Environment
37
38 import Gargantext.Viz.Phylo
39 import Gargantext.Viz.Phylo.Tools
40 import Gargantext.Viz.Phylo.LevelMaker
41 import Gargantext.Viz.Phylo.View.Export
42 import Gargantext.Viz.Phylo.View.ViewMaker
43
44 import qualified Data.Map as DM
45 import qualified Data.Vector as DV
46 import qualified Data.List as DL
47 import qualified Data.Text as DT
48 import qualified Prelude as P
49 import qualified Data.ByteString.Lazy as L
50
51
52 --------------
53 -- | Conf | --
54 --------------
55
56
57 type ListPath = FilePath
58 type CorpusPath = FilePath
59 data CorpusType = Wos | Csv deriving (Show,Generic)
60 type Limit = Int
61
62 data Conf =
63 Conf { corpusPath :: CorpusPath
64 , corpusType :: CorpusType
65 , listPath :: ListPath
66 , outputPath :: FilePath
67 , phyloName :: Text
68 , limit :: Limit
69 } deriving (Show,Generic)
70
71 instance FromJSON Conf
72 instance ToJSON Conf
73
74 instance FromJSON CorpusType
75 instance ToJSON CorpusType
76
77 -- | Get the conf from a Json file
78 getJson :: FilePath -> IO L.ByteString
79 getJson path = L.readFile path
80
81
82 ---------------
83 -- | Parse | --
84 ---------------
85
86
87 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
88 filterTerms patterns (year', doc) = (year',termsInText patterns doc)
89 where
90 termsInText :: Patterns -> Text -> [Text]
91 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
92
93
94 csvToCorpus :: Int -> CorpusPath -> IO ([(Int,Text)])
95 csvToCorpus limit csv = DV.toList
96 . DV.take limit
97 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
98 . snd <$> readCsv csv
99
100
101 wosToCorpus :: Int -> CorpusPath -> IO ([(Int,Text)])
102 wosToCorpus limit path = undefined
103
104
105 fileToCorpus :: CorpusType -> Int -> CorpusPath -> IO ([(Int,Text)])
106 fileToCorpus format limit path = case format of
107 Wos -> wosToCorpus limit path
108 Csv -> csvToCorpus limit path
109
110
111 parse :: Limit -> CorpusPath -> TermList -> IO [Document]
112 parse limit corpus lst = do
113 corpus' <- csvToCorpus limit corpus
114 let patterns = buildPatterns lst
115 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus'
116
117
118 --------------
119 -- | Main | --
120 --------------
121
122
123 main :: IO ()
124 main = do
125
126 putStrLn $ show "--| Read the conf |--"
127
128 [jsonPath] <- getArgs
129
130 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
131
132 case confJson of
133 P.Left err -> putStrLn err
134 P.Right conf -> do
135
136 putStrLn $ show "--| Parse the corpus |--"
137
138 termList <- csvGraphTermList (listPath conf)
139
140 corpus <- parse (limit conf) (corpusPath conf) termList
141
142 let roots = DL.nub $ DL.concat $ map text corpus
143
144 putStrLn $ show "--| Build the phylo |--"
145
146 let query = PhyloQueryBuild (phyloName conf) "" 5 3 defaultFis [] [] (WeightedLogJaccard $ WLJParams 0.00001 10) 2 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.5 10)
147
148 let queryView = PhyloQueryView 2 Merge False 1 [BranchAge] [defaultSmallBranch] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
149
150 let phylo = toPhylo query corpus roots termList
151
152 let view = toPhyloView queryView phylo
153
154 putStrLn $ show "--| Export the phylo as a dot graph |--"
155
156 let outputFile = (outputPath conf) P.++ (DT.unpack $ phyloName conf) P.++ ".dot"
157
158 P.writeFile outputFile $ dotToString $ viewToDot view