]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
[DB/FACT] Schema NodeNodeNgrams -> Query (with warnings)
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import Control.Concurrent.Async as CCA (mapConcurrently)
26 import Control.Monad (mapM)
27 import Data.Aeson
28 import Data.List ((++),concat)
29 import Data.Maybe
30 import Data.Text (Text, unwords, unlines)
31 import GHC.Generics
32 import GHC.IO (FilePath)
33 import Gargantext.Database.Admin.Types.Node
34 import Gargantext.Prelude
35 import Gargantext.Text.Context (TermList)
36 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
37 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
38 import Gargantext.Text.List.CSV (csvGraphTermList)
39 import Gargantext.Text.Terms.WithList
40 import Gargantext.Viz.Phylo
41 import Gargantext.Viz.Phylo.LevelMaker
42 import Gargantext.Viz.Phylo.Tools
43 import Gargantext.Viz.Phylo.View.Export
44 import Gargantext.Viz.Phylo.View.ViewMaker
45 import System.Directory (doesFileExist)
46 import System.Environment
47 import qualified Data.ByteString.Lazy as L
48 import qualified Data.List as DL
49 import qualified Data.Map as DM
50 import qualified Data.Text as DT
51 import qualified Data.Vector as DV
52 import qualified Gargantext.Text.Corpus.Parsers.CSV as CSV
53 import qualified Prelude as P
54
55
56 --------------
57 -- | Conf | --
58 --------------
59
60
61 type ListPath = FilePath
62 type FisPath = FilePath
63 type CorpusPath = FilePath
64 data CorpusType = Wos | Csv deriving (Show,Generic)
65 type Limit = Int
66
67 data Conf =
68 Conf { corpusPath :: CorpusPath
69 , corpusType :: CorpusType
70 , listPath :: ListPath
71 , fisPath :: FilePath
72 , outputPath :: FilePath
73 , phyloName :: Text
74 , limit :: Limit
75 , timeGrain :: Int
76 , timeStep :: Int
77 , timeFrame :: Int
78 , timeFrameTh :: Double
79 , timeTh :: Double
80 , timeSens :: Double
81 , reBranchThr :: Double
82 , reBranchNth :: Int
83 , clusterTh :: Double
84 , clusterSens :: Double
85 , phyloLevel :: Int
86 , viewLevel :: Int
87 , fisSupport :: Int
88 , fisClique :: Int
89 , minSizeBranch :: Int
90 } deriving (Show,Generic)
91
92 instance FromJSON Conf
93 instance ToJSON Conf
94
95 instance FromJSON CorpusType
96 instance ToJSON CorpusType
97
98
99 decoder :: P.Either a b -> b
100 decoder (P.Left _) = P.error "Error"
101 decoder (P.Right x) = x
102
103 -- | Get the conf from a Json file
104 getJson :: FilePath -> IO L.ByteString
105 getJson path = L.readFile path
106
107
108 ---------------
109 -- | Parse | --
110 ---------------
111
112
113 -- | To filter the Ngrams of a document based on the termList
114 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
115 filterTerms patterns (y,d) = (y,termsInText patterns d)
116 where
117 --------------------------------------
118 termsInText :: Patterns -> Text -> [Text]
119 termsInText pats txt = DL.nub
120 $ DL.concat
121 $ map (map unwords)
122 $ extractTermsWithList pats txt
123 --------------------------------------
124
125
126 -- | To transform a Csv nfile into a readable corpus
127 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
128 csvToCorpus limit csv = DV.toList
129 -- . DV.reverse
130 . DV.take limit
131 -- . DV.reverse
132 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
133 . snd <$> CSV.readFile csv
134
135
136 -- | To transform a Wos nfile into a readable corpus
137 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
138 wosToCorpus limit path = DL.take limit
139 . map (\d -> ((fromJust $_hyperdataDocument_publication_year d)
140 ,(fromJust $_hyperdataDocument_title d) <> " " <> (fromJust $_hyperdataDocument_abstract d)))
141 . filter (\d -> (isJust $_hyperdataDocument_publication_year d)
142 && (isJust $_hyperdataDocument_title d)
143 && (isJust $_hyperdataDocument_abstract d))
144 . concat
145 <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
146
147
148 -- | To use the correct parser given a CorpusType
149 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
150 fileToCorpus format limit path = case format of
151 Wos -> wosToCorpus limit path
152 Csv -> csvToCorpus limit path
153
154
155 -- | To parse a file into a list of Document
156 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
157 parse format limit path l = do
158 corpus <- fileToCorpus format limit path
159 let patterns = buildPatterns l
160 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
161
162
163 -- | To parse an existing Fis file
164 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
165 parseFis path name grain step support clique = do
166 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
167 if fisExists
168 then do
169 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
170 case fisJson of
171 P.Left err -> do
172 putStrLn err
173 pure []
174 P.Right fis -> pure fis
175 else pure []
176
177 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
178 writeFis path name grain step support clique fis = do
179 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
180 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
181
182 --------------
183 -- | Main | --
184 --------------
185
186
187 main :: IO ()
188 main = do
189
190 [jsonPath] <- getArgs
191
192 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
193
194 case confJson of
195 P.Left err -> putStrLn err
196 P.Right conf -> do
197
198 termList <- csvGraphTermList (listPath conf)
199
200 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
201
202 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
203
204 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
205
206 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
207
208 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
209
210 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
211 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
212 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
213 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
214
215 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
216
217 let phylo = toPhylo query corpus termList fis'
218
219 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
220
221 let view = toPhyloView queryView phylo
222
223 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
224
225 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
226 <> "_" <> show (limit conf) <> "_"
227 <> "_" <> show (timeTh conf) <> "_"
228 <> "_" <> show (timeSens conf) <> "_"
229 <> "_" <> show (clusterTh conf) <> "_"
230 <> "_" <> show (clusterSens conf)
231 <> ".dot"
232
233 P.writeFile outputFile $ dotToString $ viewToDot view