]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
[ngrams] add score update endpoint + sorting
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE StandaloneDeriving #-}
15 {-# LANGUAGE TypeOperators #-}
16 {-# LANGUAGE Strict #-}
17
18 module Main where
19
20 import Control.Concurrent.Async as CCA (mapConcurrently)
21 import Control.Monad (mapM)
22 import Data.Aeson
23 import Data.List ((++),concat)
24 import Data.Maybe
25 import Data.Text (Text, unwords, unlines)
26 import GHC.Generics
27 import GHC.IO (FilePath)
28 import Gargantext.Database.Admin.Types.Hyperdata
29 import Gargantext.Database.Admin.Types.Node
30 import Gargantext.Prelude
31 import Gargantext.Core.Text.Context (TermList)
32 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
33 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
34 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
35 import Gargantext.Core.Text.Terms.WithList
36 import Gargantext.Core.Viz.Phylo
37 import Gargantext.Core.Viz.Phylo.LevelMaker
38 import Gargantext.Core.Viz.Phylo.Tools
39 import Gargantext.Core.Viz.Phylo.View.Export
40 import Gargantext.Core.Viz.Phylo.View.ViewMaker
41 import System.Directory (doesFileExist)
42 import System.Environment
43 import qualified Data.ByteString.Lazy as L
44 import qualified Data.List as DL
45 import qualified Data.Map as DM
46 import qualified Data.Text as DT
47 import qualified Data.Vector as DV
48 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
49 import qualified Prelude as P
50
51
52 --------------
53 -- | Conf | --
54 --------------
55
56
57 type ListPath = FilePath
58 type FisPath = FilePath
59 type CorpusPath = FilePath
60 data CorpusType = Wos | Csv deriving (Show,Generic)
61 type Limit = Int
62
63 data Conf =
64 Conf { corpusPath :: CorpusPath
65 , corpusType :: CorpusType
66 , listPath :: ListPath
67 , fisPath :: FilePath
68 , outputPath :: FilePath
69 , phyloName :: Text
70 , limit :: Limit
71 , timeGrain :: Int
72 , timeStep :: Int
73 , timeFrame :: Int
74 , timeFrameTh :: Double
75 , timeTh :: Double
76 , timeSens :: Double
77 , reBranchThr :: Double
78 , reBranchNth :: Int
79 , clusterTh :: Double
80 , clusterSens :: Double
81 , phyloLevel :: Int
82 , viewLevel :: Int
83 , fisSupport :: Int
84 , fisClique :: Int
85 , minSizeBranch :: Int
86 } deriving (Show,Generic)
87
88 instance FromJSON Conf
89 instance ToJSON Conf
90
91 instance FromJSON CorpusType
92 instance ToJSON CorpusType
93
94
95 decoder :: P.Either a b -> b
96 decoder (P.Left _) = P.error "Error"
97 decoder (P.Right x) = x
98
99 -- | Get the conf from a Json file
100 getJson :: FilePath -> IO L.ByteString
101 getJson path = L.readFile path
102
103
104 ---------------
105 -- | Parse | --
106 ---------------
107
108
109 -- | To filter the Ngrams of a document based on the termList
110 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
111 filterTerms patterns (y,d) = (y,termsInText patterns d)
112 where
113 --------------------------------------
114 termsInText :: Patterns -> Text -> [Text]
115 termsInText pats txt = DL.nub
116 $ DL.concat
117 $ map (map unwords)
118 $ extractTermsWithList pats txt
119 --------------------------------------
120
121
122 -- | To transform a Csv nfile into a readable corpus
123 csvToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
124 csvToCorpus limit csv = DV.toList
125 -- . DV.reverse
126 . DV.take limit
127 -- . DV.reverse
128 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
129 . snd <$> CSV.readFile csv
130
131
132 -- | To transform a Wos nfile into a readable corpus
133 wosToCorpus :: Limit -> CorpusPath -> IO ([(Int,Text)])
134 wosToCorpus limit path = DL.take limit
135 . map (\d -> ((fromJust $_hd_publication_year d)
136 ,(fromJust $_hd_title d) <> " " <> (fromJust $_hd_abstract d)))
137 . filter (\d -> (isJust $_hd_publication_year d)
138 && (isJust $_hd_title d)
139 && (isJust $_hd_abstract d))
140 . concat
141 <$> mapConcurrently (\idx -> parseFile WOS (path <> show(idx) <> ".txt")) [1..20]
142
143
144 -- | To use the correct parser given a CorpusType
145 fileToCorpus :: CorpusType -> Limit -> CorpusPath -> IO ([(Int,Text)])
146 fileToCorpus format limit path = case format of
147 Wos -> wosToCorpus limit path
148 Csv -> csvToCorpus limit path
149
150
151 -- | To parse a file into a list of Document
152 parse :: CorpusType -> Limit -> CorpusPath -> TermList -> IO [Document]
153 parse format limit path l = do
154 corpus <- fileToCorpus format limit path
155 let patterns = buildPatterns l
156 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
157
158
159 -- | To parse an existing Fis file
160 parseFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> IO [PhyloFis]
161 parseFis path name grain step support clique = do
162 fisExists <- doesFileExist (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")
163 if fisExists
164 then do
165 fisJson <- (eitherDecode <$> getJson (path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json")) :: IO (P.Either P.String [PhyloFis])
166 case fisJson of
167 P.Left err -> do
168 putStrLn err
169 pure []
170 P.Right fis -> pure fis
171 else pure []
172
173 writeFis :: FisPath -> Text -> Int -> Int -> Int -> Int -> DM.Map (Date,Date) [PhyloFis] -> IO ()
174 writeFis path name grain step support clique fis = do
175 let fisPath = path <> (DT.unpack name) <> "_" <> show(grain) <> "_" <> show(step) <> "_" <> show(support) <> "_" <> show(clique) <> ".json"
176 L.writeFile fisPath $ encode (DL.concat $ DM.elems fis)
177
178 --------------
179 -- | Main | --
180 --------------
181
182
183 main :: IO ()
184 main = do
185
186 [jsonPath] <- getArgs
187
188 confJson <- (eitherDecode <$> getJson jsonPath) :: IO (P.Either P.String Conf)
189
190 case confJson of
191 P.Left err -> putStrLn err
192 P.Right conf -> do
193
194 termList <- csvMapTermList (listPath conf)
195
196 corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
197
198 putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
199
200 fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
201
202 putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
203
204 let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
205
206 let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
207 (Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
208 (reBranchThr conf) (reBranchNth conf) (phyloLevel conf)
209 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams (clusterTh conf) (clusterSens conf))
210
211 let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge,BranchBirth,BranchGroups] [SizeBranch $ SBParams (minSizeBranch conf)] [GroupLabelIncDyn,BranchPeakInc] (Just (ByBranchBirth,Asc)) Json Flat True
212
213 let phylo = toPhylo query corpus termList fis'
214
215 writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
216
217 let view = toPhyloView queryView phylo
218
219 putStrLn $ ("phylo completed until level " <> show (phyloLevel conf) <> ", export at level " <> show (viewLevel conf))
220
221 let outputFile = (outputPath conf) <> (DT.unpack $ phyloName conf)
222 <> "_" <> show (limit conf) <> "_"
223 <> "_" <> show (timeTh conf) <> "_"
224 <> "_" <> show (timeSens conf) <> "_"
225 <> "_" <> show (clusterTh conf) <> "_"
226 <> "_" <> show (clusterSens conf)
227 <> ".dot"
228
229 P.writeFile outputFile $ dotToString $ viewToDot view