]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
[FIX] exposing right Phylo Module into Gargantext lib (to compile binaries).
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import Data.Aeson
26 import Data.Text (Text, unwords)
27 import GHC.Generics
28 import GHC.IO (FilePath)
29 import Gargantext.Prelude
30 import Gargantext.Text.List.CSV (csvGraphTermList)
31 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
32 import Gargantext.Text.Terms.WithList
33 import System.Environment
34
35 import Gargantext.Viz.Phylo
36 import Gargantext.Viz.Phylo.Tools
37 import Gargantext.Viz.Phylo.LevelMaker
38 import Gargantext.Viz.Phylo.View.Export
39 import Gargantext.Viz.Phylo.View.ViewMaker
40
41 import qualified Data.Map as DM
42 import qualified Data.Vector as DV
43 import qualified Data.List as DL
44 import qualified Data.Text as DT
45 import qualified Prelude as P
46 import qualified Data.ByteString.Lazy as L
47
48 ------------------------------------------------------------------------
49 -- Format to produce the Phylo
50 data TextsByYear =
51 TextsByYear { year :: Int
52 , texts :: [[Text]]
53 } deriving (Show, Generic)
54
55 instance ToJSON TextsByYear
56
57 instance ToJSON Document
58 ------------------------------------------------------------------------
59
60 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
61 filterTerms patterns (year', doc) = (year',termsInText patterns doc)
62 where
63 termsInText :: Patterns -> Text -> [Text]
64 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
65
66
67 -- csvToCorpus :: Int -> FilePath -> IO (DM.Map Int [Text])
68 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
69 csvToCorpus limit csv = DV.toList
70 -- DM.fromListWith (<>)
71 . DV.take limit
72 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
73 . snd <$> readCsv csv
74
75 type ListPath = FilePath
76 type CorpusPath = FilePath
77 type Limit = Int
78
79 parse :: Limit -> CorpusPath -> ListPath -> IO [Document]
80 parse limit corpus liste = do
81 corpus' <- csvToCorpus limit corpus
82 liste' <- csvGraphTermList liste
83 let patterns = buildPatterns liste'
84 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus'
85
86
87 main :: IO ()
88 main = do
89
90 -- [corpusFile, termListFile, outputFile] <- getArgs
91
92 let corpusPath = "/home/qlobbe/data/epique/corpus/cultural_evolution/texts/fullCorpus.csv"
93 let termListPath = "/home/qlobbe/data/epique/corpus/cultural_evolution/termList.csv"
94 let outputPath = "/home/qlobbe/data/epique/output/cultural_evolution.dot"
95
96 let query = PhyloQueryBuild "cultural_evolution" "Test" 5 3 defaultFis [] [] defaultWeightedLogJaccard 3 defaultRelatedComponents
97 let queryView = PhyloQueryView 2 Merge False 1 [BranchAge] [defaultSmallBranch] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
98
99 corpus <- parse 5000 corpusPath termListPath
100
101 let foundations = DL.nub $ DL.concat $ map text corpus
102
103 -- putStrLn $ show $ csvGraphTermList termListPath
104
105 let phylo = toPhylo query corpus foundations []
106
107 let view = toPhyloView queryView phylo
108
109 -- TODO Phylo here
110 P.writeFile outputPath $ dotToString $ viewToDot view
111 -- L.writeFile outputPath $ encode corpus
112
113