]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-phylo/Main.hs
tune the threshold and sensibility of the WeightedLogJaccard
[gargantext.git] / bin / gargantext-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Phylo binaries
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import Data.Aeson
26 import Data.Text (Text, unwords)
27 import GHC.Generics
28 import GHC.IO (FilePath)
29 import Gargantext.Prelude
30 import Gargantext.Text.List.CSV (csvGraphTermList)
31 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
32 import Gargantext.Text.Terms.WithList
33
34 import System.Environment
35
36 import Gargantext.Viz.Phylo
37 import Gargantext.Viz.Phylo.Tools
38 import Gargantext.Viz.Phylo.LevelMaker
39 import Gargantext.Viz.Phylo.View.Export
40 import Gargantext.Viz.Phylo.View.ViewMaker
41
42 import qualified Data.Map as DM
43 import qualified Data.Vector as DV
44 import qualified Data.List as DL
45 import qualified Data.Text as DT
46 import qualified Prelude as P
47 import qualified Data.ByteString.Lazy as L
48
49 ------------------------------------------------------------------------
50 -- Format to produce the Phylo
51 data TextsByYear =
52 TextsByYear { year :: Int
53 , texts :: [[Text]]
54 } deriving (Show, Generic)
55
56 instance ToJSON TextsByYear
57
58 instance ToJSON Document
59 ------------------------------------------------------------------------
60
61 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
62 filterTerms patterns (year', doc) = (year',termsInText patterns doc)
63 where
64 termsInText :: Patterns -> Text -> [Text]
65 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
66
67
68 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
69 csvToCorpus limit csv = DV.toList
70 . DV.take limit
71 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
72 . snd <$> readCsv csv
73
74 type ListPath = FilePath
75 type CorpusPath = FilePath
76 type Limit = Int
77
78 parse :: Limit -> CorpusPath -> ListPath -> IO [Document]
79 parse limit corpus liste = do
80 corpus' <- csvToCorpus limit corpus
81 liste' <- csvGraphTermList liste
82 let patterns = buildPatterns liste'
83 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus'
84
85
86 main :: IO ()
87 main = do
88
89
90 -- [corpusPath, termListPath, outputPath] <- getArgs
91
92 let corpusPath = "/home/qlobbe/data/epique/corpus/cultural_evolution/texts/fullCorpus.csv"
93 let termListPath = "/home/qlobbe/data/epique/corpus/cultural_evolution/termList.csv"
94 let outputPath = "/home/qlobbe/data/epique/output/cultural_evolution.dot"
95
96 let query = PhyloQueryBuild "cultural_evolution" "" 5 3 defaultFis [] [] (WeightedLogJaccard $ WLJParams 0 0)
97 2 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.1 10)
98
99 let queryView = PhyloQueryView 2 Merge False 1 [BranchAge] [defaultSmallBranch] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
100
101 putStrLn $ show "-- Start parsing the corpus"
102
103 corpus <- parse 500 corpusPath termListPath
104
105 let foundations = DL.nub $ DL.concat $ map text corpus
106
107 -- putStrLn $ show (map text corpus)
108
109 -- foundations <- DL.concat <$> DL.concat <$> map snd <$> csvGraphTermList termListPath
110
111 -- putStrLn $ show foundations
112
113 -- a <- map snd <$> csvGraphTermList liste
114
115 let phylo = toPhylo query corpus foundations []
116
117 let view = toPhyloView queryView phylo
118
119 -- TODO Phylo here
120 P.writeFile outputPath $ dotToString $ viewToDot view
121 -- L.writeFile outputPath $ encode corpus
122
123