3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
26 import Data.Text (Text, unwords)
28 import GHC.IO (FilePath)
29 import Gargantext.Prelude
30 import Gargantext.Text.List.CSV (csvGraphTermList)
31 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
32 import Gargantext.Text.Terms.WithList
34 import System.Environment
36 import Gargantext.Viz.Phylo
37 import Gargantext.Viz.Phylo.Tools
38 import Gargantext.Viz.Phylo.LevelMaker
39 import Gargantext.Viz.Phylo.View.Export
40 import Gargantext.Viz.Phylo.View.ViewMaker
42 import qualified Data.Map as DM
43 import qualified Data.Vector as DV
44 import qualified Data.List as DL
45 import qualified Data.Text as DT
46 import qualified Prelude as P
47 import qualified Data.ByteString.Lazy as L
49 ------------------------------------------------------------------------
50 -- Format to produce the Phylo
52 TextsByYear { year :: Int
54 } deriving (Show, Generic)
56 instance ToJSON TextsByYear
58 instance ToJSON Document
59 ------------------------------------------------------------------------
61 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
62 filterTerms patterns (year', doc) = (year',termsInText patterns doc)
64 termsInText :: Patterns -> Text -> [Text]
65 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
68 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
69 csvToCorpus limit csv = DV.toList
71 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
74 type ListPath = FilePath
75 type CorpusPath = FilePath
78 parse :: Limit -> CorpusPath -> ListPath -> IO [Document]
79 parse limit corpus liste = do
80 corpus' <- csvToCorpus limit corpus
81 liste' <- csvGraphTermList liste
82 let patterns = buildPatterns liste'
83 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus'
90 -- [corpusPath, termListPath, outputPath] <- getArgs
92 let corpusPath = "/home/qlobbe/data/epique/corpus/cultural_evolution/texts/fullCorpus.csv"
93 let termListPath = "/home/qlobbe/data/epique/corpus/cultural_evolution/termList.csv"
94 let outputPath = "/home/qlobbe/data/epique/output/cultural_evolution.dot"
96 let query = PhyloQueryBuild "cultural_evolution" "" 5 3 defaultFis [] [] (WeightedLogJaccard $ WLJParams 0 0)
97 2 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.1 10)
99 let queryView = PhyloQueryView 2 Merge False 1 [BranchAge] [defaultSmallBranch] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
101 putStrLn $ show "-- Start parsing the corpus"
103 corpus <- parse 500 corpusPath termListPath
105 let foundations = DL.nub $ DL.concat $ map text corpus
107 -- putStrLn $ show (map text corpus)
109 -- foundations <- DL.concat <$> DL.concat <$> map snd <$> csvGraphTermList termListPath
111 -- putStrLn $ show foundations
113 -- a <- map snd <$> csvGraphTermList liste
115 let phylo = toPhylo query corpus foundations []
117 let view = toPhyloView queryView phylo
120 P.writeFile outputPath $ dotToString $ viewToDot view
121 -- L.writeFile outputPath $ encode corpus