3 Description : Gargantext starter binary with Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
26 import Data.Text (Text, unwords)
28 import GHC.IO (FilePath)
29 import Gargantext.Prelude
30 import Gargantext.Text.List.CSV (csvGraphTermList)
31 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
32 import Gargantext.Text.Terms.WithList
33 import System.Environment
35 import Gargantext.Viz.Phylo
36 import Gargantext.Viz.Phylo.Tools
37 import Gargantext.Viz.Phylo.LevelMaker
38 import Gargantext.Viz.Phylo.View.Export
39 import Gargantext.Viz.Phylo.View.ViewMaker
41 import qualified Data.Map as DM
42 import qualified Data.Vector as DV
43 import qualified Data.List as DL
44 import qualified Data.Text as DT
45 import qualified Prelude as P
46 import qualified Data.ByteString.Lazy as L
48 ------------------------------------------------------------------------
49 -- Format to produce the Phylo
51 TextsByYear { year :: Int
53 } deriving (Show, Generic)
55 instance ToJSON TextsByYear
57 instance ToJSON Document
58 ------------------------------------------------------------------------
60 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
61 filterTerms patterns (year', doc) = (year',termsInText patterns doc)
63 termsInText :: Patterns -> Text -> [Text]
64 termsInText pats txt = DL.nub $ DL.concat $ map (map unwords) $ extractTermsWithList pats txt
67 -- csvToCorpus :: Int -> FilePath -> IO (DM.Map Int [Text])
68 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
69 csvToCorpus limit csv = DV.toList
70 -- DM.fromListWith (<>)
72 . DV.map (\n -> (csv_publication_year n, (csv_title n) <> " " <> (csv_abstract n)))
75 type ListPath = FilePath
76 type CorpusPath = FilePath
79 parse :: Limit -> CorpusPath -> ListPath -> IO [Document]
80 parse limit corpus liste = do
81 corpus' <- csvToCorpus limit corpus
82 liste' <- csvGraphTermList liste
83 let patterns = buildPatterns liste'
84 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus'
90 -- [corpusFile, termListFile, outputFile] <- getArgs
92 let corpusPath = "/home/qlobbe/data/epique/corpus/cultural_evolution/texts/fullCorpus.csv"
93 let termListPath = "/home/qlobbe/data/epique/corpus/cultural_evolution/termList.csv"
94 let outputPath = "/home/qlobbe/data/epique/output/cultural_evolution.dot"
96 let query = PhyloQueryBuild "cultural_evolution" "Test" 5 3 defaultFis [] [] defaultWeightedLogJaccard 3 defaultRelatedComponents
97 let queryView = PhyloQueryView 2 Merge False 1 [BranchAge] [defaultSmallBranch] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
99 corpus <- parse 5000 corpusPath termListPath
101 let foundations = DL.nub $ DL.concat $ map text corpus
103 -- putStrLn $ show $ csvGraphTermList termListPath
105 let phylo = toPhylo query corpus foundations []
107 let view = toPhyloView queryView phylo
110 P.writeFile outputPath $ dotToString $ viewToDot view
111 -- L.writeFile outputPath $ encode corpus