]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-adaptative-phylo/Main.hs
add the phyloBase, Fis and Cooc
[gargantext.git] / bin / gargantext-adaptative-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adaptative Phylo binaries
11 -}
12
13 {-# LANGUAGE DataKinds #-}
14 {-# LANGUAGE DeriveGeneric #-}
15 {-# LANGUAGE FlexibleInstances #-}
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE StandaloneDeriving #-}
18 {-# LANGUAGE TypeOperators #-}
19 {-# LANGUAGE OverloadedStrings #-}
20 {-# LANGUAGE Strict #-}
21
22 module Main where
23
24 import Data.Aeson
25 import Data.ByteString.Lazy (ByteString)
26 import Data.Maybe (isJust, fromJust)
27 import Data.List (concat, nub, isSuffixOf, take)
28 import Data.String (String)
29 import Data.Text (Text, unwords)
30
31 import Gargantext.Prelude
32 import Gargantext.Database.Types.Node (HyperdataDocument(..))
33 import Gargantext.Text.Context (TermList)
34 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
35 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
36 import Gargantext.Text.List.CSV (csvGraphTermList)
37 import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
38 import Gargantext.Viz.AdaptativePhylo
39 import Gargantext.Viz.Phylo.PhyloMaker (toPhylo)
40
41 import GHC.IO (FilePath)
42 import Prelude (Either(..))
43 import System.Environment
44 import System.Directory (listDirectory)
45 import Control.Concurrent.Async (mapConcurrently)
46
47 import qualified Data.ByteString.Lazy as Lazy
48 import qualified Data.Vector as Vector
49 import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv
50
51
52 ---------------
53 -- | Tools | --
54 ---------------
55
56
57 -- | To print an important message as an IO()
58 printIOMsg :: String -> IO ()
59 printIOMsg msg =
60 putStrLn ( "\n"
61 <> "------------"
62 <> "\n"
63 <> "-- | " <> msg <> "\n" )
64
65
66 -- | To print a comment as an IO()
67 printIOComment :: String -> IO ()
68 printIOComment cmt =
69 putStrLn ( "\n" <> cmt <> "\n" )
70
71
72 -- | To get all the files in a directory or just a file
73 getFilesFromPath :: FilePath -> IO([FilePath])
74 getFilesFromPath path = do
75 if (isSuffixOf "/" path)
76 then (listDirectory path)
77 else return [path]
78
79
80 --------------
81 -- | Json | --
82 --------------
83
84
85 -- | To read and decode a Json file
86 readJson :: FilePath -> IO ByteString
87 readJson path = Lazy.readFile path
88
89
90 ----------------
91 -- | Parser | --
92 ----------------
93
94 -- | To filter the Ngrams of a document based on the termList
95 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
96 filterTerms patterns (y,d) = (y,termsInText patterns d)
97 where
98 --------------------------------------
99 termsInText :: Patterns -> Text -> [Text]
100 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
101 --------------------------------------
102
103
104 -- | To transform a Wos file (or [file]) into a readable corpus
105 wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
106 wosToCorpus limit path = do
107 files <- getFilesFromPath path
108 take limit
109 <$> map (\d -> let date' = fromJust $ _hyperdataDocument_publication_year d
110 title = fromJust $ _hyperdataDocument_title d
111 abstr = if (isJust $ _hyperdataDocument_abstract d)
112 then fromJust $ _hyperdataDocument_abstract d
113 else ""
114 in (date', title <> " " <> abstr))
115 <$> concat
116 <$> mapConcurrently (\file ->
117 filter (\d -> (isJust $ _hyperdataDocument_publication_year d)
118 && (isJust $ _hyperdataDocument_title d))
119 <$> parseFile WOS (path <> file) ) files
120
121
122 -- | To transform a Csv file into a readable corpus
123 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
124 csvToCorpus limit path = Vector.toList
125 <$> Vector.take limit
126 <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
127 <$> snd <$> Csv.readFile path
128
129
130 -- | To use the correct parser given a CorpusType
131 fileToCorpus :: CorpusParser -> Int -> FilePath -> IO ([(Int,Text)])
132 fileToCorpus parser limit path = case parser of
133 Wos -> wosToCorpus limit path
134 Csv -> csvToCorpus limit path
135
136
137 -- | To parse a file into a list of Document
138 fileToDocs :: CorpusParser -> Int -> FilePath -> TermList -> IO [Document]
139 fileToDocs parser limit path lst = do
140 corpus <- fileToCorpus parser limit path
141 let patterns = buildPatterns lst
142 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
143
144
145 --------------
146 -- | Main | --
147 --------------
148
149
150 main :: IO ()
151 main = do
152
153 printIOMsg "Starting the reconstruction"
154
155 printIOMsg "Read the configuration file"
156 [args] <- getArgs
157 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
158
159 case jsonArgs of
160 Left err -> putStrLn err
161 Right config -> do
162
163 printIOMsg "Parse the corpus"
164 mapList <- csvGraphTermList (listPath config)
165 corpus <- fileToDocs (corpusParser config) (corpusLimit config) (corpusPath config) mapList
166 printIOComment (show (length corpus) <> " parsed docs from the corpus")
167
168 printIOMsg "Reconstruct the Phylo"
169 let phylo = toPhylo corpus mapList config
170
171 printIOMsg "End of reconstruction"