]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-adaptative-phylo/Main.hs
working on the adaptative matching
[gargantext.git] / bin / gargantext-adaptative-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adaptative Phylo binaries
11 -}
12
13 {-# LANGUAGE DataKinds #-}
14 {-# LANGUAGE DeriveGeneric #-}
15 {-# LANGUAGE FlexibleInstances #-}
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE StandaloneDeriving #-}
18 {-# LANGUAGE TypeOperators #-}
19 {-# LANGUAGE OverloadedStrings #-}
20 {-# LANGUAGE Strict #-}
21
22 module Main where
23
24 import Data.Aeson
25 import Data.ByteString.Lazy (ByteString)
26 import Data.Maybe ()
27 import Data.List (concat, nub)
28 import Data.String (String)
29 import Data.Text (Text, unwords)
30
31 import Gargantext.Prelude
32 import Gargantext.Text.Context (TermList)
33 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
34 import Gargantext.Text.List.CSV (csvGraphTermList)
35 import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
36 import Gargantext.Viz.AdaptativePhylo
37
38 import GHC.IO (FilePath)
39 import Prelude (Either(..))
40 import System.Environment
41
42 import qualified Data.ByteString.Lazy as Lazy
43 import qualified Data.Vector as Vector
44 import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv
45
46
47 ---------------
48 -- | Tools | --
49 ---------------
50
51
52 -- | To print an important message as an IO()
53 printIOMsg :: String -> IO ()
54 printIOMsg msg =
55 putStrLn ( "\n"
56 <> "------------"
57 <> "\n"
58 <> "-- | " <> msg <> "\n" )
59
60
61 -- | To print a comment as an IO()
62 printIOComment :: String -> IO ()
63 printIOComment cmt =
64 putStrLn ( "\n" <> cmt <> "\n" )
65
66
67 -- | To read and decode a Json file
68 readJson :: FilePath -> IO ByteString
69 readJson path = Lazy.readFile path
70
71 -- | To filter the Ngrams of a document based on the termList
72 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
73 filterTerms patterns (y,d) = (y,termsInText patterns d)
74 where
75 --------------------------------------
76 termsInText :: Patterns -> Text -> [Text]
77 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
78 --------------------------------------
79
80
81 -- | To transform a Csv nfile into a readable corpus
82 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
83 csvToCorpus limit path = Vector.toList
84 <$> Vector.take limit
85 <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
86 <$> snd <$> Csv.readFile path
87
88
89 -- | To use the correct parser given a CorpusType
90 fileToCorpus :: CorpusParser -> Int -> FilePath -> IO ([(Int,Text)])
91 fileToCorpus parser limit path = case parser of
92 -- To do Wos from legacy Main.hs
93 Wos -> undefined
94 Csv -> csvToCorpus limit path
95
96
97 -- | To parse a file into a list of Document
98 fileToDocs :: CorpusParser -> Int -> FilePath -> TermList -> IO [Document]
99 fileToDocs parser limit path lst = do
100 corpus <- fileToCorpus parser limit path
101 let patterns = buildPatterns lst
102 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
103
104
105 --------------
106 -- | Main | --
107 --------------
108
109
110 main :: IO ()
111 main = do
112
113 printIOMsg "Starting the reconstruction"
114
115 printIOMsg "Read the configuration file"
116 [args] <- getArgs
117 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
118
119 case jsonArgs of
120 Left err -> putStrLn err
121 Right config -> do
122
123 printIOMsg "Parse the corpus"
124 mapList <- csvGraphTermList (listPath config)
125 corpus <- fileToDocs (corpusParser config) (corpusLimit config) (corpusPath config) mapList
126 printIOComment (show (length corpus) <> " parsed docs from the corpus")
127
128