bin/gargantext-adaptative-phylo/Main.hs

   1 {-|
   2 Module      : Main.hs
   3 Description : Gargantext starter binary with Adaptative Phylo
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Adaptative Phylo binaries
  11  -}
  12
  13 {-# LANGUAGE DataKinds          #-}
  14 {-# LANGUAGE DeriveGeneric      #-}
  15 {-# LANGUAGE FlexibleInstances  #-}
  16 {-# LANGUAGE NoImplicitPrelude  #-}
  17 {-# LANGUAGE StandaloneDeriving #-}
  18 {-# LANGUAGE TypeOperators      #-}
  19 {-# LANGUAGE OverloadedStrings  #-}
  20 {-# LANGUAGE Strict             #-}
  21
  22 module Main where
  23
  24 import Data.Aeson
  25 import Data.ByteString.Lazy (ByteString)
  26 import Data.Maybe (isJust, fromJust)
  27 import Data.List  (concat, nub, isSuffixOf, take)
  28 import Data.String (String)
  29 import Data.Text  (Text, unwords, unpack)
  30
  31 import Gargantext.Prelude
  32 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  33 import Gargantext.Text.Context (TermList)
  34 import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
  35 import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
  36 import Gargantext.Text.List.CSV (csvGraphTermList)
  37 import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
  38 import Gargantext.Viz.AdaptativePhylo
  39 import Gargantext.Viz.Phylo.PhyloMaker  (toPhylo)
  40 import Gargantext.Viz.Phylo.PhyloTools  (printIOMsg, printIOComment)
  41 import Gargantext.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
  42 -- import Gargantext.Viz.Phylo.SynchronicClustering (synchronicDistance')
  43
  44 import GHC.IO (FilePath)
  45 import Prelude (Either(..))
  46 import System.Environment
  47 import System.Directory (listDirectory)
  48 import Control.Concurrent.Async (mapConcurrently)
  49
  50 import qualified Data.ByteString.Lazy as Lazy
  51 import qualified Data.Vector as Vector
  52 import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv
  53
  54
  55 ---------------
  56 -- | Tools | --
  57 ---------------
  58
  59
  60 -- | To get all the files in a directory or just a file
  61 getFilesFromPath :: FilePath -> IO([FilePath])
  62 getFilesFromPath path = do
  63   if (isSuffixOf "/" path)
  64     then (listDirectory path)
  65     else return [path]
  66
  67
  68 --------------
  69 -- | Json | --
  70 --------------
  71
  72
  73 -- | To read and decode a Json file
  74 readJson :: FilePath -> IO ByteString
  75 readJson path = Lazy.readFile path
  76
  77
  78 ----------------
  79 -- | Parser | --
  80 ----------------
  81
  82 -- | To filter the Ngrams of a document based on the termList
  83 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
  84 filterTerms patterns (y,d) = (y,termsInText patterns d)
  85   where
  86     --------------------------------------
  87     termsInText :: Patterns -> Text -> [Text]
  88     termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
  89     --------------------------------------
  90
  91
  92 -- | To transform a Wos file (or [file]) into a readable corpus
  93 wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
  94 wosToCorpus limit path = do
  95       files <- getFilesFromPath path
  96       take limit
  97         <$> map (\d -> let date' = fromJust $ _hyperdataDocument_publication_year d
  98                            title = fromJust $ _hyperdataDocument_title d
  99                            abstr = if (isJust $ _hyperdataDocument_abstract d)
 100                                    then fromJust $ _hyperdataDocument_abstract d
 101                                    else ""
 102                         in (date', title <> " " <> abstr))
 103         <$> concat
 104         <$> mapConcurrently (\file ->
 105               filter (\d -> (isJust $ _hyperdataDocument_publication_year d)
 106                          && (isJust $ _hyperdataDocument_title d))
 107                 <$> parseFile WOS (path <> file) ) files
 108
 109
 110 -- | To transform a Csv file into a readable corpus
 111 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
 112 csvToCorpus limit path = Vector.toList
 113     <$> Vector.take limit
 114     <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
 115     <$> snd <$> Csv.readFile path
 116
 117
 118 -- | To use the correct parser given a CorpusType
 119 fileToCorpus :: CorpusParser -> FilePath -> IO ([(Int,Text)])
 120 fileToCorpus parser path = case parser of
 121   Wos limit -> wosToCorpus limit path
 122   Csv limit -> csvToCorpus limit path
 123
 124
 125 -- | To parse a file into a list of Document
 126 fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
 127 fileToDocs parser path lst = do
 128   corpus <- fileToCorpus parser path
 129   let patterns = buildPatterns lst
 130   pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
 131
 132
 133 --------------
 134 -- | Main | --
 135 --------------
 136
 137
 138 main :: IO ()
 139 main = do
 140
 141     printIOMsg "Starting the reconstruction"
 142
 143     printIOMsg "Read the configuration file"
 144     [args]   <- getArgs
 145     jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
 146
 147     case jsonArgs of
 148         Left err     -> putStrLn err
 149         Right config -> do
 150
 151             printIOMsg "Parse the corpus"
 152             mapList <- csvGraphTermList (listPath config)
 153             corpus  <- fileToDocs (corpusParser config) (corpusPath config) mapList
 154             printIOComment (show (length corpus) <> " parsed docs from the corpus")
 155
 156             printIOMsg "Reconstruct the Phylo"
 157
 158             let phylo = toPhylo corpus mapList config
 159
 160             -- | probes
 161
 162             -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
 163             --          $ synchronicDistance' phylo 1
 164
 165             -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
 166             --         $ inflexionPoints phylo 1
 167
 168             printIOMsg "End of reconstruction, start the export"
 169
 170             let dot = toPhyloExport phylo
 171
 172             let output = (outputPath config)
 173                       <> (unpack $ phyloName config)
 174                       <> "_V2.dot"
 175
 176             dotToFile output dot