src/Gargantext/Text/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE NoImplicitPrelude #-}
  22
  23 module Gargantext.Text.Parsers -- (parse, FileFormat(..))
  24     where
  25
  26 import System.FilePath (FilePath(), takeExtension)
  27 import Codec.Archive.Zip (withArchive, getEntry, getEntries)
  28
  29 import Data.Either.Extra (partitionEithers)
  30 import Data.List (concat)
  31 import qualified Data.Map        as DM
  32 import qualified Data.ByteString as DB
  33 import Data.Ord()
  34 import Data.String()
  35 import Data.Either(Either(..))
  36 import Data.Attoparsec.ByteString (parseOnly, Parser)
  37
  38 import Data.Text (Text)
  39 import qualified Data.Text as DT
  40
  41 -- Activate Async for to parse in parallel
  42 import Control.Concurrent.Async as CCA (mapConcurrently)
  43
  44 import Data.Text.Encoding (decodeUtf8)
  45 import Data.String (String())
  46
  47 ------------------------------------------------------------------------
  48 import Gargantext.Prelude
  49 import Gargantext.Text.Parsers.WOS (wosParser)
  50 ------------------------------------------------------------------------
  51
  52
  53 type ParseError = String
  54 type Field      = Text
  55 type Document   = DM.Map Field Text
  56
  57 type FilesParsed = DM.Map FilePath FileParsed
  58 data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  59                               , _fileParsed_result :: [Document]
  60                               } deriving (Show)
  61
  62
  63 -- | According to the format of Input file,
  64 -- different parser are available.
  65 data FileFormat = WOS        -- Implemented (ISI Format)
  66 --                | DOC        -- Not Implemented / import Pandoc
  67 --                | ODT        -- Not Implemented / import Pandoc
  68 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  69 --                | XML        -- Not Implemented / see :
  70 --                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
  71
  72 -- TODO: to debug maybe add the filepath in error message
  73
  74
  75 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
  76 parse format path = do
  77     files <- case takeExtension path of
  78               ".zip" -> openZip              path
  79               _      -> pure <$> DB.readFile path
  80     (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
  81     pure (as, map toText $ concat bs)
  82       where
  83         -- TODO : decode with bayesian inference on encodings
  84         toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
  85
  86
  87 -- | withParser:
  88 -- According the format of the text, choosing the right parser.
  89 -- TODO  withParser :: FileFormat -> Parser [Document]
  90 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
  91 withParser WOS = wosParser
  92 --withParser DOC = docParser
  93 --withParser ODT = odtParser
  94 --withParser XML = xmlParser
  95 --withParser _   = error "[ERROR] Parser not implemented yet"
  96
  97 runParser :: FileFormat -> DB.ByteString
  98           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
  99 runParser format text = pure $ parseOnly (withParser format) text
 100
 101 openZip :: FilePath -> IO [DB.ByteString]
 102 openZip fp = do
 103     entries <- withArchive fp (DM.keys <$> getEntries)
 104     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 105     pure bs
 106
 107 clean :: Text -> Text
 108 clean txt = DT.map clean' txt
 109   where
 110     clean' '’' = '\''
 111     clean' c  = c
 112
 113