src/Gargantext/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers assume that the format of the input is known (FileFormat data
  14 type); the right parser is chosen accordingly among the list of
  15 available parsers.
  16
  17 This module mainly describes how to add a new parser to Gargantext;
  18 please follow the types.
  19 -}
  20
  21 module Gargantext.Parsers -- (parse, FileFormat(..))
  22     where
  23
import Gargantext.Prelude

import Control.Exception (evaluate)
import Data.Either.Extra (partitionEithers)
import Data.Either.Extra(Either())
import Data.Foldable (concat)
import Data.Ord()
import Data.String()
import Data.String (String())
import System.FilePath (takeExtension, FilePath())

import Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import qualified Data.ByteString as DB
import qualified Data.Map        as DM
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8, decodeUtf8With)
import Data.Text.Encoding.Error (lenientDecode)
import Path.IO (resolveFile')
----
--import Control.Monad (join)
------ import qualified Data.ByteString.Lazy as B
--import Control.Applicative ( (<$>) )

import Gargantext.Parsers.WOS (wosParser)
  48 ---- import Gargantext.Parsers.XML (xmlParser)
  49 ---- import Gargantext.Parsers.DOC (docParser)
  50 ---- import Gargantext.Parsers.ODT (odtParser)
  51
  52 --import Gargantext.Prelude (pm)
  53 --import Gargantext.Types.Main (ErrorMessage(), Corpus)
  54
  55 -- FIXME
  56 --type Field = Text
  57 type ParseError = String
  58 --
  59 --data Corpus = Corpus { _corpusErrors :: [ParseError]
  60 --                     , _corpusMap    :: Map FilePath (Map Field Text)
  61 --                    }
  62
  63
  64 -- | According to the format of Input file,
  65 -- different parser are available.
  66 data FileFormat = WOS        -- Implemented (ISI Format)
  67 --                | DOC        -- Not Implemented / import Pandoc
  68 --                | ODT        -- Not Implemented / import Pandoc
  69 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  70 --                | XML        -- Not Implemented / see :
  71 --                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
  72
  73 -- TODO: to debug maybe add the filepath in error message
  74
  75
  76 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
  77 parse format path = do
  78     files <- case takeExtension path of
  79               ".zip" -> openZip              path
  80               _      -> pure <$> DB.readFile path
  81     (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
  82     pure (as, map toText $ concat bs)
  83       where
  84         -- TODO : decode with bayesian inference on encodings
  85         toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
  86
  87
  88 -- | withParser:
  89 -- According the format of the text, choosing the right parser.
  90 -- TODO  withParser :: FileFormat -> Parser [Document]
  91 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
  92 withParser WOS = wosParser
  93 --withParser DOC = docParser
  94 --withParser ODT = odtParser
  95 --withParser XML = xmlParser
  96 --withParser _   = error "[ERROR] Parser not implemented yet"
  97
  98 runParser :: FileFormat -> DB.ByteString
  99           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 100 runParser format text = pure $ parseOnly (withParser format) text
 101
 102 openZip :: FilePath -> IO [DB.ByteString]
 103 openZip fp = do
 104     path    <- resolveFile' fp
 105     entries <- withArchive path (DM.keys <$> getEntries)
 106     bs      <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
 107     pure bs
 108
 109