src/Gargantext/Text/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE NoImplicitPrelude #-}
  22
  23 module Gargantext.Text.Parsers -- (parse, FileFormat(..))
  24     where
  25
  26 import Gargantext.Prelude
  27
  28 import System.FilePath (takeExtension, FilePath())
  29 import Data.Attoparsec.ByteString (parseOnly, Parser)
  30 import qualified Data.ByteString as DB
  31 import qualified Data.Map        as DM
  32 import Data.Either.Extra (partitionEithers)
  33 import Data.Ord()
  34 import Data.Foldable (concat)
  35 import Data.String()
  36 import Data.Either.Extra(Either())
  37
  38 import Data.Text (Text)
  39 import Data.Text.Encoding (decodeUtf8)
  40 import qualified Data.Text as DT
  41 ----
  42 --import Control.Monad (join)
  43 import Codec.Archive.Zip (withArchive, getEntry, getEntries)
  44 import Path.IO (resolveFile')
  45 ------ import qualified Data.ByteString.Lazy as B
  46 --import Control.Applicative ( (<$>) )
  47 import Control.Concurrent.Async as CCA (mapConcurrently)
  48
  49 import Data.String (String())
  50 import Gargantext.Text.Parsers.WOS (wosParser)
  51 ---- import Gargantext.Parsers.XML (xmlParser)
  52 ---- import Gargantext.Parsers.DOC (docParser)
  53 ---- import Gargantext.Parsers.ODT (odtParser)
  54
  55 --import Gargantext.Types.Main (ErrorMessage(), Corpus)
  56
  57 type ParseError = String
  58 type Field      = Text
  59 type Document   = DM.Map Field Text
  60
  61 type FilesParsed = DM.Map FilePath FileParsed
  62 data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  63                               , _fileParsed_result :: [Document]
  64                               } deriving (Show)
  65
  66
  67 -- | According to the format of Input file,
  68 -- different parser are available.
  69 data FileFormat = WOS        -- Implemented (ISI Format)
  70 --                | DOC        -- Not Implemented / import Pandoc
  71 --                | ODT        -- Not Implemented / import Pandoc
  72 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  73 --                | XML        -- Not Implemented / see :
  74 --                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
  75
  76 -- TODO: to debug maybe add the filepath in error message
  77
  78
  79 --parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
  80 --parse format path = do
  81 --    files <- case takeExtension path of
  82 --              ".zip" -> openZip              path
  83 --              _      -> pure <$> DB.readFile path
  84 --    (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
  85 --    pure (as, map toText $ concat bs)
  86 --      where
  87 --        -- TODO : decode with bayesian inference on encodings
  88 --        toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
  89 --
  90 --
  91 ---- | withParser:
  92 ---- According the format of the text, choosing the right parser.
  93 ---- TODO  withParser :: FileFormat -> Parser [Document]
  94 --withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
  95 --withParser WOS = wosParser
  96 ----withParser DOC = docParser
  97 ----withParser ODT = odtParser
  98 ----withParser XML = xmlParser
  99 ----withParser _   = error "[ERROR] Parser not implemented yet"
 100 --
 101 --runParser :: FileFormat -> DB.ByteString
 102 --          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 103 --runParser format text = pure $ parseOnly (withParser format) text
 104 --
 105 --openZip :: FilePath -> IO [DB.ByteString]
 106 --openZip fp = do
 107 --    path    <- resolveFile' fp
 108 --    entries <- withArchive path (DM.keys <$> getEntries)
 109 --    bs      <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
 110 --    pure bs
 111
 112 clean :: Text -> Text
 113 clean txt = DT.map clean' txt
 114   where
 115     clean' '’' = '\''
 116     clean' c  = c
 117
 118