src/Gargantext/Text/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
This module mainly describes how to add a new parser to Gargantext;
please follow the types.
  19 -}
  20
  21 {-# LANGUAGE NoImplicitPrelude #-}
  22
  23 module Gargantext.Text.Parsers -- (parse, FileFormat(..))
  24     where
  25
  26 import Gargantext.Prelude
  27
  28 import System.FilePath (FilePath())
  29 import qualified Data.Map        as DM
  30 import Data.Ord()
  31 import Data.String()
  32
  33 import Data.Text (Text)
  34 import qualified Data.Text as DT
-- Enable Async (import below) to parse files in parallel
  36 --import Control.Concurrent.Async as CCA (mapConcurrently)
  37
  38 import Data.String (String())
  39
  40
  41 type ParseError = String
  42 type Field      = Text
  43 type Document   = DM.Map Field Text
  44
  45 type FilesParsed = DM.Map FilePath FileParsed
  46 data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  47                               , _fileParsed_result :: [Document]
  48                               } deriving (Show)
  49
  50
-- | Format of an input file: a different parser is available
-- depending on this format.
--
-- Only 'WOS' exists as a constructor for now; the other formats listed
-- below are commented out because they are not implemented yet.
data FileFormat = WOS        -- Implemented (ISI Format)
--                | DOC        -- Not Implemented / import Pandoc
--                | ODT        -- Not Implemented / import Pandoc
--                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
--                | XML        -- Not Implemented / see :
--                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
  59
-- TODO: to ease debugging, maybe add the file path to the error message
  61
  62
  63 --parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
  64 --parse format path = do
  65 --    files <- case takeExtension path of
  66 --              ".zip" -> openZip              path
  67 --              _      -> pure <$> DB.readFile path
  68 --    (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
  69 --    pure (as, map toText $ concat bs)
  70 --      where
  71 --        -- TODO : decode with bayesian inference on encodings
  72 --        toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
  73 --
  74 --
  75 ---- | withParser:
---- According to the format of the text, choose the right parser.
  77 ---- TODO  withParser :: FileFormat -> Parser [Document]
  78 --withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
  79 --withParser WOS = wosParser
  80 ----withParser DOC = docParser
  81 ----withParser ODT = odtParser
  82 ----withParser XML = xmlParser
  83 ----withParser _   = error "[ERROR] Parser not implemented yet"
  84 --
  85 --runParser :: FileFormat -> DB.ByteString
  86 --          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
  87 --runParser format text = pure $ parseOnly (withParser format) text
  88 --
  89 --openZip :: FilePath -> IO [DB.ByteString]
  90 --openZip fp = do
  91 --    path    <- resolveFile' fp
  92 --    entries <- withArchive path (DM.keys <$> getEntries)
  93 --    bs      <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
  94 --    pure bs
  95
  96 clean :: Text -> Text
  97 clean txt = DT.map clean' txt
  98   where
  99     clean' '’' = '\''
 100     clean' c  = c
 101
 102