install: proposal

[gargantext.git] / src / Gargantext / Text / Parsers.hs
diff --git a/src/Gargantext/Text/Parsers.hs b/src/Gargantext/Text/Parsers.hs

index c94da34eabdd3d7660ffcc095f606ad65e04e5f8..dbddbbd644b359b7813ec36b1a69bd24cfec4a24 100644 (file)
--- a/src/Gargantext/Text/Parsers.hs
+++ b/src/Gargantext/Text/Parsers.hs
@@ -23,20 +23,31 @@ please follow the types.
  module Gargantext.Text.Parsers -- (parse, FileFormat(..))
      where
  
-import Gargantext.Prelude
+import System.FilePath (FilePath(), takeExtension)
+import Codec.Archive.Zip (withArchive, getEntry, getEntries)
  
-import System.FilePath (FilePath())
+import Data.Either.Extra (partitionEithers)
+import Data.List (concat)
  import qualified Data.Map        as DM
+import qualified Data.ByteString as DB
  import Data.Ord()
  import Data.String()
+import Data.Either(Either(..))
+import Data.Attoparsec.ByteString (parseOnly, Parser)
  
  import Data.Text (Text)
  import qualified Data.Text as DT
  -- | Activate Async for to parse in parallel
---import Control.Concurrent.Async as CCA (mapConcurrently)
+import Control.Concurrent.Async as CCA (mapConcurrently)
  
+import Data.Text.Encoding (decodeUtf8)
  import Data.String (String())
  
+------------------------------------------------------------------------
+import Gargantext.Prelude
+import Gargantext.Text.Parsers.WOS (wosParser)
+------------------------------------------------------------------------
+
  
  type ParseError = String
  type Field      = Text
@@ -60,38 +71,37 @@ data FileFormat = WOS        -- Implemented (ISI Format)
  -- TODO: to debug maybe add the filepath in error message
  
  
---parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
---parse format path = do
---    files <- case takeExtension path of
---              ".zip" -> openZip              path
---              _      -> pure <$> DB.readFile path
---    (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
---    pure (as, map toText $ concat bs)
---      where
---        -- TODO : decode with bayesian inference on encodings
---        toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
---
---
----- | withParser:
----- According the format of the text, choosing the right parser.
----- TODO  withParser :: FileFormat -> Parser [Document]
---withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
---withParser WOS = wosParser
-----withParser DOC = docParser
-----withParser ODT = odtParser
-----withParser XML = xmlParser
-----withParser _   = error "[ERROR] Parser not implemented yet"
---
---runParser :: FileFormat -> DB.ByteString 
---          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
---runParser format text = pure $ parseOnly (withParser format) text
---
---openZip :: FilePath -> IO [DB.ByteString]
---openZip fp = do
---    path    <- resolveFile' fp
---    entries <- withArchive path (DM.keys <$> getEntries)
---    bs      <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
---    pure bs
+-- | Parse the file at @path@ (each entry of it when it is a ".zip") as @format@.
+parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
+parse format path = do
+    contents <- case takeExtension path of
+                  ".zip" -> openZip path
+                  _      -> pure <$> DB.readFile path
+    (errs, docs) <- partitionEithers <$> mapConcurrently (runParser format) contents
+    -- TODO : decode with bayesian inference on encodings
+    let toText = map (\(k, v) -> (decodeUtf8 k, decodeUtf8 v))
+    pure (errs, map toText (concat docs))
+
+
+-- | Choose the parser matching the format of the text (only WOS so far).
+-- TODO  withParser :: FileFormat -> Parser [Document]
+withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
+withParser format = case format of
+    WOS -> wosParser
+--  DOC -> docParser
+--  ODT -> odtParser
+--  XML -> xmlParser
+--  _   -> error "[ERROR] Parser not implemented yet"
+
+-- | Run the parser chosen by 'withParser' on one input; wraps 'parseOnly' in IO.
+runParser :: FileFormat -> DB.ByteString -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
+runParser format input = pure (parseOnly (withParser format) input)
+
+-- | Read the contents of every entry of the zip archive at @fp@.
+openZip :: FilePath -> IO [DB.ByteString]
+openZip fp = do
+    selectors <- withArchive fp (DM.keys <$> getEntries)
+    mapConcurrently (\sel -> withArchive fp (getEntry sel)) selectors
  
  clean :: Text -> Text
  clean txt = DT.map clean' txt