1 {-# LANGUAGE OverloadedStrings #-}
3 module Data.Gargantext.Parsers.WOS where
5 import Prelude hiding (takeWhile, take, concat, readFile)
6 import qualified Data.List as DL
8 import Data.Attoparsec.ByteString
9 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
10 import Data.ByteString (ByteString)
11 import Data.ByteString.Char8 (pack)
13 import Data.Either.Extra(Either(..))
14 import Control.Applicative
16 import Control.Monad (join)
18 -- To be removed just for Tests
20 -- import Codec.Archive.LibZip (withArchive, fileNames, sourceFile, addFile)
21 --import Codec.Archive.LibZip.Types (ZipSource, OpenFlag (CreateFlag))
23 import Control.Concurrent.Async as CCA (mapConcurrently)
25 import Codec.Archive.Zip
26 import Path.IO (resolveFile')
27 -- import qualified Data.ByteString.Lazy as B
28 import Control.Applicative ( (<$>) )
31 zipFiles :: FilePath -> IO [ByteString]
33 path <- resolveFile' fp
34 entries <- withArchive path (DM.keys <$> getEntries)
35 bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
39 parseFile :: ParserType -> ByteString -> IO Int
40 parseFile p x = case runParser p x of
42 Right r -> pure $ length r
44 testWos :: FilePath -> IO [Int]
45 testWos fp = join $ mapConcurrently (parseFile WOS) <$> zipFiles fp
47 -- type Parser a = a -> Text -> [Document]
48 data ParserType = WOS | CSV
50 wosParser :: Parser [Maybe [ByteString]]
52 -- TODO Warning if version /= 1.0
53 -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
54 _ <- manyTill anyChar (string $ pack "\nVR 1.0")
55 ns <- many1 wosNotice <* (string $ pack "\nEF")
58 startNotice :: Parser ByteString
59 startNotice = "\nPT " *> takeTill isEndOfLine
61 wosNotice :: Parser (Maybe [ByteString])
63 n <- startNotice *> wosFields <* manyTill anyChar (string $ pack "\nER\n")
66 field' :: Parser (ByteString, [ByteString])
68 f <- "\n" *> take 2 <* " "
69 a <- takeTill isEndOfLine
71 let as' = case DL.length as > 0 of
74 return (f, [a] ++ as')
76 wosFields' :: Parser [(ByteString, [ByteString])]
77 wosFields' = many field'
79 wosFields :: Parser (Maybe [ByteString])
84 -- d <- field "DI" -- DOI
89 return $ DL.lookup "UT" ws
90 -- return $ HyperdataDocument
98 wosLines :: Parser [ByteString]
101 line :: Parser ByteString
102 line = "\n " *> takeTill isEndOfLine
104 runParser :: ParserType -> ByteString -> Either String [Maybe [ByteString]]
105 runParser p x = parseOnly parser x
109 _ -> error "Not implemented yet"
111 -- isTokenChar :: Word8 -> Bool
112 -- isTokenChar = inClass "!#$%&'()*+./0-9:<=>?@a-zA-Z[]^_`{|}~-\n"