]> Git — Sourcephile - gargantext.git/blob - src/Data/Gargantext/Parsers/WOS.hs
[FEAT] Adding WOS parser, NLP functions, Rights management guidelines (draft).
[gargantext.git] / src / Data / Gargantext / Parsers / WOS.hs
1 module Data.Gargantext.Parsers.WOS where
2
3 import Prelude hiding (takeWhile, take, concat, readFile)
4 import qualified Data.List as DL
5 import Data.Map as DM
6 import Data.Attoparsec.ByteString
7 import Data.Attoparsec.ByteString.Char8 (anyChar, char8, endOfLine, isDigit_w8, isAlpha_ascii, isEndOfLine)
8 import Data.ByteString (ByteString, unpack, pack, concat, readFile)
9
10 import Data.Either.Extra(Either(..))
11 import Control.Applicative
12
13 import Control.Monad (join)
14
15 -- To be removed just for Tests
16 --
17 -- import Codec.Archive.LibZip (withArchive, fileNames, sourceFile, addFile)
18 --import Codec.Archive.LibZip.Types (ZipSource, OpenFlag (CreateFlag))
19
20 import Control.Concurrent.Async as CCA (mapConcurrently)
21
22 import System.Environment
23
24 import Codec.Archive.Zip
25 import Path (parseAbsFile)
26 import Path.IO (resolveFile')
27 -- import qualified Data.ByteString.Lazy as B
28 import Control.Applicative ( (<$>) )
29
30
31 zipFiles :: FilePath -> IO [ByteString]
32 zipFiles fp = do
33 path <- resolveFile' fp
34 entries <- withArchive path (DM.keys <$> getEntries)
35 bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
36 pure bs
37
38
39 parseFile :: ParserType -> ByteString -> IO Int
40 parseFile p x = case runParser p x of
41 Left e -> pure 1
42 Right r -> pure $ length r
43
44 testWos :: FilePath -> IO [Int]
45 testWos fp = join $ mapConcurrently (parseFile WOS) <$> zipFiles fp
46
47 -- type Parser a = a -> Text -> [Document]
48 data ParserType = WOS | CSV
49
50 wosParser :: Parser [Maybe [ByteString]]
51 wosParser = do
52 -- TODO Warning if version /= 1.0
53 _ <- manyTill anyChar (string "\nVR 1.0")
54 ns <- many1 wosNotice <* "\nEF"
55 return ns
56
57 startNotice :: Parser ByteString
58 startNotice = "\nPT " *> takeTill isEndOfLine
59
60 wosNotice :: Parser (Maybe [ByteString])
61 wosNotice = do
62 n <- startNotice *> wosFields <* manyTill anyChar (string "\nER\n")
63 return n
64
65 field' :: Parser (ByteString, [ByteString])
66 field' = do
67 f <- "\n" *> take 2 <* " "
68 a <- takeTill isEndOfLine
69 as <- try wosLines
70 let as' = case DL.length as > 0 of
71 True -> as
72 False -> []
73 return (f, [a] ++ as')
74
75 wosFields' :: Parser [(ByteString, [ByteString])]
76 wosFields' = many field'
77
78 wosFields :: Parser (Maybe [ByteString])
79 wosFields = do
80 -- a <- field "AU"
81 -- t <- field "TI"
82 -- s <- field "SO"
83 -- d <- field "DI" -- DOI
84 -- p <- field "PD"
85 -- b <- field "AB"
86 -- u <- field "UT"
87 ws <- many field'
88 return $ DL.lookup "UT" ws
89 -- return $ HyperdataDocument
90 -- Just "WOS"
91 -- DL.lookup "DI" ws
92 -- DL.lookup "URL" ws
93 -- DL.lookup "PA" ws
94 -- DL.lookup "TI" ws
95 --
96
97 wosLines :: Parser [ByteString]
98 wosLines = many line
99 where
100 line :: Parser ByteString
101 line = "\n " *> takeTill isEndOfLine
102
103 runParser :: ParserType -> ByteString -> Either String [Maybe [ByteString]]
104 runParser p x = parseOnly parser x
105 where
106 parser = case p of
107 WOS -> wosParser
108 _ -> error "Not implemented yet"
109
110 -- isTokenChar :: Word8 -> Bool
111 -- isTokenChar = inClass "!#$%&'()*+./0-9:<=>?@a-zA-Z[]^_`{|}~-\n"
112