Git — Sourcephile - gargantext.git/blob - src/Data/Gargantext/Parsers/WOS.hs
[NGRAMS] improving ngrams extraction with prep (of/de) respectively in eng/fr.
[gargantext.git] / src / Data / Gargantext / Parsers / WOS.hs
1 {-# LANGUAGE OverloadedStrings #-}
2
3 module Data.Gargantext.Parsers.WOS where
4
5 import Prelude hiding (takeWhile, take, concat, readFile)
6 import qualified Data.List as DL
7 import Data.Map as DM
8 import Data.Attoparsec.ByteString
9 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
10 import Data.ByteString (ByteString)
11 import Data.ByteString.Char8 (pack)
12
13 import Data.Either.Extra(Either(..))
14 import Control.Applicative
15
16 import Control.Monad (join)
17
18 -- To be removed just for Tests
19 --
20 -- import Codec.Archive.LibZip (withArchive, fileNames, sourceFile, addFile)
21 --import Codec.Archive.LibZip.Types (ZipSource, OpenFlag (CreateFlag))
22
23 import Control.Concurrent.Async as CCA (mapConcurrently)
24
25 import Codec.Archive.Zip
26 import Path.IO (resolveFile')
27 -- import qualified Data.ByteString.Lazy as B
28 import Control.Applicative ( (<$>) )
29
30 -- type Parser a = a -> Text -> [Document]
-- | Input formats that 'runParser' can be asked to handle.
--
-- Only 'WOS' has a parser wired up in this file.
data ParserType
  = WOS
  | CSV
32
-- | Raw byte string as produced by the WOS parsers.
--
-- In 'wosNotice' a list of these holds the lines of a notice's @UT@ field.
type WosDoc = ByteString
34
35
-- | Parse a complete WOS export.
--
-- Everything up to and including the version marker (a newline followed by
-- @VR 1.0@) is skipped, then one or more notices are read with 'wosNotice',
-- and finally the end tag (a newline followed by @EF@) must be present.
-- The result holds one entry per notice.
wosParser :: Parser [Maybe [WosDoc]]
wosParser =
  -- TODO Warning if version /= 1.0
  -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
  skipHeader *> many1 wosNotice <* string "\nEF"
  where
    -- The skipped header characters are discarded.
    skipHeader = manyTill anyChar (string "\nVR 1.0")
43
-- | Parse one notice: its start line ('startNotice'), its fields
-- ('wosFields') and whatever remains up to the end marker ('endNotice').
--
-- Only the result of 'wosFields' is returned; the start line and the
-- skipped tail are dropped.
wosNotice :: Parser (Maybe [WosDoc])
wosNotice = do
  _ <- startNotice
  fields <- wosFields
  _ <- endNotice
  pure fields
46
-- | Consume input up to and including the notice terminator, i.e. a newline,
-- @ER@ and another newline.
--
-- Returns the characters that were skipped before the terminator.
endNotice :: Parser [Char]
endNotice = anyChar `manyTill` string "\nER\n"
49
-- | Match the opening of a notice: a newline followed by @PT@ and a space.
--
-- Returns the remainder of that line (up to, but not including, the next
-- end-of-line byte).
startNotice :: Parser ByteString
startNotice = do
  _ <- string "\nPT "
  takeTill isEndOfLine
52
53
-- | Parse one tagged field of a notice.
--
-- A field starts with a newline, a tag of exactly two bytes and a single
-- space. The rest of that line is the first value; any continuation lines
-- recognised by 'wosLines' are appended after it.
--
-- Returns the tag paired with all value lines (always at least one).
field' :: Parser (ByteString, [ByteString])
field' = do
  tag <- "\n" *> take 2 <* " "
  firstLine <- takeTill isEndOfLine
  -- 'wosLines' is built on 'many', so it yields [] when there is no
  -- continuation line; no extra emptiness check or 'try' wrapper is needed
  -- (attoparsec parsers always backtrack on failure).
  continuation <- wosLines
  pure (tag, firstLine : continuation)
63
-- | Parse zero or more consecutive fields with 'field''.
--
-- Each element pairs a two-byte tag with the value lines of that field.
wosFields' :: Parser [(ByteString, [ByteString])]
wosFields' = many field'
66
-- | Parse all fields of a notice and keep only the value lines of the
-- first field tagged @UT@.
--
-- Yields 'Nothing' when no @UT@ field was parsed.
wosFields :: Parser (Maybe [ByteString])
wosFields = DL.lookup "UT" <$> wosFields'
  -- Commented-out sketch kept from the original: individual field parsers
  --   a <- field "AU"
  --   t <- field "TI"
  --   s <- field "SO"
  --   d <- field "DI" -- DOI
  --   p <- field "PD"
  --   b <- field "AB"
  --   u <- field "UT"
  -- and a richer result built from the parsed fields @ws@:
  --   return $ HyperdataDocument
  --     Just "WOS"
  --     DL.lookup "DI" ws
  --     DL.lookup "URL" ws
  --     DL.lookup "PA" ws
  --     DL.lookup "TI" ws
85
-- | Parse zero or more continuation lines of a field.
--
-- A continuation line is a newline followed by one space; the rest of the
-- line is returned as-is, so any additional leading spaces stay in the
-- value.
wosLines :: Parser [ByteString]
wosLines = many continuationLine
  where
    continuationLine :: Parser ByteString
    continuationLine = string "\n " *> takeTill isEndOfLine
91
-- | Run the parser selected by the 'ParserType' over the whole input.
--
-- * 'WOS' runs 'wosParser' via 'parseOnly'; a parse failure is reported as
--   @Left@ with attoparsec's error message.
-- * 'CSV' is not supported yet and is reported as @Left@ rather than
--   raising an exception from pure code.
--
-- The match is exhaustive on purpose, so adding a constructor to
-- 'ParserType' triggers an incomplete-pattern warning here.
runParser :: ParserType -> ByteString -> Either String [Maybe [WosDoc]]
runParser WOS x = parseOnly wosParser x
runParser CSV _ = Left "Not implemented yet"
98
99 -- isTokenChar :: Word8 -> Bool
100 -- isTokenChar = inClass "!#$%&'()*+./0-9:<=>?@a-zA-Z[]^_`{|}~-\n"
101
102
-- | Read the contents of every entry of the zip archive at the given path.
--
-- The archive is opened once to list its entries, and then once more per
-- entry (concurrently) to fetch that entry's bytes.
zipFiles :: FilePath -> IO [ByteString]
zipFiles fp = do
  archive <- resolveFile' fp
  selectors <- withArchive archive (fmap DM.keys getEntries)
  mapConcurrently (withArchive archive . getEntry) selectors
109
110
-- | Parse one input with 'runParser' and count the parsed notices.
--
-- A parse failure is not reported: it simply yields 0.
parseFile :: ParserType -> ByteString -> IO Int
parseFile p x =
  -- Selecting the action via 'either' forces the parse result when the
  -- action is run, exactly like a @case@ on it would.
  either (\_ -> pure 0) (pure . length) (runParser p x)
115
-- | Parse every entry of a zip archive as WOS and return the number of
-- notices found in each one (0 for entries that fail to parse).
--
-- Entries are read with 'zipFiles' and parsed concurrently.
testWos :: FilePath -> IO [Int]
testWos fp = do
  contents <- zipFiles fp
  mapConcurrently (parseFile WOS) contents
118
119
120