{-|
Module      : Gargantext.Text.Corpus.Parsers.WOS
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

Parser for the ISI export format of the Web Of Science database.
-}
15 module Gargantext.Text.Corpus.Parsers.WOS (parser, keys) where
17 import Control.Applicative
18 import Data.Attoparsec.ByteString (Parser, string, takeTill, take, manyTill, many1)
19 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
20 import Data.ByteString (ByteString)
21 import Data.ByteString.Char8 (pack)
22 import Gargantext.Text.Corpus.Parsers.RIS (fieldWith)
23 import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
25 -------------------------------------------------------------
-- | Parse a Web Of Science export in ISI format.
--
-- Everything up to and including the @"\\nVR 1.0"@ version marker is
-- skipped, then one or more 'notice' records are read, and finally the
-- @"\\nEF"@ end-of-file marker is consumed. The result holds one list of
-- @(key, value)@ pairs per record.
--
-- TODO: emit a warning when the declared version is not 1.0.
-- NOTE(review): the preamble is skipped with @manyTill anyChar@, so the
-- version marker is matched wherever it first occurs, not necessarily at a
-- fixed position in the header -- confirm this is the intended behaviour.
parser :: Parser [[(ByteString, ByteString)]]
parser = skipPreamble *> many1 notice <* endOfFile
  where
    -- Discard the file header up to and including the version line.
    skipPreamble = manyTill anyChar (string (pack "\nVR 1.0"))
    -- Marker that closes the list of records.
    endOfFile = string (pack "\nEF")
-- | Parse a single record.
--
-- Runs 'start', then collects zero or more @(key, value)@ pairs produced by
-- @fieldWith field@, then runs 'end'. Only the collected pairs are returned;
-- the results of 'start' and 'end' are discarded.
notice :: Parser [(ByteString, ByteString)]
notice = do
  _ <- start
  pairs <- many (fieldWith field)
  _ <- end
  pure pairs
-- | Parse a field tag: a newline, exactly two bytes, then a single space.
--
-- Returns the two bytes between the newline and the space.
field :: Parser ByteString
field = do
  _ <- string (pack "\n")
  tag <- take 2
  _ <- string (pack " ")
  pure tag
-- | Parse the opening line of a record.
--
-- Matches the literal @"\\nPT "@ and returns the bytes that follow it, up to
-- (but not including) the next end-of-line byte.
start :: Parser ByteString
start = do
  _ <- string (pack "\nPT ")
  takeTill isEndOfLine
-- | Consume input up to and including the @"\\nER\\n"@ record terminator.
--
-- Returns the characters that were skipped before the terminator.
end :: Parser String
end = manyTill anyChar (string (pack "\nER\n"))
49 keys :: ByteString -> ByteString
51 | champs == "AF" = "authors"
52 | champs == "TI" = "title"
53 | champs == "SO" = "source"
54 | champs == "DI" = "doi"
55 | champs == "PD" = "publication_date"
56 | champs == "AB" = "abstract"