2 Module : Gargantext.Text.Parsers.WOS
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Attoparsec-based parser for the ISI plain-text export format of the
11 Web Of Science database; the single exported entry point is 'wosParser'.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
17 module Gargantext.Text.Parsers.WOS (wosParser) where
19 -- TOFIX : Should import Gargantext.Prelude here
20 import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
22 import qualified Data.List as DL
24 import Data.Monoid ((<>))
25 import Data.Attoparsec.ByteString (Parser, try, string
28 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
29 import Data.ByteString (ByteString, concat)
30 import Data.ByteString.Char8 (pack)
32 import Control.Applicative
34 --import Gargantext.Types
36 -- | wosParser parses ISI format from
37 -- Web Of Science Database
38 wosParser :: Parser [[(ByteString, ByteString)]]
-- The result type is a list of records, each record being a list of
-- (ByteString, ByteString) pairs as produced by 'notice'.
-- NOTE(review): the equation head and the final result line of this
-- definition are not visible in this excerpt; the remarks below cover only
-- the two statements that are shown.
40 -- TODO Warning if version /= 1.0
41 -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
-- Discard input up to and including the first "\nVR 1.0" marker: 'manyTill'
-- tries the marker at each position before consuming one more character.
42 _ <- manyTill anyChar (string $ pack "\nVR 1.0")
-- At least one record is required, and the "\nEF" marker must follow the
-- last one, otherwise the parser fails.
43 ns <- many1 notice <* (string $ pack "\nEF" )
-- | Parse a single record: the opening line matched by 'start', then the
-- record's 'fields', then the trailing input consumed by 'end'. Only the
-- result of 'fields' is returned; what 'start' and 'end' produce is dropped.
notice :: Parser [(ByteString, ByteString)]
notice = do
  _ <- start
  parsed <- fields
  _ <- end
  pure parsed
-- | Match a newline followed by @PT @ (with its trailing space) and return
-- the remainder of that line, i.e. every byte before the next end-of-line
-- byte. The end-of-line byte itself is left unconsumed.
start :: Parser ByteString
start = string "\nPT " >> takeTill isEndOfLine
-- Skip the tail of a record: consume characters one at a time until the
-- terminator (newline, ER, newline) matches; the terminator is consumed too.
-- The skipped characters form the parser's result, which 'notice' discards.
end = anyChar `manyTill` string "\nER\n"
-- | The (ByteString, ByteString) pairs of one record; 'notice' runs this
-- parser between 'start' and 'end'.
-- NOTE(review): only the signature is visible in this excerpt, the defining
-- equation is missing -- presumably it repeats 'field'; confirm.
56 fields :: Parser [(ByteString, ByteString)]
-- | Parse one tagged field and return the pair (translated tag, field text).
-- NOTE(review): the equation head, the binding of @txts@ and the two @case@
-- alternatives are not visible in this excerpt; the comments below describe
-- only the statements that are shown.
59 field :: Parser (ByteString, ByteString)
-- A newline, then exactly two bytes taken as the tag, then one space.
61 name <- "\n" *> take 2 <* " "
-- Remainder of the tag line, stopping before the next end-of-line byte.
62 txt <- takeTill isEndOfLine
-- NOTE(review): @DL.length txts > 0@ walks the whole list only to test for
-- emptiness; @not (DL.null txts)@ would express the same test in O(1).
64 let txts' = case DL.length txts > 0 of
-- The tag goes through 'translate'; the first-line text and the chunks in
-- @txts'@ are glued together by 'concat' with no separator between them.
67 pure (translate name, concat ([txt] <> txts'))
-- | A list of text chunks, each a 'ByteString'.
-- NOTE(review): only the signature is visible in this excerpt, the defining
-- equation is missing -- presumably it repeats 'line' to collect the extra
-- lines of a field; confirm.
70 lines :: Parser [ByteString]
-- | Match a newline followed by a space (so the line does not begin with a
-- field tag) and return the rest of that line, up to but not including the
-- next end-of-line byte.
line :: Parser ByteString
line = string "\n " >> takeTill isEndOfLine
-- | Map a two-letter field tag to a descriptive field name
-- (for example @AU@ becomes @author@ and @TI@ becomes @title@).
-- NOTE(review): the equation head that binds @champs@ and any guard after
-- the @AB@ case are not visible in this excerpt, so the result for tags
-- other than the six listed here cannot be stated -- confirm.
76 translate :: ByteString -> ByteString
78 | champs == "AU" = "author"
79 | champs == "TI" = "title"
80 | champs == "SO" = "source"
81 | champs == "DI" = "doi"
82 | champs == "PD" = "publication_date"
83 | champs == "AB" = "abstract"