src/Gargantext/Text/Parsers/WOS.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers.WOS
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
Attoparsec parser for the ISI plain-text export format of the
Web Of Science database. The only export is 'wosParser'.
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16
  17 module Gargantext.Text.Parsers.WOS (wosParser) where
  18
  19 -- TOFIX : Should import Gargantext.Prelude here
  20 import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
  21
  22 import qualified Data.List as DL
  23
  24 import Data.Monoid ((<>))
  25 import Data.Attoparsec.ByteString (Parser, try, string
  26                                   , takeTill, take
  27                                   , manyTill, many1)
  28 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
  29 import Data.ByteString (ByteString, concat)
  30 import Data.ByteString.Char8 (pack)
  31
  32 import Control.Applicative
  33
  34 --import Gargantext.Types
  35
  36 -- | wosParser parses ISI format from
  37 -- Web Of Science Database
  38 wosParser :: Parser [[(ByteString, ByteString)]]
  39 wosParser = do
  40     -- TODO Warning if version /= 1.0
  41     -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
  42     _  <- manyTill anyChar (string $ pack "\nVR 1.0")
  43     ns <- many1 notice <*  (string $ pack "\nEF"    )
  44     pure ns
  45
  46 notice :: Parser [(ByteString, ByteString)]
  47 notice = start *> fields <* end
  48     where
  49       start :: Parser ByteString
  50       start = "\nPT " *> takeTill isEndOfLine
  51
  52       end :: Parser [Char]
  53       end = manyTill anyChar (string $ pack "\nER\n")
  54
  55
  56 fields :: Parser [(ByteString, ByteString)]
  57 fields = many field
  58     where
  59         field :: Parser (ByteString, ByteString)
  60         field = do
  61             name  <- "\n" *> take 2 <* " "
  62             txt   <- takeTill isEndOfLine
  63             txts  <- try lines
  64             let txts' = case DL.length txts > 0 of
  65                     True  -> txts
  66                     False -> []
  67             pure (translate name, concat ([txt] <> txts'))
  68
  69
  70 lines :: Parser [ByteString]
  71 lines = many line
  72     where
  73         line :: Parser ByteString
  74         line = "\n  " *> takeTill isEndOfLine
  75
  76 translate :: ByteString -> ByteString
  77 translate champs
  78             | champs == "AU" = "author"
  79             | champs == "TI" = "title"
  80             | champs == "SO" = "source"
  81             | champs == "DI" = "doi"
  82             | champs == "PD" = "publication_date"
  83             | champs == "AB" = "abstract"
  84             | otherwise  = champs
  85