]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Parsers/WOS.hs
[PROTOLUDE] cleaning filter.
[gargantext.git] / src / Gargantext / Parsers / WOS.hs
1 {-# LANGUAGE OverloadedStrings #-}
2
3 module Gargantext.Parsers.WOS (wosParser) where
4
5 -- TOFIX : Should import Gargantext.Prelude here
6 import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
7
8 import qualified Data.List as DL
9
10 import Data.Monoid ((<>))
11 import Data.Attoparsec.ByteString (Parser, try, string
12 , takeTill, take
13 , manyTill, many1)
14 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
15 import Data.ByteString (ByteString, concat)
16 import Data.ByteString.Char8 (pack)
17
18 import Control.Applicative
19
20 --import Gargantext.Types
21
22 -- | wosParser parses ISI format from
23 -- Web Of Science Database
24 wosParser :: Parser [[(ByteString, ByteString)]]
25 wosParser = do
26 -- TODO Warning if version /= 1.0
27 -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
28 _ <- manyTill anyChar (string $ pack "\nVR 1.0")
29 ns <- many1 notice <* (string $ pack "\nEF" )
30 pure ns
31
32 notice :: Parser [(ByteString, ByteString)]
33 notice = start *> fields <* end
34 where
35 start :: Parser ByteString
36 start = "\nPT " *> takeTill isEndOfLine
37
38 end :: Parser [Char]
39 end = manyTill anyChar (string $ pack "\nER\n")
40
41
42 fields :: Parser [(ByteString, ByteString)]
43 fields = many field
44 where
45 field :: Parser (ByteString, ByteString)
46 field = do
47 name <- "\n" *> take 2 <* " "
48 txt <- takeTill isEndOfLine
49 txts <- try lines
50 let txts' = case DL.length txts > 0 of
51 True -> txts
52 False -> []
53 pure (translate name, concat ([txt] <> txts'))
54
55
56 lines :: Parser [ByteString]
57 lines = many line
58 where
59 line :: Parser ByteString
60 line = "\n " *> takeTill isEndOfLine
61
62 translate :: ByteString -> ByteString
63 translate champs
64 | champs == "AU" = "author"
65 | champs == "TI" = "title"
66 | champs == "SO" = "source"
67 | champs == "DI" = "doi"
68 | champs == "PD" = "publication_date"
69 | champs == "AB" = "abstract"
70 | otherwise = champs
71