Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/WOS.hs
Merge branch 'dev-phylo' of https://gitlab.iscpif.fr/gargantext/haskell-gargantext...
[gargantext.git] / src / Gargantext / Text / Parsers / WOS.hs
1 {-|
2 Module : Gargantext.Text.Parsers.WOS
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
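10 Attoparsec parser for the ISI export format of the Web Of Science database.
11 A file is read as a header ending in @VR 1.0@, one or more records delimited
11 by @PT@ and @ER@ lines, and a closing @EF@ marker.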
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Text.Parsers.WOS (wosParser) where
18
19 -- TOFIX : Should import Gargantext.Prelude here
20 import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
21
22 import qualified Data.List as DL
23
24 import Data.Monoid ((<>))
25 import Data.Attoparsec.ByteString (Parser, try, string
26 , takeTill, take
27 , manyTill, many1)
28 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
29 import Data.ByteString (ByteString, concat)
30 import Data.ByteString.Char8 (pack)
31 import Control.Applicative
32
33 -------------------------------------------------------------
-- | Parser for a Web Of Science export in ISI format.
--
-- Skips everything up to and including the @VR 1.0@ version line, then
-- parses one or more records with 'notice' and finally requires the
-- @EF@ marker. The result holds one list of (field name, value) pairs
-- per record.
wosParser :: Parser [[(ByteString, ByteString)]]
wosParser = skipHeader *> many1 notice <* endOfFile
  where
    -- TODO: emit a warning when the version is not 1.0.
    -- FIXME: 'manyTill' 'anyChar' accepts arbitrary content before the
    -- marker; check whether an exact match on "\nVR 1.0" is wanted instead.
    skipHeader :: Parser [Char]
    skipHeader = manyTill anyChar (string (pack "\nVR 1.0"))

    -- Marker that must follow the last record.
    endOfFile :: Parser ByteString
    endOfFile = string (pack "\nEF")
43
-- | Parser for a single record.
--
-- A record opens with a @PT@ line (whose content is discarded), is
-- followed by any number of fields, and is closed by an @ER@ line.
-- Anything between the last parsed field and the @ER@ line is skipped.
notice :: Parser [(ByteString, ByteString)]
notice = do
  _ <- recordHeader
  parsedFields <- fields
  _ <- recordFooter
  pure parsedFields
  where
    -- Consumes "\nPT " and the rest of that line.
    recordHeader :: Parser ByteString
    recordHeader = string "\nPT " *> takeTill isEndOfLine

    -- Consumes input up to and including the "\nER\n" terminator.
    recordFooter :: Parser [Char]
    recordFooter = manyTill anyChar (string "\nER\n")
52
53
-- | Parser for the fields of one record.
--
-- Each field starts on a new line with a two-byte tag followed by a
-- space; the rest of that line and any continuation lines (see 'lines')
-- are concatenated, in order, into the field value. The tag is passed
-- through 'translate' before being returned.
fields :: Parser [(ByteString, ByteString)]
fields = many field
  where
    field :: Parser (ByteString, ByteString)
    field = do
      name <- "\n" *> take 2 <* " "
      firstLine <- takeTill isEndOfLine
      -- 'lines' is built on 'many', so it already yields [] when there is
      -- no continuation line; no emptiness check is needed. attoparsec
      -- always backtracks on failure, so no 'try' wrapper is needed either.
      continuation <- lines
      pure (translate name, concat ([firstLine] <> continuation))
66
67
-- | Parser for the continuation lines of a field value.
--
-- A continuation line is a newline followed by a space; the remainder
-- of the line is returned. Yields an empty list when the next line is
-- not a continuation.
lines :: Parser [ByteString]
lines = many continuationLine
  where
    continuationLine :: Parser ByteString
    continuationLine = string "\n " *> takeTill isEndOfLine
73
-- | Map a two-letter field tag to a descriptive field name.
--
-- Tags without a known mapping are returned unchanged.
translate :: ByteString -> ByteString
translate tag = case tag of
  "AF" -> "authors"
  "TI" -> "title"
  "SO" -> "source"
  "DI" -> "doi"
  "PD" -> "publication_date"
  "AB" -> "abstract"
  _    -> tag
83 -------------------------------------------------------------
84