]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Corpus/Parsers/WOS.hs
[ELEVE] mainEleve' with a witness corpus
[gargantext.git] / src / Gargantext / Text / Corpus / Parsers / WOS.hs
1 {-|
2 Module : Gargantext.Text.Corpus.Parsers.WOS
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Attoparsec parser for the ISI export format of the Web Of Science
11 database ('parser'), and a mapping from ISI field tags to key names ('keys').
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Text.Corpus.Parsers.WOS (parser, keys) where
18
19 import Control.Applicative
20 import Data.Attoparsec.ByteString (Parser, string, takeTill, take, manyTill, many1)
21 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
22 import Data.ByteString (ByteString)
23 import Data.ByteString.Char8 (pack)
24 import Gargantext.Text.Corpus.Parsers.RIS (fieldWith)
25 import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
26
27 -------------------------------------------------------------
-- | Parser for the ISI export format of the Web Of Science database.
--
-- Skips all input up to and including the version marker @\\nVR 1.0@,
-- then reads one or more records with 'notice', and finally requires the
-- closing marker @\\nEF@.  Each record is returned as the list of pairs
-- produced by 'fieldWith' (presumably field tag and field content --
-- confirm in "Gargantext.Text.Corpus.Parsers.RIS").
--
-- TODO Warning if version /= 1.0
-- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
parser :: Parser [[(ByteString, ByteString)]]
parser = preamble *> many1 notice <* trailer
  where
    -- Discard everything before (and including) the version marker.
    preamble :: Parser [Char]
    preamble = manyTill anyChar (string "\nVR 1.0")

    -- Marker that must follow the last record.
    trailer :: Parser ByteString
    trailer = string "\nEF"
37
-- | Parse a single record (notice).
--
-- A record opens with a @\\nPT @ line whose remainder is read and
-- discarded, followed by zero or more fields parsed by 'fieldWith'.
-- Whatever input is left before the closing @\\nER\\n@ marker is skipped,
-- and the marker itself is consumed.  The markers are matched with a
-- literal @\\n@ newline.
notice :: Parser [(ByteString, ByteString)]
notice = do
  _ <- recordStart
  fields <- many (fieldWith fieldTag)
  _ <- recordEnd
  pure fields
  where
    -- Opening line of a record; the text after "PT " is not kept.
    recordStart :: Parser ByteString
    recordStart = string "\nPT " *> takeTill isEndOfLine

    -- Start of a field line: newline, a two-byte tag, one space.
    -- Yields the tag only.
    fieldTag :: Parser ByteString
    fieldTag = string "\n" *> take 2 <* string " "

    -- Skip ahead to, and consume, the end-of-record marker.
    recordEnd :: Parser [Char]
    recordEnd = manyTill anyChar (string "\nER\n")
49
50
-- | Translate a two-letter field tag into a descriptive key name.
--
-- Tags without an entry below are returned unchanged.
keys :: ByteString -> ByteString
keys "AF" = "authors"
keys "TI" = "title"
keys "SO" = "source"
keys "DI" = "doi"
keys "PD" = "publication_date"
keys "AB" = "abstract"
keys other = other