]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/WOS.hs
Merge branch 'dev' into dev-ngrams-groups
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / WOS.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.WOS
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Parser for the ISI export format of the Web Of Science (WOS) database
11 ('parser'), together with 'keys', which renames some WOS field tags.
12 -}
13
14
15 module Gargantext.Core.Text.Corpus.Parsers.WOS (parser, keys) where
16
17 import Control.Applicative
18 import Data.Attoparsec.ByteString (Parser, string, takeTill, take, manyTill, many1)
19 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
20 import Data.ByteString (ByteString)
21 import Data.ByteString.Char8 (pack)
22 import Gargantext.Core.Text.Corpus.Parsers.RIS (fieldWith)
23 import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
24
25 -------------------------------------------------------------
-- | Parser for the ISI export format of the Web Of Science database.
--
-- Discards all input up to and including the first occurrence of a newline
-- followed by @VR 1.0@, then reads one or more notices (see 'notice') and
-- finally requires a newline followed by @EF@. The result contains one list
-- of pairs per notice, in input order.
parser :: Parser [[(ByteString, ByteString)]]
parser = preamble *> many1 notice <* trailer
  where
    -- TODO Warning if version /= 1.0
    -- FIXME (carried over from the original): is skipping arbitrary
    -- characters until the marker the same as requiring the exact string
    -- "\nVR 1.0" ?
    preamble :: Parser [Char]
    preamble = manyTill anyChar (string (pack "\nVR 1.0"))

    -- Marker that must follow the last notice.
    trailer :: Parser ByteString
    trailer = string (pack "\nEF")
35
-- | Parse a single notice.
--
-- A notice begins with a newline followed by @PT@ and a space (the rest of
-- that line is read and dropped), continues with zero or more fields read
-- through 'fieldWith', and ends by skipping everything up to and including
-- a newline, @ER@ and another newline. Only the parsed fields are returned.
notice :: Parser [(ByteString, ByteString)]
notice = do
  _ <- recordStart
  fields <- many (fieldWith fieldTag)
  _ <- recordEnd
  pure fields
  where
    -- Opening line of a notice; its remainder up to the end of line is
    -- consumed here and discarded by the caller.
    recordStart :: Parser ByteString
    recordStart = "\nPT " *> takeTill isEndOfLine

    -- A field tag: a newline, exactly two bytes (the tag), then one space.
    fieldTag :: Parser ByteString
    fieldTag = "\n" *> take 2 <* " "

    -- Skips any remaining characters of the notice up to and including
    -- the closing marker.
    recordEnd :: Parser [Char]
    recordEnd = manyTill anyChar (string $ pack "\nER\n")
47
48
-- | Translate a field tag into a descriptive field name.
--
-- The tags @AF@, @TI@, @SO@, @DI@, @PD@ and @AB@ are renamed to
-- @authors@, @title@, @source@, @doi@, @publication_date@ and @abstract@
-- respectively; any other input is returned unchanged.
keys :: ByteString -> ByteString
keys champs = case champs of
  "AF" -> "authors"
  "TI" -> "title"
  "SO" -> "source"
  "DI" -> "doi"
  "PD" -> "publication_date"
  "AB" -> "abstract"
  _    -> champs