]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/RIS.hs
[FEAT] Parsers RIS && Presse.
[gargantext.git] / src / Gargantext / Text / Parsers / RIS.hs
1 {-|
2 Module : Gargantext.Text.Parsers.RIS
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 RIS is a standardized tag format developed by Research Information
12 Systems, Incorporated (the format name refers to the company) to enable
13 citation programs to exchange data.
14
15 [More](https://en.wikipedia.org/wiki/RIS_(file_format))
16
17 -}
18
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21
22 module Gargantext.Text.Parsers.RIS (risParser, risDate, toDate, presseParser) where
23
24 import Data.Either (either)
25 import Data.List (lookup)
26 import Data.Tuple.Extra (first)
27 import Control.Applicative
28 import Data.Attoparsec.ByteString (Parser, try, string, takeTill, take, manyTill, many1, endOfInput, parseOnly)
29 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
30 import Data.ByteString (ByteString, concat, length)
31 import Data.ByteString.Char8 (pack)
32 import Data.Monoid ((<>))
33 import Gargantext.Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
34 import qualified Data.List as DL
35 -------------------------------------------------------------
36
37 risParser :: Parser [[(ByteString, ByteString)]]
38 risParser = do
39 n <- notice "TY -"
40 ns <- many1 (notice "\nTY -")
41 pure $ [n] <> ns
42
43 notice :: Parser ByteString -> Parser [(ByteString, ByteString)]
44 notice s = start *> many field <* end
45 where
46 start :: Parser ByteString
47 start = s *> takeTill isEndOfLine
48
49 end :: Parser ByteString
50 end = "\nER -" *> takeTill isEndOfLine
51
52 field :: Parser (ByteString, ByteString)
53 field = do
54 name <- "\n" *> take 2 <* " - "
55 txt <- takeTill isEndOfLine
56 txts <- try lines
57 let txts' = case DL.length txts > 0 of
58 True -> txts
59 False -> []
60 pure (translate name, concat ([txt] <> txts'))
61
62 lines :: Parser [ByteString]
63 lines = many line
64 where
65 line :: Parser ByteString
66 line = "\n\n" *> takeTill isEndOfLine
67
68 translate :: ByteString -> ByteString
69 translate champs
70 | champs == "AU" = "authors"
71 | champs == "TI" = "title"
72 | champs == "JF" = "source"
73 | champs == "LA" = "language"
74 | champs == "DI" = "doi"
75 | champs == "UR" = "url"
76 | champs == "N2" = "abstract"
77 | otherwise = champs
78 -------------------------------------------------------------
79
80 presseParser :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
81 presseParser = (toDate "DA" (\x -> either (const []) identity $ parseOnly risDate x))
82 . (toDate "LA" presseLang)
83
84 risDate :: Parser [(ByteString, ByteString)]
85 risDate = do
86 day <- take 2 <* "/"
87 mon <- take 2 <* "/"
88 yea <- take 4
89 pure $ map (first (\x -> "publication_" <> x))
90 [ ("day",day)
91 , ("month", mon)
92 , ("year", yea)
93 , ("date", yea <> "-" <> mon <> "-" <> day <> "T0:0:0")
94 ]
95
96 toDate :: ByteString -> (ByteString -> [(ByteString, ByteString)])
97 -> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
98 toDate k f m = m <> ( maybe [] f (lookup k m) )
99
100 presseLang :: ByteString -> [(ByteString, ByteString)]
101 presseLang "Français" = [("language", "FR")]
102 presseLang "English" = [("langauge", "EN")]
103 presseLang _ = undefined
104
105 {-
106 fixTitle :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
107 fixTitle ns = ns <> [ti, ab]
108 where
109 ti = case
110 -}