2 Module : Gargantext.Text.Corpus.Parsers.RIS.Presse
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Presse RIS format parser for Europresse Database.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
17 module Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich) where
19 import Data.List (lookup)
20 import Data.Either (either)
21 import Data.Tuple.Extra (first, both, uncurry)
22 import Data.Attoparsec.ByteString (parseOnly)
23 import Data.ByteString (ByteString, length)
24 import Gargantext.Prelude hiding (takeWhile, take, length)
25 import Gargantext.Text.Corpus.Parsers.RIS (onField)
26 import Gargantext.Core (Lang(..))
27 import qualified Gargantext.Text.Corpus.Parsers.Date.Attoparsec as Date
31 presseEnrich :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
32 presseEnrich = (onField "DA" parseDate)
33 . (onField "LA" parseLang)
37 parseDate :: ByteString -> [(ByteString, ByteString)]
38 parseDate str = either (const []) identity $ parseOnly (Date.parserWith "/") str
40 parseLang :: ByteString -> [(ByteString, ByteString)]
41 parseLang "Français" = [(langField, cs $ show FR)]
42 parseLang "English" = [(langField, cs $ show EN)]
43 parseLang x = [(langField, x)]
45 langField :: ByteString
46 langField = "language"
49 fixFields :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
50 fixFields ns = map (first fixFields'') ns
52 -- | Title is sometimes longer than abstract
53 fixFields'' = case uncurry (>) <$> look'' of
54 Just True -> fixFields' "abstract" "title"
55 _ -> fixFields' "title" "abstract"
57 look'' :: Maybe (Int, Int)
58 look'' = both length <$> look
60 look :: Maybe (ByteString,ByteString)
61 look = (,) <$> lookup "TI" ns <*> lookup "N2" ns
64 fixFields' :: ByteString -> ByteString
65 -> ByteString -> ByteString
66 fixFields' title abstract champs
67 | champs == "AU" = "authors"
68 | champs == "TI" = title
69 | champs == "JF" = "source"
70 | champs == "DI" = "doi"
71 | champs == "UR" = "url"
72 | champs == "N2" = abstract