2 Module : Gargantext.Text.Corpus.Parsers.RIS.Presse
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Presse RIS format parser for Europresse Database.
15 module Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich) where
17 import Data.List (lookup)
18 import Data.Either (either)
19 import Data.Tuple.Extra (first, both, uncurry)
20 import Data.Attoparsec.ByteString (parseOnly)
21 import Data.ByteString (ByteString, length)
22 import Gargantext.Prelude hiding (takeWhile, take, length)
23 import Gargantext.Text.Corpus.Parsers.RIS (onField)
24 import Gargantext.Core (Lang(..))
25 import qualified Gargantext.Text.Corpus.Parsers.Date.Attoparsec as Date
-- | Enrich the key/value pairs of one Europresse RIS record.
--
-- The record is first passed through @onField "LA" parseLang@ and the
-- result is then passed through @onField "DA" parseDate@.
--
-- NOTE(review): 'fixFields' is defined in this module but is neither
-- exported nor part of this pipeline -- confirm whether that is intended.
presseEnrich :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
presseEnrich fields = onField "DA" parseDate withLanguage
  where
    -- Language handling runs before date handling.
    withLanguage = onField "LA" parseLang fields
-- | Parse the raw value of a date field with @Date.parserWith "/"@.
--
-- Returns the pairs produced by the parser, or an empty list when the
-- input does not parse (the parse error itself is discarded).
parseDate :: ByteString -> [(ByteString, ByteString)]
parseDate str = either onFailure onSuccess (parseOnly (Date.parserWith "/") str)
  where
    -- A date that cannot be parsed contributes no pair at all.
    onFailure _ = []
    onSuccess parsed = parsed
-- | Normalise the raw value of a language field.
--
-- The result is always a single pair keyed by 'langField'. The labels
-- @Français@ and @English@ are replaced by the rendering ('show') of 'FR'
-- and 'EN' respectively; any other value is kept unchanged.
--
-- A 'ByteString' string literal keeps only the low 8 bits of every
-- character, so a literal spelled @"Français"@ denotes the Latin-1 bytes
-- only. Both the Latin-1 and the UTF-8 byte sequences are therefore matched
-- explicitly, so the label is recognised whichever encoding the input uses.
parseLang :: ByteString -> [(ByteString, ByteString)]
parseLang "Fran\231ais"     = [(langField, cs $ show FR)] -- Latin-1 bytes
parseLang "Fran\195\167ais" = [(langField, cs $ show FR)] -- UTF-8 bytes
parseLang "English"         = [(langField, cs $ show EN)]
parseLang x                 = [(langField, x)]
-- | Key under which 'parseLang' stores the language of a record.
langField :: ByteString
langField = "language"
-- | Rename the key of every pair of a record using 'fixFields''.
--
-- When the record contains both a @TI@ and an @N2@ entry (the first
-- occurrence of each, as found by 'lookup') and the @TI@ value is strictly
-- longer, in bytes, than the @N2@ value, the two names are swapped: @TI@
-- becomes @"abstract"@ and @N2@ becomes @"title"@. In every other case
-- @TI@ becomes @"title"@ and @N2@ becomes @"abstract"@.
fixFields :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
fixFields ns = map (first renameKey) ns
  where
    -- The title is sometimes longer than the abstract; when it is, the two
    -- names are exchanged.
    renameKey = case titleIsLonger of
      Just True -> fixFields' "abstract" "title"
      _         -> fixFields' "title" "abstract"

    -- Empty when either of the two entries is missing from the record.
    titleIsLonger :: Maybe Bool
    titleIsLonger = uncurry (>) . both length <$> titleAndAbstract

    titleAndAbstract :: Maybe (ByteString, ByteString)
    titleAndAbstract = (,) <$> lookup "TI" ns <*> lookup "N2" ns
-- | Translate a RIS key into the field name used downstream.
--
-- The first argument is the name given to the @TI@ key and the second the
-- name given to the @N2@ key; the third argument (@champs@, i.e. the field
-- key) is the key to translate. @AU@, @JF@, @DI@ and @UR@ map to
-- @"authors"@, @"source"@, @"doi"@ and @"url"@.
--
-- Any other key is returned unchanged, so the function is total instead of
-- failing at run time on keys it does not know.
fixFields' :: ByteString -> ByteString
           -> ByteString -> ByteString
fixFields' title abstract champs
  | champs == "AU" = "authors"
  | champs == "TI" = title
  | champs == "JF" = "source"
  | champs == "DI" = "doi"
  | champs == "UR" = "url"
  | champs == "N2" = abstract
  | otherwise      = champs -- unknown keys pass through untouched