]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Corpus/Parsers/RIS/Presse.hs
remove groups from time matching and work only on ngrams and ids
[gargantext.git] / src / Gargantext / Text / Corpus / Parsers / RIS / Presse.hs
1 {-|
2 Module : Gargantext.Text.Corpus.Parsers.RIS.Presse
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Presse RIS format parser for Europresse Database.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich) where
18
19 import Data.List (lookup)
20 import Data.Either (either)
21 import Data.Tuple.Extra (first, both, uncurry)
22 import Data.Attoparsec.ByteString (parseOnly)
23 import Data.ByteString (ByteString, length)
24 import Gargantext.Prelude hiding (takeWhile, take, length)
25 import Gargantext.Text.Corpus.Parsers.RIS (onField)
26 import Gargantext.Core (Lang(..))
27 import qualified Gargantext.Text.Corpus.Parsers.Date.Attoparsec as Date
28
29
30
31 presseEnrich :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
32 presseEnrich = (onField "DA" parseDate)
33 . (onField "LA" parseLang)
34 . fixFields
35
36
37 parseDate :: ByteString -> [(ByteString, ByteString)]
38 parseDate str = either (const []) identity $ parseOnly (Date.parserWith "/") str
39
40 parseLang :: ByteString -> [(ByteString, ByteString)]
41 parseLang "Français" = [(langField, cs $ show FR)]
42 parseLang "English" = [(langField, cs $ show EN)]
43 parseLang x = [(langField, x)]
44
45 langField :: ByteString
46 langField = "language"
47
48
49 fixFields :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
50 fixFields ns = map (first fixFields'') ns
51 where
52 -- | Title is sometimes longer than abstract
53 fixFields'' = case uncurry (>) <$> look'' of
54 Just True -> fixFields' "abstract" "title"
55 _ -> fixFields' "title" "abstract"
56
57 look'' :: Maybe (Int, Int)
58 look'' = both length <$> look
59
60 look :: Maybe (ByteString,ByteString)
61 look = (,) <$> lookup "TI" ns <*> lookup "N2" ns
62
63
64 fixFields' :: ByteString -> ByteString
65 -> ByteString -> ByteString
66 fixFields' title abstract champs
67 | champs == "AU" = "authors"
68 | champs == "TI" = title
69 | champs == "JF" = "source"
70 | champs == "DI" = "doi"
71 | champs == "UR" = "url"
72 | champs == "N2" = abstract
73 | otherwise = champs
74
75