2 Module : Gargantext.Text.Parsers.PubMed
3 Description : Parser for PubMed XML dump
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 This version of the parser fixes the date of publication in the Gargantext
11 (V3) parser of PubMed. Indeed, we can rely neither on the Journal
12 publication date nor on the Article publication date, which are
13 incomplete both structurally and in their interpretation. Then, to
14 simplify and standardize the data, the date of publication of database insertion
22 {-# LANGUAGE OverloadedStrings #-}
23 {-# LANGUAGE NoImplicitPrelude #-}
25 module Gargantext.Text.Parsers.PubMed where
28 import Data.Conduit.List as CL hiding (catMaybes, head)
29 import Control.Monad (join)
30 import GHC.IO (FilePath)
32 import Gargantext.Prelude
33 import Control.Monad.Catch (MonadThrow)
34 import Data.Maybe (Maybe)
35 import Data.Monoid (mconcat)
36 import Text.XML.Stream.Parse
37 import Data.Conduit (runConduit, (.|), ConduitT)
38 import Data.Text (Text, unpack)
39 import Data.XML.Types (Event)
40 import Data.Time.Segment (jour)
41 import Data.Time (UTCTime(..))
42 import qualified Data.ByteString.Lazy as DBL
43 import Gargantext.Text.Parsers.Wikimedia
-- One parsed PubMed entry: the article content paired with its date.
-- NOTE(review): the `data` head and the closing brace of this record are
-- not visible in this excerpt.
--
-- pubmed_article: the article content (see PubMedArticle).
47 PubMed { pubmed_article :: PubMedArticle
-- pubmed_date: the date kept for the entry; parsePubMedArticle' builds it
-- from the PubMedPubDate entries found under PubmedData/History.
48 , pubmed_date :: PubMedData
-- Textual content extracted from an <Article> element by parseArticle.
-- Every field is optional (Maybe).
-- NOTE(review): the `data` head and the closing brace of this record are
-- not visible in this excerpt.
--
-- pubmed_title: presumably the content of <ArticleTitle> -- confirm against
-- the full parseArticle body.
52 PubMedArticle { pubmed_title :: Maybe Text
-- pubmed_journal: presumably the content of <Journal>/<Title> -- confirm.
53 , pubmed_journal :: Maybe Text
-- pubmed_abstract: presumably one Text per <AbstractText> found under
-- <Abstract> -- confirm.
54 , pubmed_abstract :: Maybe [Text]
-- Date attached to a PubMed entry, stored both as a UTCTime and as its
-- separate year / month / day components.
-- NOTE(review): the `data` head and the closing brace of this record are
-- not visible in this excerpt.
--
-- pubmedData_date: built with `jour y m d` in parsePubMedArticle'.
59 PubMedData { pubmedData_date :: UTCTime
-- pubmedData_year: year component (Integer).
60 , pubmedData_year :: Integer
-- pubmedData_month: month component.
61 , pubmedData_month :: Int
-- pubmedData_day: day component.
62 , pubmedData_day :: Int
-- Read the file at the given path and return the PubMed entries it contains.
-- The file is loaded as a lazy ByteString (Data.ByteString.Lazy.readFile).
-- NOTE(review): the rest of the body is not visible in this excerpt;
-- presumably it hands `input` to pubMedParser -- confirm.
65 readPubMedFile :: FilePath -> IO [PubMed]
66 readPubMedFile fp = do
67 input <- DBL.readFile fp
-- Parse a lazy ByteString holding a PubMed XML document into PubMed entries.
-- The conduit pipeline starts from `parseLBS def bstring`, i.e. the input is
-- fed to the streaming XML parser with `def` as its settings.
-- NOTE(review): the downstream stages of the pipeline are not visible in
-- this excerpt; presumably parseArticleSet followed by a sink that collects
-- the results into a list -- confirm.
70 pubMedParser :: DBL.ByteString -> IO [PubMed]
71 pubMedParser bstring = runConduit $ parseLBS def bstring
-- Conduit stage turning a stream of XML events into a stream of PubMed values.
-- A <PubmedArticleSet> element is mandatory (`force`); its attributes are
-- ignored, and parsePubMedArticle is run repeatedly inside it through
-- `manyYield`, so the PubMed values are emitted downstream (the output type
-- of this conduit is PubMed).
-- NOTE(review): the `parseArticleSet = do` line and whatever follows the
-- binding of `as` are not visible in this excerpt.
75 parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
77 as <- force "force" $ tagIgnoreAttrs "PubmedArticleSet" $ manyYield parsePubMedArticle
-- Try to parse one <PubmedArticle> element (attributes ignored), delegating
-- its contents to parsePubMedArticle'.
-- The result is wrapped in Maybe; presumably Nothing means the next element
-- is not a <PubmedArticle> -- confirm against the tagIgnoreAttrs documentation.
-- NOTE(review): the final statement returning `articles` is not visible in
-- this excerpt.
80 parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
81 parsePubMedArticle = do
82 articles <- tagIgnoreAttrs "PubmedArticle" parsePubMedArticle'
-- Parse the contents of one <PubmedArticle>: the mandatory <MedlineCitation>
-- (article content) followed by the optional <PubmedData> (history dates),
-- and combine them into a PubMed value.
-- NOTE(review): at least one line of the original body (between the two
-- consecutive "ignore the rest" statements below) is not visible in this
-- excerpt; the comments describe only what is shown.
85 parsePubMedArticle' :: MonadThrow m => ConduitT Event o m (PubMed)
86 parsePubMedArticle' = do
-- <MedlineCitation> is required: `force` is given the message "MedlineCitation".
87 article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
-- <PubmedData> is optional; inside it, an optional <History> holds any number
-- of <PubMedPubDate> entries.
88 dates <- tagIgnoreAttrs "PubmedData" $ do
89 dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
-- Each <PubMedPubDate> must provide <Year>, <Month> and <Day>.
90 y' <- force "Year" $ tagIgnoreAttrs "Year" content
91 m' <- force "Month" $ tagIgnoreAttrs "Month" content
92 d' <- force "Day" $ tagIgnoreAttrs "Day" content
-- Skip whatever else the <PubMedPubDate> element contains.
93 _ <- many $ ignoreAnyTreeContent
-- NOTE(review): if this is the standard `read`, it fails at runtime on text
-- that is not a valid number (e.g. a month given as a name) -- confirm the
-- input guarantees or switch to a total parse.
94 return (read $ unpack y', read $ unpack m', read $ unpack d')
-- Skip the remaining children (of <PubmedData>, then presumably of
-- <PubmedArticle>; the line separating the two statements is not visible).
95 _ <- many $ ignoreAnyTreeContent
97 _ <- many $ ignoreAnyTreeContent
-- Choose the date to keep: the parsed list is reversed and its first element
-- taken, so the last <PubMedPubDate> entry in document order wins; when no
-- date is available the fallback is (1,1,1).
-- NOTE(review): `head` is used under fmap/join here, so it presumably comes
-- from Gargantext.Prelude and returns a Maybe -- confirm.
98 let (y,m,d) = maybe (1,1,1) identity $ join $ fmap head $ reverse <$> join dates
-- The same y/m/d triple feeds both the UTCTime (`jour y m d`) and the
-- separate year/month/day fields of PubMedData.
99 return $ PubMed (article) (PubMedData (jour y m d) y m d)
-- Parse the contents of a <MedlineCitation> element into a PubMedArticle.
-- An <Article> element is mandatory (`force` with the message "article"); it
-- is located with `manyTagsUntil` and parsed by parseArticle. Whatever
-- remains in the citation afterwards is ignored.
-- NOTE(review): `manyTagsUntil` is not defined in this excerpt (presumably
-- imported from Gargantext.Text.Parsers.Wikimedia, and presumably skipping
-- sibling elements until the named tag is met -- confirm); the final
-- statement returning `a` is not visible either.
101 parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
102 parseMedlineCitation = do
103 a <- force "article" $ manyTagsUntil "Article" parseArticle
104 _ <- many $ ignoreAnyTreeContent
-- Parse the contents of an <Article> element into a PubMedArticle made of
-- the journal title, the article title and the abstract texts.
-- NOTE(review): several lines of the original body are not visible in this
-- excerpt (the `parseArticle = do` line, the inner `return`s, and the
-- bindings of `title` and `abstracts` used in the last line); the comments
-- describe only what is shown.
107 parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
-- <Journal> is mandatory (`force` with the message "journal"); inside it the
-- content of <Title> is read and the rest of the element is skipped.
109 journal <- force "journal" $ manyTagsUntil "Journal" $ do
110 j <- manyTagsUntil "Title" content
111 _ <- many $ ignoreAnyTreeContent
-- Content of <ArticleTitle>.
115 t <- manyTagsUntil "ArticleTitle" content
-- <Abstract>: zero or more <AbstractText> children are parsed (`many`).
119 as <- manyTagsUntil "Abstract" $ many $ do
120 txt <- tagIgnoreAttrs "AbstractText" $ do
-- The next two statements skip remaining content at two nesting levels; the
-- lines between them are not visible in this excerpt.
122 _ <- many $ ignoreAnyTreeContent
124 _ <- many $ ignoreAnyTreeContent
-- Skip whatever is left in <Article> before building the result.
129 _ <- many $ ignoreAnyTreeContent
130 return $ PubMedArticle title journal abstracts
133 pubMedData :: DBL.ByteString
135 [ "<?xml version=\"1.0\"?>\n"
136 , "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">\n"
137 , "<PubmedArticleSet>\n"
138 , "<PubmedArticle>\n"
139 , " <MedlineCitation Status=\"Publisher\" Owner=\"NLM\">\n"
140 , " <PMID Version=\"1\">30357468</PMID>\n"
142 , " <Year>2018</Year>\n"
143 , " </DateRevised>\n"
144 , " <Article PubModel=\"Print-Electronic\">\n"
146 , " <ISSN IssnType=\"Electronic\">1432-1076</ISSN>\n"
147 , " <Title>European journal of pediatrics</Title>\n"
149 , " <ArticleTitle>Title of the Article</ArticleTitle>\n"
150 , " <ELocationID EIdType=\"doi\" ValidYN=\"Y\">10.1007/s00431-018-3270-3</ELocationID>\n"
152 , " <AbstractText>Abstract Text.</AbstractText>\n"
157 , " </MedlineCitation>\n"
162 , "</PubmedArticle>\n"
163 , "</PubmedArticleSet>\n"