2 Module : Gargantext.Text.Parsers.PubMed
3 Description : Parser for Wikimedia dump
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE NoImplicitPrelude #-}
16 module Gargantext.Text.Parsers.PubMed where
20 import Control.Monad (void)
21 import Data.Conduit.List as CL hiding (catMaybes)
22 import Control.Monad (join)
23 import GHC.IO (FilePath)
24 import Prelude (read, print)
25 import Gargantext.Prelude
26 import Control.Applicative ((<*))
27 import Control.Monad.Catch (MonadThrow)
28 import Data.Maybe (Maybe, catMaybes)
29 import Data.Monoid (mconcat)
30 import Text.XML.Stream.Parse
31 import Data.Conduit (runConduit, (.|), ConduitT)
32 import Data.Text (Text, unpack, concat)
33 import Data.XML.Types (Event)
34 import Data.ByteString (ByteString)
35 import Data.Time.Segment (jour)
36 import Data.Time (UTCTime(..))
37 import qualified Data.ByteString.Lazy as DBL
38 import Gargantext.Text.Parsers.Wikimedia
42 PubMedArticle { pubmed_title :: Maybe Text
43 , pubmed_journal :: Maybe Text
44 , pubmed_abstract :: Maybe [Text]
45 , pubmed_date :: UTCTime
46 , pubmed_year :: Integer
52 readPubMedFile :: FilePath -> IO ()
53 readPubMedFile fp = do
54 input <- DBL.readFile fp
58 pubMedParser :: DBL.ByteString -> IO ()
59 pubMedParser bstring = runConduit $ parseLBS def bstring
63 --parseArticleSet :: MonadThrow m => ConduitT Event o m [PubMedArticle]
65 as <- force "force" $ tagIgnoreAttrs "PubmedArticleSet" $ manyYield parsePubMedArticle
66 -- _ <- many $ ignoreAnyTreeContent
69 parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMedArticle)
70 parsePubMedArticle = do
71 articles <- force "PubmedArticle" $ tagIgnoreAttrs "PubmedArticle" parsePubMedArticle'
72 --_ <- many $ ignoreAnyTreeContent
75 parsePubMedArticle' :: MonadThrow m => ConduitT Event o m (Maybe PubMedArticle)
76 parsePubMedArticle' = do
77 pubmed_article <- tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
78 --_ <- tagIgnoreAttrs "PubmedData" content
79 _ <- many $ ignoreAnyTreeContent
82 parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
83 parseMedlineCitation = do
84 a <- force "article" $ manyTagsUntil "Article" parseArticle
85 _ <- many $ ignoreAnyTreeContent
88 parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
90 (journal,maybePubDate) <- force "journal" $ manyTagsUntil "Journal" $ do
91 maybePubDate' <- manyTagsUntil "JournalIssue" $ do
92 maybePubDate'' <- manyTagsUntil "PubDate" $ do
93 y <- tagIgnoreAttrs "Year" content
94 m <- tagIgnoreAttrs "Month" content
95 d <- tagIgnoreAttrs "Day" content
99 j <- manyTagsUntil "Title" content
100 _ <- many $ ignoreAnyTreeContent
101 return (j,join maybePubDate')
104 t <- manyTagsUntil "ArticleTitle" content
108 as <- manyTagsUntil "Abstract" $ many $ do
109 txt <- tagIgnoreAttrs "AbstractText" $ do
111 _ <- many $ ignoreAnyTreeContent
113 _ <- many $ ignoreAnyTreeContent
118 (year, month, day) <- case maybePubDate of
119 Nothing -> force "ArticleDate" $ manyTagsUntil "ArticleDate" $ do
120 y <- force "Year" $ tagIgnoreAttrs "Year" content
121 m <- force "Month" $ tagIgnoreAttrs "Month" content
122 d <- force "Day" $ tagIgnoreAttrs "Day" content
123 return (read $ unpack y, read $ unpack m, read $ unpack d)
124 Just (Just y, Just m, Just d) -> return (read $ unpack "1", read $ unpack "3", read $ unpack "3")
125 _ -> panic "error date"
127 _ <- many $ ignoreAnyTreeContent
128 return $ PubMedArticle title journal abstracts (jour year month day) year month day
131 pubMedData :: DBL.ByteString
133 [ "<?xml version=\"1.0\"?>\n"
134 , "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">\n"
135 , "<PubmedArticleSet>\n"
136 , "<PubmedArticle>\n"
137 , " <MedlineCitation Status=\"Publisher\" Owner=\"NLM\">\n"
138 , " <PMID Version=\"1\">30357468</PMID>\n"
140 , " <Year>2018</Year>\n"
141 , " </DateRevised>\n"
142 , " <Article PubModel=\"Print-Electronic\">\n"
144 , " <ISSN IssnType=\"Electronic\">1432-1076</ISSN>\n"
145 , " <Title>European journal of pediatrics</Title>\n"
147 , " <ArticleTitle>Title of the Article</ArticleTitle>\n"
148 , " <ELocationID EIdType=\"doi\" ValidYN=\"Y\">10.1007/s00431-018-3270-3</ELocationID>\n"
150 , " <AbstractText>Abstract Text.</AbstractText>\n"
155 , " </MedlineCitation>\n"
160 , "</PubmedArticle>\n"
161 , "</PubmedArticleSet>\n"