]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/PubMed.hs
[TEXT][PARSER][PUBMED] PubDate or ArticleDate are not reliable.
[gargantext.git] / src / Gargantext / Text / Parsers / PubMed.hs
1 {-|
2 Module : Gargantext.Text.Parsers.PubMed
3 Description : Parser for Wikimedia dump
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -}
12
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE NoImplicitPrelude #-}
15
16 module Gargantext.Text.Parsers.PubMed where
17
18
19
20 import Control.Monad (void)
21 import Data.Conduit.List as CL hiding (catMaybes)
22 import Control.Monad (join)
23 import GHC.IO (FilePath)
24 import Prelude (read, print)
25 import Gargantext.Prelude
26 import Control.Applicative ((<*))
27 import Control.Monad.Catch (MonadThrow)
28 import Data.Maybe (Maybe, catMaybes)
29 import Data.Monoid (mconcat)
30 import Text.XML.Stream.Parse
31 import Data.Conduit (runConduit, (.|), ConduitT)
32 import Data.Text (Text, unpack, concat)
33 import Data.XML.Types (Event)
34 import Data.ByteString (ByteString)
35 import Data.Time.Segment (jour)
36 import Data.Time (UTCTime(..))
37 import qualified Data.ByteString.Lazy as DBL
38 import Gargantext.Text.Parsers.Wikimedia
39
40
41 data PubMedArticle =
42 PubMedArticle { pubmed_title :: Maybe Text
43 , pubmed_journal :: Maybe Text
44 , pubmed_abstract :: Maybe [Text]
45 , pubmed_date :: UTCTime
46 , pubmed_year :: Integer
47 , pubmed_month :: Int
48 , pubmed_day :: Int
49 }
50 deriving (Show)
51
52 readPubMedFile :: FilePath -> IO ()
53 readPubMedFile fp = do
54 input <- DBL.readFile fp
55 pubMedParser input
56
57
58 pubMedParser :: DBL.ByteString -> IO ()
59 pubMedParser bstring = runConduit $ parseLBS def bstring
60 .| parseArticleSet
61 .| CL.mapM_ print
62
63 --parseArticleSet :: MonadThrow m => ConduitT Event o m [PubMedArticle]
64 parseArticleSet = do
65 as <- force "force" $ tagIgnoreAttrs "PubmedArticleSet" $ manyYield parsePubMedArticle
66 -- _ <- many $ ignoreAnyTreeContent
67 return as
68
69 parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMedArticle)
70 parsePubMedArticle = do
71 articles <- force "PubmedArticle" $ tagIgnoreAttrs "PubmedArticle" parsePubMedArticle'
72 --_ <- many $ ignoreAnyTreeContent
73 return articles
74
75 parsePubMedArticle' :: MonadThrow m => ConduitT Event o m (Maybe PubMedArticle)
76 parsePubMedArticle' = do
77 pubmed_article <- tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
78 --_ <- tagIgnoreAttrs "PubmedData" content
79 _ <- many $ ignoreAnyTreeContent
80 return pubmed_article
81
82 parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
83 parseMedlineCitation = do
84 a <- force "article" $ manyTagsUntil "Article" parseArticle
85 _ <- many $ ignoreAnyTreeContent
86 return a
87
88 parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
89 parseArticle = do
90 (journal,maybePubDate) <- force "journal" $ manyTagsUntil "Journal" $ do
91 maybePubDate' <- manyTagsUntil "JournalIssue" $ do
92 maybePubDate'' <- manyTagsUntil "PubDate" $ do
93 y <- tagIgnoreAttrs "Year" content
94 m <- tagIgnoreAttrs "Month" content
95 d <- tagIgnoreAttrs "Day" content
96 return (y, m, d)
97 return maybePubDate''
98
99 j <- manyTagsUntil "Title" content
100 _ <- many $ ignoreAnyTreeContent
101 return (j,join maybePubDate')
102
103 title <- do
104 t <- manyTagsUntil "ArticleTitle" content
105 return t
106
107 abstracts <- do
108 as <- manyTagsUntil "Abstract" $ many $ do
109 txt <- tagIgnoreAttrs "AbstractText" $ do
110 c <- content
111 _ <- many $ ignoreAnyTreeContent
112 return c
113 _ <- many $ ignoreAnyTreeContent
114 return txt
115 return as
116 -- TODO add authos
117
118 (year, month, day) <- case maybePubDate of
119 Nothing -> force "ArticleDate" $ manyTagsUntil "ArticleDate" $ do
120 y <- force "Year" $ tagIgnoreAttrs "Year" content
121 m <- force "Month" $ tagIgnoreAttrs "Month" content
122 d <- force "Day" $ tagIgnoreAttrs "Day" content
123 return (read $ unpack y, read $ unpack m, read $ unpack d)
124 Just (Just y, Just m, Just d) -> return (read $ unpack "1", read $ unpack "3", read $ unpack "3")
125 _ -> panic "error date"
126
127 _ <- many $ ignoreAnyTreeContent
128 return $ PubMedArticle title journal abstracts (jour year month day) year month day
129
130
131 pubMedData :: DBL.ByteString
132 pubMedData = mconcat
133 [ "<?xml version=\"1.0\"?>\n"
134 , "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">\n"
135 , "<PubmedArticleSet>\n"
136 , "<PubmedArticle>\n"
137 , " <MedlineCitation Status=\"Publisher\" Owner=\"NLM\">\n"
138 , " <PMID Version=\"1\">30357468</PMID>\n"
139 , " <DateRevised>\n"
140 , " <Year>2018</Year>\n"
141 , " </DateRevised>\n"
142 , " <Article PubModel=\"Print-Electronic\">\n"
143 , " <Journal>\n"
144 , " <ISSN IssnType=\"Electronic\">1432-1076</ISSN>\n"
145 , " <Title>European journal of pediatrics</Title>\n"
146 , " </Journal>\n"
147 , " <ArticleTitle>Title of the Article</ArticleTitle>\n"
148 , " <ELocationID EIdType=\"doi\" ValidYN=\"Y\">10.1007/s00431-018-3270-3</ELocationID>\n"
149 , " <Abstract>\n"
150 , " <AbstractText>Abstract Text.</AbstractText>\n"
151 , " </Abstract>\n"
152 , " <AuthorList>\n"
153 , " </AuthorList>\n"
154 , " </Article>\n"
155 , " </MedlineCitation>\n"
156 , " <PubmedData>\n"
157 , " <History>\n"
158 , " </History>\n"
159 , " </PubmedData>\n"
160 , "</PubmedArticle>\n"
161 , "</PubmedArticleSet>\n"
162 ]
163
164
165