]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/PubMed.hs
[PARSERS] refactor.
[gargantext.git] / src / Gargantext / Text / Parsers / PubMed.hs
1 {-|
2 Module : Gargantext.Text.Parsers.PubMed
3 Description : Parser for Wikimedia dump
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 This version of Parsers fixes the Date of publication in Gargantext
11 (V3) parser of PubMed. Indeed, we can not rely neither on Journal
12 Publication Date neither on Article publication date, which are
13 incomplete structurally but for its interpretation too. Then, to
14 simplify and uniformize data, date of publication of database insertion
15 is used.
16
17 TODO:
18 - Add authors
19 - factorize
20 -}
21
22 {-# LANGUAGE OverloadedStrings #-}
23 {-# LANGUAGE NoImplicitPrelude #-}
24
25 module Gargantext.Text.Parsers.PubMed where
26
27
28 import Data.Conduit.List as CL hiding (catMaybes, head)
29 import Control.Monad (join)
30 import GHC.IO (FilePath)
31 import Prelude (read)
32 import Gargantext.Prelude
33 import Control.Monad.Catch (MonadThrow)
34 import Data.Maybe (Maybe)
35 import Data.Monoid (mconcat)
36 import Text.XML.Stream.Parse
37 import Data.Conduit (runConduit, (.|), ConduitT)
38 import Data.Text (Text, unpack)
39 import Data.XML.Types (Event)
40 import Data.Time.Segment (jour)
41 import Data.Time (UTCTime(..))
42 import qualified Data.ByteString.Lazy as DBL
43 import Gargantext.Text.Parsers.Wikimedia
44
45
46 data PubMed =
47 PubMed { pubmed_article :: PubMedArticle
48 , pubmed_date :: PubMedData
49 } deriving Show
50
51 data PubMedArticle =
52 PubMedArticle { pubmed_title :: Maybe Text
53 , pubmed_journal :: Maybe Text
54 , pubmed_abstract :: Maybe [Text]
55 }
56 deriving (Show)
57
58 data PubMedData =
59 PubMedData { pubmedData_date :: UTCTime
60 , pubmedData_year :: Integer
61 , pubmedData_month :: Int
62 , pubmedData_day :: Int
63 } deriving (Show)
64
65 readPubMedFile :: FilePath -> IO [PubMed]
66 readPubMedFile fp = do
67 input <- DBL.readFile fp
68 pubMedParser input
69
70 pubMedParser :: DBL.ByteString -> IO [PubMed]
71 pubMedParser bstring = runConduit $ parseLBS def bstring
72 .| parseArticleSet
73 .| CL.consume
74
75 parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
76 parseArticleSet = do
77 as <- force "force" $ tagIgnoreAttrs "PubmedArticleSet" $ manyYield parsePubMedArticle
78 return as
79
80 parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
81 parsePubMedArticle = do
82 articles <- tagIgnoreAttrs "PubmedArticle" parsePubMedArticle'
83 return articles
84
85 parsePubMedArticle' :: MonadThrow m => ConduitT Event o m (PubMed)
86 parsePubMedArticle' = do
87 article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
88 dates <- tagIgnoreAttrs "PubmedData" $ do
89 dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
90 y' <- force "Year" $ tagIgnoreAttrs "Year" content
91 m' <- force "Month" $ tagIgnoreAttrs "Month" content
92 d' <- force "Day" $ tagIgnoreAttrs "Day" content
93 _ <- many $ ignoreAnyTreeContent
94 return (read $ unpack y', read $ unpack m', read $ unpack d')
95 _ <- many $ ignoreAnyTreeContent
96 return dates'
97 _ <- many $ ignoreAnyTreeContent
98 let (y,m,d) = maybe (1,1,1) identity $ join $ fmap head $ reverse <$> join dates
99 return $ PubMed (article) (PubMedData (jour y m d) y m d)
100
101 parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
102 parseMedlineCitation = do
103 a <- force "article" $ manyTagsUntil "Article" parseArticle
104 _ <- many $ ignoreAnyTreeContent
105 return a
106
107 parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
108 parseArticle = do
109 journal <- force "journal" $ manyTagsUntil "Journal" $ do
110 j <- manyTagsUntil "Title" content
111 _ <- many $ ignoreAnyTreeContent
112 return j
113
114 title <- do
115 t <- manyTagsUntil "ArticleTitle" content
116 return t
117
118 abstracts <- do
119 as <- manyTagsUntil "Abstract" $ many $ do
120 txt <- tagIgnoreAttrs "AbstractText" $ do
121 c <- content
122 _ <- many $ ignoreAnyTreeContent
123 return c
124 _ <- many $ ignoreAnyTreeContent
125 return txt
126 return as
127 -- TODO add authos
128
129 _ <- many $ ignoreAnyTreeContent
130 return $ PubMedArticle title journal abstracts
131
132
133 pubMedData :: DBL.ByteString
134 pubMedData = mconcat
135 [ "<?xml version=\"1.0\"?>\n"
136 , "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">\n"
137 , "<PubmedArticleSet>\n"
138 , "<PubmedArticle>\n"
139 , " <MedlineCitation Status=\"Publisher\" Owner=\"NLM\">\n"
140 , " <PMID Version=\"1\">30357468</PMID>\n"
141 , " <DateRevised>\n"
142 , " <Year>2018</Year>\n"
143 , " </DateRevised>\n"
144 , " <Article PubModel=\"Print-Electronic\">\n"
145 , " <Journal>\n"
146 , " <ISSN IssnType=\"Electronic\">1432-1076</ISSN>\n"
147 , " <Title>European journal of pediatrics</Title>\n"
148 , " </Journal>\n"
149 , " <ArticleTitle>Title of the Article</ArticleTitle>\n"
150 , " <ELocationID EIdType=\"doi\" ValidYN=\"Y\">10.1007/s00431-018-3270-3</ELocationID>\n"
151 , " <Abstract>\n"
152 , " <AbstractText>Abstract Text.</AbstractText>\n"
153 , " </Abstract>\n"
154 , " <AuthorList>\n"
155 , " </AuthorList>\n"
156 , " </Article>\n"
157 , " </MedlineCitation>\n"
158 , " <PubmedData>\n"
159 , " <History>\n"
160 , " </History>\n"
161 , " </PubmedData>\n"
162 , "</PubmedArticle>\n"
163 , "</PubmedArticleSet>\n"
164 ]
165
166
167