{-| Module : Gargantext.Text.Parsers.PubMed Description : Parser for Wikimedia dump Copyright : (c) CNRS, 2017-Present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX This version of Parsers fixes the Date of publication in Gargantext (V3) parser of PubMed. Indeed, we can not rely neither on Journal Publication Date neither on Article publication date, which are incomplete structurally but for its interpretation too. Then, to simplify and uniformize data, date of publication of database insertion is used. TODO: - Add authors - factorize -} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE NoImplicitPrelude #-} module Gargantext.Text.Parsers.PubMed where import Data.Conduit.List as CL hiding (catMaybes, head) import Control.Monad (join) import GHC.IO (FilePath) import Prelude (read) import Gargantext.Prelude import Control.Monad.Catch (MonadThrow) import Data.Maybe (Maybe) import Data.Monoid (mconcat) import Text.XML.Stream.Parse import Data.Conduit (runConduit, (.|), ConduitT) import Data.Text (Text, unpack) import Data.XML.Types (Event) import Data.Time.Segment (jour) import Data.Time (UTCTime(..)) import qualified Data.ByteString.Lazy as DBL import Gargantext.Text.Parsers.Wikimedia data PubMed = PubMed { pubmed_article :: PubMedArticle , pubmed_date :: PubMedData } deriving Show data PubMedArticle = PubMedArticle { pubmed_title :: Maybe Text , pubmed_journal :: Maybe Text , pubmed_abstract :: Maybe [Text] } deriving (Show) data PubMedData = PubMedData { pubmedData_date :: UTCTime , pubmedData_year :: Integer , pubmedData_month :: Int , pubmedData_day :: Int } deriving (Show) readPubMedFile :: FilePath -> IO [PubMed] readPubMedFile fp = do input <- DBL.readFile fp pubMedParser input pubMedParser :: DBL.ByteString -> IO [PubMed] pubMedParser bstring = runConduit $ parseLBS def bstring .| parseArticleSet .| CL.consume parseArticleSet :: MonadThrow m => ConduitT Event PubMed m () parseArticleSet = do as <- force "force" $ tagIgnoreAttrs "PubmedArticleSet" $ manyYield parsePubMedArticle return as parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed) parsePubMedArticle = do articles <- tagIgnoreAttrs "PubmedArticle" parsePubMedArticle' return articles parsePubMedArticle' :: MonadThrow m => ConduitT Event o m (PubMed) parsePubMedArticle' = do article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation dates <- tagIgnoreAttrs "PubmedData" $ do dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do y' <- force "Year" $ tagIgnoreAttrs "Year" content m' <- force "Month" $ tagIgnoreAttrs "Month" content d' <- force "Day" $ tagIgnoreAttrs "Day" content _ <- many $ ignoreAnyTreeContent return (read $ unpack y', read $ unpack m', read $ unpack d') _ <- many $ ignoreAnyTreeContent return dates' _ <- many $ ignoreAnyTreeContent let (y,m,d) = maybe (1,1,1) identity $ join $ fmap head $ reverse <$> join dates return $ PubMed (article) (PubMedData (jour y m d) y m d) parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle parseMedlineCitation = do a <- force "article" $ manyTagsUntil "Article" parseArticle _ <- many $ ignoreAnyTreeContent return a parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle parseArticle = do journal <- force "journal" $ manyTagsUntil "Journal" $ do j <- manyTagsUntil "Title" content _ <- many $ ignoreAnyTreeContent return j title <- do t <- manyTagsUntil "ArticleTitle" content return t abstracts <- do as <- manyTagsUntil "Abstract" $ many $ do txt <- tagIgnoreAttrs "AbstractText" $ do c <- content _ <- many $ ignoreAnyTreeContent return c _ <- many $ ignoreAnyTreeContent return txt return as -- TODO add authos _ <- many $ ignoreAnyTreeContent return $ PubMedArticle title journal abstracts pubMedData :: DBL.ByteString pubMedData = mconcat [ "\n" , "\n" , "\n" , "\n" , " \n" , " 30357468\n" , " \n" , " 2018\n" , " \n" , "
\n" , " \n" , " 1432-1076\n" , " European journal of pediatrics\n" , " \n" , " Title of the Article\n" , " 10.1007/s00431-018-3270-3\n" , " \n" , " Abstract Text.\n" , " \n" , " \n" , " \n" , "
\n" , "
\n" , " \n" , " \n" , " \n" , " \n" , "
\n" , "
\n" ]