2 Module : Gargantext.Core.Text.Corpus.API.Pubmed
3 Description : Pubmed API connection
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DerivingStrategies #-}
14 module Gargantext.Core.Text.Corpus.API.Pubmed
16 -- * Internals for testing
24 import Control.Monad.Reader (runReaderT)
25 import Data.Either (Either)
29 import Data.Text (Text)
30 import qualified Data.Text as Text
31 import qualified Data.Text.Encoding as TE
32 import Network.HTTP.Types.URI (EscapeItem(..), renderQueryPartialEscape)
33 import Servant.Client (ClientError)
35 import Gargantext.Prelude
36 import Gargantext.Core (Lang(..))
37 import Gargantext.Core.Text.Corpus.Query as Corpus
38 import Gargantext.Core.Types (Term(..))
39 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
41 import qualified PUBMED as PubMed
42 import qualified PUBMED.Parser as PubMedDoc
43 import PUBMED.Types (Config(..))
47 -- See: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
48 newtype ESearch = ESearch { _ESearch :: [EscapeItem] }
49 deriving stock (Show, Eq)
50 deriving newtype (Semigroup, Monoid)
52 -- | Returns an /url encoded/ query ready to be sent to pubmed.
53 getESearch :: ESearch -> Text
54 getESearch (ESearch items) =
55 Text.replace "term=" "" . TE.decodeUtf8 . renderQueryPartialEscape False $ [
59 convertQuery :: Corpus.Query -> ESearch
60 convertQuery q = ESearch (interpretQuery q transformAST)
62 transformAST :: BoolExpr Term -> [EscapeItem]
63 transformAST ast = case ast of
64 BAnd sub (BConst (Negative term))
65 -- The second term become positive, so that it can be translated.
66 -> (transformAST sub) <> [QN "+AND+NOT+"] <> transformAST (BConst (Positive term))
67 BAnd term1 (BNot term2)
68 -> transformAST term1 <> [QN "+AND+NOT+"] <> transformAST term2
70 -> transformAST sub1 <> [QN "+AND+"] <> transformAST sub2
72 -> transformAST sub1 <> [QN "+OR+"] <> transformAST sub2
73 BNot (BConst (Negative term))
74 -> transformAST (BConst (Positive term)) -- double negation
76 -> [QN "NOT+"] <> transformAST sub
77 -- BTrue cannot happen is the query parser doesn't support parsing 'TRUE' alone.
80 -- BTrue cannot happen is the query parser doesn't support parsing 'FALSE' alone.
83 BConst (Positive (Term term))
84 -> [QE (TE.encodeUtf8 term)]
85 -- We can handle negatives via `ANDNOT` with itself.
86 BConst (Negative (Term term))
87 -> [QN "NOT+", QE (TE.encodeUtf8 term)]
89 -- | TODO put default pubmed query in gargantext.ini
90 -- by default: 10K docs
94 -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
96 -- The documentation for PUBMED says:
97 -- Values for query keys may also be provided in term if they are preceeded by a
98 -- '#' (%23 in the URL). While only one query_key parameter can be provided to ESearch,
99 -- any number of query keys can be combined in term. Also, if query keys are provided in term,
100 -- they can be combined with OR or NOT in addition to AND.
102 -- esearch.fcgi?db=pubmed&term=%231+AND+asthma&WebEnv=<webenv string>&usehistory=y
104 -- Therefore, we can pretty-print our 'Query' back into something that PubMed could understand.
105 eRes <- runReaderT PubMed.getMetadataWithC (Config { apiKey = Just apiKey
106 , query = getESearch $ convertQuery q
108 , mWebEnv = Nothing })
109 let takeLimit = case l of
110 Nothing -> mapC identity
111 Just l' -> takeC $ getLimit l'
112 pure $ (\(len, docsC) -> (len, docsC .| takeLimit .| mapC (toDoc EN))) <$> eRes
113 --either (\e -> panic $ "CRAWL: PubMed" <> e) (map (toDoc EN))
114 -- <$> PubMed.getMetadataWithC q l
116 toDoc :: Lang -> PubMedDoc.PubMed -> HyperdataDocument
117 toDoc l (PubMedDoc.PubMed { pubmed_id
118 , pubmed_article = PubMedDoc.PubMedArticle t j as aus
119 , pubmed_date = PubMedDoc.PubMedDate a y m d }
120 ) = HyperdataDocument { _hd_bdd = Just "PubMed"
123 , _hd_uniqId = Just $ Text.pack $ show pubmed_id
124 , _hd_uniqIdBdd = Nothing
127 , _hd_authors = authors aus
128 , _hd_institutes = institutes aus
130 , _hd_abstract = abstract as
131 , _hd_publication_date = Just $ Text.pack $ show a
132 , _hd_publication_year = Just $ fromIntegral y
133 , _hd_publication_month = Just m
134 , _hd_publication_day = Just d
135 , _hd_publication_hour = Nothing
136 , _hd_publication_minute = Nothing
137 , _hd_publication_second = Nothing
138 , _hd_language_iso2 = Just $ (Text.pack . show) l }
140 authors :: [PubMedDoc.Author] -> Maybe Text
142 authors au = Just $ (Text.intercalate ", ")
144 $ map (\n -> PubMedDoc.foreName n <> Just " " <> PubMedDoc.lastName n) au
146 institutes :: [PubMedDoc.Author] -> Maybe Text
147 institutes [] = Nothing
148 institutes au = Just $ (Text.intercalate ", ")
149 $ (map (Text.replace ", " " - "))
151 $ map PubMedDoc.affiliation au
154 abstract :: [Text] -> Maybe Text
155 abstract [] = Nothing
156 abstract as' = Just $ Text.intercalate ", " as'