]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/API/Pubmed.hs
WIP - start porting Pubmed queries
[gargantext.git] / src / Gargantext / Core / Text / Corpus / API / Pubmed.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.API.Pubmed
3 Description : Pubmed API connection
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DerivingStrategies #-}
13
14 module Gargantext.Core.Text.Corpus.API.Pubmed
15 ( get
16 -- * Internals for testing
17 , ESearch(..)
18 , convertQuery
19 , getESearch
20 )
21 where
22
23 import Conduit
24 import Control.Monad.Reader (runReaderT)
25 import Data.Either (Either)
26 import Data.Maybe
27 import Data.Semigroup
28 import Data.Monoid
29 import Data.Text (Text)
30 import qualified Data.Text as Text
31 import qualified Data.Text.Encoding as TE
32 import Network.HTTP.Types.URI (EscapeItem(..), renderQueryPartialEscape)
33 import Servant.Client (ClientError)
34
35 import Gargantext.Prelude
36 import Gargantext.Core (Lang(..))
37 import Gargantext.Core.Text.Corpus.Query as Corpus
38 import Gargantext.Core.Types (Term(..))
39 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
40
41 import qualified PUBMED as PubMed
42 import qualified PUBMED.Parser as PubMedDoc
43 import PUBMED.Types (Config(..))
44
45
46 -- | A pubmed query.
47 -- See: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
48 newtype ESearch = ESearch { _ESearch :: [EscapeItem] }
49 deriving stock (Show, Eq)
50 deriving newtype (Semigroup, Monoid)
51
52 -- | Returns an /url encoded/ query ready to be sent to pubmed.
53 getESearch :: ESearch -> Text
54 getESearch (ESearch items) =
55 Text.replace "term=" "" . TE.decodeUtf8 . renderQueryPartialEscape False $ [
56 ("term", QE "#" : items)
57 ]
58
59 convertQuery :: Corpus.Query -> ESearch
60 convertQuery q = ESearch (interpretQuery q transformAST)
61 where
62 transformAST :: BoolExpr Term -> [EscapeItem]
63 transformAST ast = case ast of
64 BAnd sub (BConst (Negative term))
65 -- The second term become positive, so that it can be translated.
66 -> (transformAST sub) <> [QN "+AND+NOT+"] <> transformAST (BConst (Positive term))
67 BAnd term1 (BNot term2)
68 -> transformAST term1 <> [QN "+AND+NOT+"] <> transformAST term2
69 BAnd sub1 sub2
70 -> transformAST sub1 <> [QN "+AND+"] <> transformAST sub2
71 BOr sub1 sub2
72 -> transformAST sub1 <> [QN "+OR+"] <> transformAST sub2
73 BNot (BConst (Negative term))
74 -> transformAST (BConst (Positive term)) -- double negation
75 BNot sub
76 -> [QN "NOT+"] <> transformAST sub
77 -- BTrue cannot happen is the query parser doesn't support parsing 'TRUE' alone.
78 BTrue
79 -> mempty
80 -- BTrue cannot happen is the query parser doesn't support parsing 'FALSE' alone.
81 BFalse
82 -> mempty
83 BConst (Positive (Term term))
84 -> [QE (TE.encodeUtf8 term)]
85 -- We can handle negatives via `ANDNOT` with itself.
86 BConst (Negative (Term term))
87 -> [QN "NOT+", QE (TE.encodeUtf8 term)]
88
89 -- | TODO put default pubmed query in gargantext.ini
90 -- by default: 10K docs
91 get :: Text
92 -> Corpus.Query
93 -> Maybe Limit
94 -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
95 get apiKey q l = do
96 -- The documentation for PUBMED says:
97 -- Values for query keys may also be provided in term if they are preceeded by a
98 -- '#' (%23 in the URL). While only one query_key parameter can be provided to ESearch,
99 -- any number of query keys can be combined in term. Also, if query keys are provided in term,
100 -- they can be combined with OR or NOT in addition to AND.
101 -- Example:
102 -- esearch.fcgi?db=pubmed&term=%231+AND+asthma&WebEnv=<webenv string>&usehistory=y
103 --
104 -- Therefore, we can pretty-print our 'Query' back into something that PubMed could understand.
105 eRes <- runReaderT PubMed.getMetadataWithC (Config { apiKey = Just apiKey
106 , query = getESearch $ convertQuery q
107 , perPage = Just 200
108 , mWebEnv = Nothing })
109 let takeLimit = case l of
110 Nothing -> mapC identity
111 Just l' -> takeC $ getLimit l'
112 pure $ (\(len, docsC) -> (len, docsC .| takeLimit .| mapC (toDoc EN))) <$> eRes
113 --either (\e -> panic $ "CRAWL: PubMed" <> e) (map (toDoc EN))
114 -- <$> PubMed.getMetadataWithC q l
115
116 toDoc :: Lang -> PubMedDoc.PubMed -> HyperdataDocument
117 toDoc l (PubMedDoc.PubMed { pubmed_id
118 , pubmed_article = PubMedDoc.PubMedArticle t j as aus
119 , pubmed_date = PubMedDoc.PubMedDate a y m d }
120 ) = HyperdataDocument { _hd_bdd = Just "PubMed"
121 , _hd_doi = Nothing
122 , _hd_url = Nothing
123 , _hd_uniqId = Just $ Text.pack $ show pubmed_id
124 , _hd_uniqIdBdd = Nothing
125 , _hd_page = Nothing
126 , _hd_title = t
127 , _hd_authors = authors aus
128 , _hd_institutes = institutes aus
129 , _hd_source = j
130 , _hd_abstract = abstract as
131 , _hd_publication_date = Just $ Text.pack $ show a
132 , _hd_publication_year = Just $ fromIntegral y
133 , _hd_publication_month = Just m
134 , _hd_publication_day = Just d
135 , _hd_publication_hour = Nothing
136 , _hd_publication_minute = Nothing
137 , _hd_publication_second = Nothing
138 , _hd_language_iso2 = Just $ (Text.pack . show) l }
139 where
140 authors :: [PubMedDoc.Author] -> Maybe Text
141 authors [] = Nothing
142 authors au = Just $ (Text.intercalate ", ")
143 $ catMaybes
144 $ map (\n -> PubMedDoc.foreName n <> Just " " <> PubMedDoc.lastName n) au
145
146 institutes :: [PubMedDoc.Author] -> Maybe Text
147 institutes [] = Nothing
148 institutes au = Just $ (Text.intercalate ", ")
149 $ (map (Text.replace ", " " - "))
150 $ catMaybes
151 $ map PubMedDoc.affiliation au
152
153
154 abstract :: [Text] -> Maybe Text
155 abstract [] = Nothing
156 abstract as' = Just $ Text.intercalate ", " as'