]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/API/Istex.hs
[pubmed] use fixed pubmed repo, fix per page to be > 20
[gargantext.git] / src / Gargantext / Core / Text / Corpus / API / Istex.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.API.Istex
3 Description : Istex API connection
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12
13 module Gargantext.Core.Text.Corpus.API.Istex
14 where
15
16 import Data.Either (Either(..))
17 import Data.List (concat)
18 import Data.Maybe
19 import Data.Text (Text, pack)
20
21 import qualified Data.Text as Text
22 import qualified Data.List as List
23 import Gargantext.Core (Lang(..))
24 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
25 import qualified Gargantext.Defaults as Defaults
26 import Gargantext.Prelude
27 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
28 import qualified ISTEX as ISTEX
29 import qualified ISTEX.Client as ISTEX
30
-- | Search query text handed to 'get'.
type Query = Text
-- | Optional upper bound on the number of documents requested by 'get'.
-- NOTE(review): 'Nothing' presumably leaves the limit to the ISTEX client — confirm.
type MaxResults = Maybe Integer
33
-- | Run a query against ISTEX and convert every hit into a 'HyperdataDocument'.
--
-- A query that contains no @:@ is treated as a plain search: it is prefixed
-- with a @language:@ clause derived from @la@ and matched against abstracts.
-- A query that already contains @:@ is sent unchanged, on the assumption
-- that the user wrote a full ISTEX query on purpose.
--
-- The number of hits is logged through 'printDebug'. If the ISTEX call
-- returns a 'Left', the shown error is raised with 'panic'.
--
-- TODO check if abstract is in query already before adding it.
get :: Lang -> Query -> MaxResults -> IO [HyperdataDocument]
get la query' maxResults = do
  eDocs <- ISTEX.getMetadataWith istexQuery (fromIntegral <$> maxResults)
  case eDocs of
    Left err   -> panic . Text.pack . show $ err
    Right docs -> do
      printDebug "[Istex.get] length docs" $ length (ISTEX._documents_hits docs)
      toDoc' la docs
  where
    -- No ":" in the text means splitting on ":" would give a single piece,
    -- i.e. the user typed a default (non-field) search.
    istexQuery =
      if Text.isInfixOf ":" query'
        then query'
        else "language:" <> istexLang la <> " AND abstract:" <> query'

    -- Language code used in the query: French is "fre", anything else "eng".
    istexLang FR = "fre"
    istexLang _  = "eng"
74
-- | Convert all hits of an ISTEX response with 'toDoc', in order.
toDoc' :: Lang -> ISTEX.Documents -> IO [HyperdataDocument]
toDoc' la = mapM (toDoc la) . ISTEX._documents_hits
78
-- | Convert one ISTEX document into a 'HyperdataDocument'.
--
-- The publication date fields come from 'Date.dateSplit', applied to the
-- document's date or to 'Defaults.year' when the document has none.
-- Author names, author affiliations and source titles are each joined
-- with @", "@; an empty list gives an empty text.
--
-- TODO remove dateSplit here
-- TODO current year as default
toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument
toDoc la (ISTEX.Document i t a ab d s) = do
  (utctime, (pub_year, pub_month, pub_day)) <-
    Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d)
  pure $ HyperdataDocument
    { _hd_bdd                = Just "Istex"
    , _hd_doi                = Just i
    , _hd_url                = Nothing
    , _hd_uniqId             = Nothing
    , _hd_uniqIdBdd          = Nothing
    , _hd_page               = Nothing
    , _hd_title              = t
    , _hd_authors            = Just $ commaSep (map ISTEX._author_name a)
    , _hd_institutes         = Just $ commaSep (List.concatMap ISTEX._author_affiliations a)
    , _hd_source             = Just $ commaSep (mapMaybe ISTEX._source_title s)
    , _hd_abstract           = ab
    , _hd_publication_date   = fmap (pack . show) utctime
    , _hd_publication_year   = pub_year
    , _hd_publication_month  = pub_month
    , _hd_publication_day    = pub_day
    , _hd_publication_hour   = Nothing
    , _hd_publication_minute = Nothing
    , _hd_publication_second = Nothing
    , _hd_language_iso2      = Just $ (pack . show) la
    }
  where
    -- Join with 'Text.intercalate' so the separator only appears between
    -- items. The previous left fold started from "" and therefore produced
    -- a leading ", " (", a, b" instead of "a, b").
    commaSep :: [Text] -> Text
    commaSep = Text.intercalate ", "