2 Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata
4 Description : To query Wikidata
8 Copyright : (c) CNRS, 2019-Present
9 License : AGPL + CECILL v3
10 Maintainer : team@gargantext.org
11 Stability : experimental
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE ScopedTypeVariables #-}
19 module Gargantext.Core.Text.Corpus.Parsers.Wikidata where
21 import Control.Lens (makeLenses, (^.) )
22 import Data.Maybe (catMaybes)
23 import Data.Text (Text, concat)
24 import Database.HSparql.Connection
25 import Gargantext.Core (Lang(..))
26 import Gargantext.Core.Text.Corpus.Parsers.Isidore (unbound)
27 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
28 import Gargantext.Prelude
29 import Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
30 import Prelude (String)
31 import qualified Data.List as List
32 import Gargantext.Core.Text.Corpus.Parsers.Date (dateSplit)
-- | One result row of the Wikidata query. Every field is optional
-- ('Maybe Text'); the fields are filled positionally, in the order
-- declared here, by 'toWikiResult'.
-- NOTE(review): the closing of this record declaration (and any deriving
-- clause) is not visible in this excerpt.
36 data WikiResult = WikiResult { _wr_cid :: Maybe Text
37 , _wr_title :: Maybe Text
-- Read through the 'wr_url' lens when a result is turned into a document.
38 , _wr_url :: Maybe Text
-- The three year fields share their names with the ?yearStart / ?yearEnd /
-- ?yearFlorish projections of 'wikidataQuery'.
39 , _wr_yearStart :: Maybe Text
40 , _wr_yearEnd :: Maybe Text
41 , _wr_yearFlorish :: Maybe Text
-- Template Haskell declaration splice: derive the lenses for 'WikiResult'
-- (such as 'wr_url' and 'wr_title') that this module reads with '^.'.
makeLenses ''WikiResult
-- | How many sections of a page, counted from the front of the section
-- list, are concatenated into a document's abstract
-- (see 'wikiPageToDocument').
45 type NumberOfSections = Int
-- | Query Wikidata and convert every result into a 'HyperdataDocument'.
--
-- @n@ is handed to 'wikidataSelect'; @m@ is handed to
-- 'wikiPageToDocument' for each result.
-- NOTE(review): the equation head binding @n@ and @m@ is not visible in
-- this excerpt — confirm the argument order against the full file.
47 wikidataGet :: Int -> NumberOfSections -> IO [HyperdataDocument]
49 results <- wikidataSelect n
-- One conversion per result, run in sequence inside IO.
50 mapM (wikiPageToDocument m) results
-- | Build a 'HyperdataDocument' from a single 'WikiResult'.
--
-- Visible steps: the page sections are obtained by a case analysis on the
-- result's URL, the first @m@ sections are concatenated into the abstract,
-- a date is derived with 'dateSplit', and the document is tagged with the
-- source @"wikidata"@ and a language code derived from 'EN'.
-- NOTE(review): several interior lines of this definition (the case
-- alternatives, most let-bindings, the other date candidates) are missing
-- from this excerpt; the comments below describe only what is visible.
53 wikiPageToDocument :: NumberOfSections -> WikiResult -> IO HyperdataDocument
54 wikiPageToDocument m wr = do
-- Sections depend on the optional URL; the alternatives are not visible here.
55 sections <- case wr ^. wr_url of
59 let bdd = Just "wikidata"
65 title = (wr ^. wr_title)
-- 'concat' is the one imported from Data.Text: the kept sections are joined
-- with no separator.
69 abstract = Just $ concat $ take m sections
-- The value given to 'dateSplit' is chosen with 'head' from a list of
-- candidates; only the 'wr_yearFlorish' entry of that list is visible here.
71 (date, (year, month, day))
72 <- dateSplit EN $ head
76 , wr ^. wr_yearFlorish
83 iso2 = Just $ cs $ show EN
-- Positional construction: the argument order must match the field order of
-- 'HyperdataDocument'.
85 pure $ HyperdataDocument bdd doi url uniqId uniqIdBdd
86 page title authors institutes source
87 abstract ((cs . show) <$> date) year month day hour minute second iso2
-- | Send 'wikidataQuery' (limited by @n@) to 'wikidataRoute' and convert
-- the returned rows into 'WikiResult's.
-- NOTE(review): only the @Just@ alternative of the result handling is
-- visible in this excerpt; what happens when the query yields no result is
-- not shown here.
90 wikidataSelect :: Int -> IO [WikiResult]
92 result <- selectQueryRaw wikidataRoute (wikidataQuery n)
-- Each cell is first converted by unbound' (language EN), then each row is
-- mapped positionally onto a 'WikiResult'.
95 Just result' -> pure $ map toWikiResult $ unbound' EN result'
-- | Apply 'unbound' (for the given language) to every cell of a SPARQL
-- result table, preserving the row and column layout.
unbound' :: Lang -> [[BindingValue]] -> [[Maybe Text]]
unbound' lang rows = [ [ unbound lang cell | cell <- row ] | row <- rows ]
-- | Turn one result row into a 'WikiResult'.
--
-- The first six cells are used, in order: cid, title, url, start year,
-- end year, florish year. Any further cells are ignored. A row with fewer
-- than six cells aborts with 'panic'.
toWikiResult :: [Maybe Text] -> WikiResult
toWikiResult row = case row of
  cid : title : url : yearStart : yearEnd : yearFlorish : _ ->
    WikiResult cid title url yearStart yearEnd yearFlorish
  _ ->
    panic "[G.C.T.C.Parsers.Wikidata.toWikiResult] error"
-- | SPARQL endpoint that 'wikidataSelect' sends its query to.
105 wikidataRoute :: EndPoint
106 wikidataRoute = "https://query.wikidata.org/sparql"
-- | Text of the SPARQL query sent to Wikidata, assembled line by line with
-- 'List.unlines'. The argument @n@ becomes the query's LIMIT.
-- NOTE(review): some lines of the query (e.g. the SELECT / WHERE framing)
-- are missing from this excerpt; the comments below cover only the visible
-- lines.
108 wikidataQuery :: Int -> String
109 wikidataQuery n = List.unlines
-- Namespace prefixes used by the query.
110 [" PREFIX wd: <http://www.wikidata.org/entity/>"
111 ," PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
112 ," PREFIX schema: <http://schema.org/>"
113 ," PREFIX wikibase: <http://wikiba.se/ontology#>"
-- Project only the year of each date, under the names ?yearStart,
-- ?yearEnd and ?yearFlorish.
118 ," (year(xsd:dateTime(?dateStart)) as ?yearStart)"
119 ," (year(xsd:dateTime(?dateEnd)) as ?yearEnd)"
120 ," (year(xsd:dateTime(?dateFlorish)) as ?yearFlorish) "
-- Select entities related to wd:Q968159 through wdt:P31, keeping only their
-- English label. NOTE(review): confirm on Wikidata what P31 / Q968159 denote.
122 ," ?cid wdt:P31 wd:Q968159 ."
123 ," ?cid rdfs:label ?title filter (lang(?title) = \"en\") ."
-- Keep only English pages about the entity whose URL starts with the
-- 25-character prefix "https://en.wikipedia.org/".
125 ," ?url schema:about ?cid ."
126 ," ?url schema:inLanguage \"en\" ."
127 ," FILTER (SUBSTR(str(?url), 1, 25) = \"https://en.wikipedia.org/\")"
-- The three dates are OPTIONAL, so entities lacking them still match.
128 ," OPTIONAL {?cid (wdt:P580) ?dateStart .}"
129 ," OPTIONAL {?cid (wdt:P582) ?dateEnd .}"
130 ," OPTIONAL {?cid (wdt:P571) ?dateFlorish .}"
-- Cap the number of returned rows at n.
132 ," LIMIT " <> (cs $ show n)