2 Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata
4 Description : To query Wikidata
8 Copyright : (c) CNRS, 2019-Present
9 License : AGPL + CECILL v3
10 Maintainer : team@gargantext.org
11 Stability : experimental
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE ScopedTypeVariables #-}
19 module Gargantext.Core.Text.Corpus.Parsers.Wikidata where
21 import Control.Lens (makeLenses, (^.) )
22 import Data.Maybe (catMaybes)
23 import Data.Text (Text, concat)
24 import Database.HSparql.Connection
25 import Gargantext.Core (Lang(..))
26 import Gargantext.Core.Text.Corpus.Parsers.Isidore (unbound)
27 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
28 import Gargantext.Prelude
29 import Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
30 import Prelude (String)
31 import qualified Data.List as List
32 import Gargantext.Core.Text.Corpus.Parsers.Date (dateSplit)
-- | One row of the result set requested by 'wikidataQuery'.
--
-- 'toWikiResult' fills the fields positionally, in the order they are
-- declared here. Every field is optional.
--
-- NOTE(review): "Florish" is presumably a misspelling of "Flourish"; the
-- field, its lens and the SPARQL variable all share that spelling.
36 data WikiResult = WikiResult { _wr_cid :: Maybe Text -- ^ value bound to @?cid@ in the query
37 , _wr_title :: Maybe Text -- ^ used as the document title by 'wikiPageToDocument'
38 , _wr_url :: Maybe Text -- ^ inspected by 'wikiPageToDocument' to obtain the page sections
39 , _wr_yearStart :: Maybe Text -- ^ @?yearStart@ column of the query
40 , _wr_yearEnd :: Maybe Text -- ^ @?yearEnd@ column of the query
41 , _wr_yearFlorish :: Maybe Text -- ^ @?yearFlorish@ column of the query
43 $(makeLenses ''WikiResult) -- Template Haskell: generates the lenses (@wr_url@, @wr_title@, ...) used in this module
-- | Number of leading sections that 'wikiPageToDocument' concatenates to
-- build a document abstract.
45 type NumberOfSections = Int
-- | Query Wikidata and convert every result into a 'HyperdataDocument'.
--
-- The first argument (@n@) is forwarded to 'wikidataSelect'; the second
-- (@m@) is forwarded to 'wikiPageToDocument' for each result.
47 wikidataGet :: Int -> NumberOfSections -> IO [HyperdataDocument]
-- Fetch the result rows first ...
49 results <- wikidataSelect n
-- ... then build one document per row, in order.
50 mapM (wikiPageToDocument m) results
-- | Build a 'HyperdataDocument' from one 'WikiResult'.
--
-- @m@ caps how many of the page's sections end up in the abstract.
53 wikiPageToDocument :: NumberOfSections -> WikiResult -> IO HyperdataDocument
54 wikiPageToDocument m wr = do
-- The list of sections depends on whether the result carries a URL.
55 sections <- case wr ^. wr_url of
-- The source database tag is fixed to "wikidata".
59 let bdd = Just "wikidata"
65 title = (wr ^. wr_title)
-- Abstract: the first @m@ sections, concatenated.
69 abstract = Just $ concat $ take m sections
-- Date components come from 'dateSplit' (with 'EN') applied to the head of a
-- list of candidates that includes the "florish" year.
71 let (date, (year, month, day)) = dateSplit EN $ head
75 , wr ^. wr_yearFlorish
-- Language tag of the document, derived from @show EN@.
82 iso2 = Just $ cs $ show EN
-- Positional construction: the argument order must match the field order of
-- 'HyperdataDocument'.
84 pure $ HyperdataDocument bdd doi url uniqId uniqIdBdd
85 page title authors institutes source
86 abstract ((cs . show) <$> date) year month day hour minute second iso2
-- | Send 'wikidataQuery' (built for the given @n@) to 'wikidataRoute' and
-- decode the returned rows into 'WikiResult' values.
89 wikidataSelect :: Int -> IO [WikiResult]
-- Raw SELECT against the endpoint; the result is optional.
91 result <- selectQueryRaw wikidataRoute (wikidataQuery n)
-- When rows are present: convert every cell with 'unbound'' (using 'EN'),
-- then turn each row into a 'WikiResult'.
94 Just result' -> pure $ map toWikiResult $ unbound' EN result'
-- | Apply 'unbound' (for the given language) to every cell of a table of
-- SPARQL binding values, preserving the row/column layout.
unbound' :: Lang -> [[BindingValue]] -> [[Maybe Text]]
unbound' lang rows = [ [ unbound lang cell | cell <- row ] | row <- rows ]
-- | Build a 'WikiResult' from one decoded result row.
--
-- The first six cells are taken, in order, as cid, title, url, start year,
-- end year and "florish" year; any further cells are ignored. A row with
-- fewer than six cells triggers 'panic'.
toWikiResult :: [Maybe Text] -> WikiResult
toWikiResult row = case row of
  cid : title : url : yearStart : yearEnd : yearFlorish : _ ->
    WikiResult cid title url yearStart yearEnd yearFlorish
  _ ->
    panic "[G.C.T.C.Parsers.Wikidata.toWikiResult] error"
-- | SPARQL endpoint that 'wikidataSelect' sends its query to.
104 wikidataRoute :: EndPoint
105 wikidataRoute = "https://query.wikidata.org/sparql"
-- | Text of the SPARQL query sent by 'wikidataSelect'.
--
-- The query is assembled line by line with 'List.unlines'; the argument @n@
-- is rendered into the final @LIMIT@ clause, so it bounds the number of rows.
--
-- NOTE(review): the meanings of the Wikidata identifiers used below
-- (P31, Q968159, P580, P582, P571) are not stated in this file — confirm
-- them against Wikidata before changing the query.
107 wikidataQuery :: Int -> String
108 wikidataQuery n = List.unlines
109 [" PREFIX wd: <http://www.wikidata.org/entity/>"
110 ," PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
111 ," PREFIX schema: <http://schema.org/>"
112 ," PREFIX wikibase: <http://wikiba.se/ontology#>"
-- Each date is reduced to its year and exposed under the column name that
-- 'toWikiResult' expects.
117 ," (year(xsd:dateTime(?dateStart)) as ?yearStart)"
118 ," (year(xsd:dateTime(?dateEnd)) as ?yearEnd)"
119 ," (year(xsd:dateTime(?dateFlorish)) as ?yearFlorish) "
121 ," ?cid wdt:P31 wd:Q968159 ."
-- Only English labels are kept as titles.
122 ," ?cid rdfs:label ?title filter (lang(?title) = \"en\") ."
124 ," ?url schema:about ?cid ."
125 ," ?url schema:inLanguage \"en\" ."
-- Keep only URLs whose first 25 characters are the English Wikipedia prefix.
126 ," FILTER (SUBSTR(str(?url), 1, 25) = \"https://en.wikipedia.org/\")"
-- The three dates are optional: a missing one does not discard the row.
127 ," OPTIONAL {?cid (wdt:P580) ?dateStart .}"
128 ," OPTIONAL {?cid (wdt:P582) ?dateEnd .}"
129 ," OPTIONAL {?cid (wdt:P571) ?dateFlorish .}"
131 ," LIMIT " <> (cs $ show n)