Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/Wikidata.hs
[FEAT] Wikidata parser example for artistic movements (to be generalized) WIP
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / Wikidata.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata
3 Description : To query Wikidata
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13 {-# LANGUAGE ScopedTypeVariables #-}
14
15 module Gargantext.Core.Text.Corpus.Parsers.Wikidata where
16
import Control.Lens (makeLenses, (^.) )
import Data.Maybe (catMaybes)
import Data.Text (Text, concat)
import Database.HSparql.Connection
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.Date (dateSplit)
import Gargantext.Core.Text.Corpus.Parsers.Isidore (unbound)
import Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Prelude
import Prelude (String, max)
import qualified Data.List as List
29
30
31
-- | One row of the Wikidata SPARQL answer. The fields follow the order of
-- the columns selected by 'wikidataQuery'; every cell may be unbound, hence
-- the 'Maybe' on each field.
data WikiResult = WikiResult
  { _wr_cid         :: Maybe Text -- ^ value of the @?cid@ column
  , _wr_title       :: Maybe Text -- ^ value of the @?title@ column
  , _wr_url         :: Maybe Text -- ^ value of the @?url@ column
  , _wr_yearStart   :: Maybe Text -- ^ value of the @?yearStart@ column
  , _wr_yearEnd     :: Maybe Text -- ^ value of the @?yearEnd@ column
  , _wr_yearFlorish :: Maybe Text -- ^ value of the @?yearFlorish@ column
  }
  deriving (Show, Eq)

-- Generates the lenses wr_cid, wr_title, wr_url, wr_yearStart, wr_yearEnd
-- and wr_yearFlorish.
makeLenses ''WikiResult
40
-- | How many crawled page sections are kept when building a document's
-- abstract (see 'wikiPageToDocument').
type NumberOfSections = Int
42
-- | Query Wikidata and turn every result into a 'HyperdataDocument'.
--
-- The first argument is forwarded to 'wikidataSelect' (it ends up as the
-- @LIMIT@ of the SPARQL query); the second is the number of crawled
-- sections kept for each document's abstract.
wikidataGet :: Int -> NumberOfSections -> IO [HyperdataDocument]
wikidataGet n m = do
  pages <- wikidataSelect n
  mapM toDocument pages
  where
    -- Conversion of a single result, with the section count already fixed.
    toDocument = wikiPageToDocument m
47
48
-- | Build a 'HyperdataDocument' from one Wikidata result.
--
-- The page behind the result's URL is crawled into sections; the first @m@
-- of them are concatenated (without separator) to form the abstract.
-- The date is parsed by 'dateSplit' from the first available value among
-- the start year, the end year, the "florish" year and, as a last resort,
-- the first crawled section.
wikiPageToDocument :: NumberOfSections -> WikiResult -> IO HyperdataDocument
wikiPageToDocument m wr = do
  -- Nothing to crawl when the result carries no URL.
  sections <- case pageUrl of
    Just link -> crawlPage link
    Nothing   -> pure []

  -- NOTE: 'head' sits in a list of 'Maybe Text' values here, so it is the
  -- Maybe-returning variant provided by the project prelude.
  let dateCandidates =
        [ wr ^. wr_yearStart
        , wr ^. wr_yearEnd
        , wr ^. wr_yearFlorish
        , head sections
        ]

  (date, (year, month, day)) <- dateSplit EN (head (catMaybes dateCandidates))

  -- The constructor is positional; the trailing comments give the name each
  -- argument was bound to before being passed.
  pure $ HyperdataDocument
    (Just "wikidata")                 -- bdd
    Nothing                           -- doi
    pageUrl                           -- url
    Nothing                           -- uniqId
    Nothing                           -- uniqIdBdd
    Nothing                           -- page
    (wr ^. wr_title)                  -- title
    Nothing                           -- authors
    Nothing                           -- institutes
    Nothing                           -- source
    (Just $ concat $ take m sections) -- abstract
    ((cs . show) <$> date)            -- date
    year
    month
    day
    Nothing                           -- hour
    Nothing                           -- minute
    Nothing                           -- second
    (Just $ cs $ show EN)             -- iso2
  where
    pageUrl = wr ^. wr_url
85
86
-- | Run 'wikidataQuery' against 'wikidataRoute' and decode each returned
-- row into a 'WikiResult'. When the query yields no result set at all
-- ('Nothing'), the empty list is returned.
wikidataSelect :: Int -> IO [WikiResult]
wikidataSelect n = do
  response <- selectQueryRaw wikidataRoute (wikidataQuery n)
  pure $ case response of
    Just rows -> [ toWikiResult row | row <- unbound' EN rows ]
    Nothing   -> []
93
94
-- | Apply 'unbound' (for the given language) to every cell of every row
-- of a SPARQL result table, keeping the row/column layout.
unbound' :: Lang -> [[BindingValue]] -> [[Maybe Text]]
unbound' l rows = [ [ unbound l cell | cell <- row ] | row <- rows ]
97
-- | Decode one result row into a 'WikiResult'.
--
-- The first six cells are used, in column order; any further cells are
-- ignored. A row with fewer than six cells makes the function panic.
toWikiResult :: [Maybe Text] -> WikiResult
toWikiResult row = case row of
  (c:t:u:ys:ye:yf:_) ->
    WikiResult
      { _wr_cid         = c
      , _wr_title       = t
      , _wr_url         = u
      , _wr_yearStart   = ys
      , _wr_yearEnd     = ye
      , _wr_yearFlorish = yf
      }
  _ -> panic "[G.C.T.C.Parsers.Wikidata.toWikiResult] error"
101
-- | SPARQL endpoint that 'wikidataSelect' sends its query to.
wikidataRoute :: EndPoint
wikidataRoute = "https://query.wikidata.org/sparql"
104
-- | SPARQL query listing at most @n@ entities that are instances
-- (@wdt:P31@) of @wd:Q968159@, have an English label and are the subject
-- of a page whose URL starts with @https://en.wikipedia.org/@.
--
-- For each entity the query returns, in this order: the entity, its label,
-- the page URL, and the years extracted from the optional @wdt:P580@,
-- @wdt:P582@ and @wdt:P571@ statements. 'toWikiResult' relies on that
-- column order.
--
-- A negative @n@ is clamped to 0: SPARQL's @LIMIT@ only accepts an unsigned
-- integer, so rendering a negative number would produce a malformed query.
wikidataQuery :: Int -> String
wikidataQuery n = List.unlines
  [" PREFIX wd: <http://www.wikidata.org/entity/>"
  ," PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
  ," PREFIX schema: <http://schema.org/>"
  ," PREFIX wikibase: <http://wikiba.se/ontology#>"
  -- rdfs: and xsd: are used below; declare them explicitly instead of
  -- depending on the endpoint predefining them.
  ," PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>"
  ," PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>"
  ," SELECT DISTINCT "
  ," ?cid"
  ," ?title"
  ," ?url"
  ," (year(xsd:dateTime(?dateStart)) as ?yearStart)"
  ," (year(xsd:dateTime(?dateEnd)) as ?yearEnd)"
  ," (year(xsd:dateTime(?dateFlorish)) as ?yearFlorish) "
  ," WHERE {"
  ," ?cid wdt:P31 wd:Q968159 ."
  ," ?cid rdfs:label ?title filter (lang(?title) = \"en\") ."
  ," "
  ," ?url schema:about ?cid ."
  ," ?url schema:inLanguage \"en\" ."
  -- 25 is the length of the URL prefix being compared against.
  ," FILTER (SUBSTR(str(?url), 1, 25) = \"https://en.wikipedia.org/\")"
  ," OPTIONAL {?cid (wdt:P580) ?dateStart .}"
  ," OPTIONAL {?cid (wdt:P582) ?dateEnd .}"
  ," OPTIONAL {?cid (wdt:P571) ?dateFlorish .}"
  ," }"
  ," LIMIT " <> show (max 0 n)
  ]
131
132
133