]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/Wikidata.hs
[FEAT] Improving NodeWriteParsing
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / Wikidata.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata
Description : To query Wikidata
8 Copyright : (c) CNRS, 2019-Present
9 License : AGPL + CECILL v3
10 Maintainer : team@gargantext.org
11 Stability : experimental
12 Portability : POSIX
13
14 -}
15
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE ScopedTypeVariables #-}
18
19 module Gargantext.Core.Text.Corpus.Parsers.Wikidata where
20
21 import Control.Lens (makeLenses, (^.) )
22 import Data.Maybe (catMaybes)
23 import Data.Text (Text, concat)
24 import Database.HSparql.Connection
25 import Gargantext.Core (Lang(..))
26 import Gargantext.Core.Text.Corpus.Parsers.Isidore (unbound)
27 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
28 import Gargantext.Prelude
29 import Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
30 import Prelude (String)
31 import qualified Data.List as List
32 import Gargantext.Core.Text.Corpus.Parsers.Date (dateSplit)
33
34
35
-- | One row of the Wikidata SPARQL result built by 'wikidataQuery'.
--
-- The fields follow the order of the query's @SELECT@ variables; each one is
-- a 'Maybe' and may therefore be absent.
data WikiResult = WikiResult
  { _wr_cid         :: Maybe Text -- ^ value of @?cid@
  , _wr_title       :: Maybe Text -- ^ value of @?title@
  , _wr_url         :: Maybe Text -- ^ value of @?url@
  , _wr_yearStart   :: Maybe Text -- ^ value of @?yearStart@
  , _wr_yearEnd     :: Maybe Text -- ^ value of @?yearEnd@
  , _wr_yearFlorish :: Maybe Text -- ^ value of @?yearFlorish@
  }
  deriving (Show, Eq)

-- Generates one lens per field, named after the field without its leading
-- underscore (@wr_cid@, @wr_title@, @wr_url@, ...).
makeLenses ''WikiResult
44
-- | How many crawled page sections are kept when building a document's
-- abstract.
type NumberOfSections = Int

-- | Query Wikidata and turn every returned row into a 'HyperdataDocument'.
--
-- The first argument is forwarded to 'wikidataSelect' (it ends up as the
-- @LIMIT@ of the SPARQL query); the second is forwarded to
-- 'wikiPageToDocument' for every row.
wikidataGet :: Int -> NumberOfSections -> IO [HyperdataDocument]
wikidataGet limit nbSections = do
  rows <- wikidataSelect limit
  mapM toDocument rows
  where
    toDocument = wikiPageToDocument nbSections
51
52
-- | Build a 'HyperdataDocument' from a single 'WikiResult'.
--
-- When the result carries a URL, that page is fetched with 'crawlPage'
-- (NOTE(review): presumably exported by the @Wikidata.Crawler@ module --
-- confirm); otherwise the list of sections is empty.  The abstract is the
-- concatenation of the first @m@ sections.
--
-- The text handed to 'dateSplit' is the first value available among the
-- start year, the end year, the \"florish\" year and, failing all of those,
-- the first crawled section.
wikiPageToDocument :: NumberOfSections -> WikiResult -> IO HyperdataDocument
wikiPageToDocument m wr = do
  sections <- fetchSections (wr ^. wr_url)

  -- 'head' yields a 'Maybe' here: its result on @sections@ sits in a list
  -- consumed by 'catMaybes', and its result on the candidates is passed
  -- to 'dateSplit' as is, so an empty list simply means "no date text".
  let dateCandidates = catMaybes
        [ wr ^. wr_yearStart
        , wr ^. wr_yearEnd
        , wr ^. wr_yearFlorish
        , head sections
        ]
  (date, (year, month, day)) <- dateSplit EN (head dateCandidates)

  -- Constructor arguments are positional; the trailing comments name the
  -- role each one plays.
  pure $ HyperdataDocument
    (Just "wikidata")                   -- bdd
    Nothing                             -- doi
    (wr ^. wr_url)                      -- url
    Nothing                             -- uniqId
    Nothing                             -- uniqIdBdd
    Nothing                             -- page
    (wr ^. wr_title)                    -- title
    Nothing                             -- authors
    Nothing                             -- institutes
    Nothing                             -- source
    (Just $ concat $ take m sections)   -- abstract
    ((cs . show) <$> date)              -- publication date, as text
    year
    month
    day
    Nothing                             -- hour
    Nothing                             -- minute
    Nothing                             -- second
    (Just $ cs $ show EN)               -- iso2 language code
  where
    -- No URL, nothing to crawl.
    fetchSections Nothing        = pure []
    fetchSections (Just pageUrl) = crawlPage pageUrl
87
88
-- | Run 'wikidataQuery' against 'wikidataRoute' and decode every returned
-- row with 'toWikiResult'.
--
-- When 'selectQueryRaw' returns 'Nothing' the result is the empty list.
wikidataSelect :: Int -> IO [WikiResult]
wikidataSelect n = do
  response <- selectQueryRaw wikidataRoute (wikidataQuery n)
  pure $ case response of
    Just rows -> [ toWikiResult row | row <- unbound' EN rows ]
    Nothing   -> []
95
96
-- | Apply 'unbound' (for the given language) to every binding of every row
-- of a SPARQL result table.
unbound' :: Lang -> [[BindingValue]] -> [[Maybe Text]]
unbound' lang = map unboundRow
  where
    unboundRow = map (unbound lang)
99
-- | Decode one result row into a 'WikiResult'.
--
-- The first six columns are read in order (cid, title, url, start year,
-- end year, \"florish\" year); any further column is ignored.  A row with
-- fewer than six columns makes the function 'panic'.
toWikiResult :: [Maybe Text] -> WikiResult
toWikiResult columns = case columns of
  (cid : title : url : yearStart : yearEnd : yearFlorish : _) ->
    WikiResult
      { _wr_cid         = cid
      , _wr_title       = title
      , _wr_url         = url
      , _wr_yearStart   = yearStart
      , _wr_yearEnd     = yearEnd
      , _wr_yearFlorish = yearFlorish
      }
  _ -> panic "[G.C.T.C.Parsers.Wikidata.toWikiResult] error"
103
-- | SPARQL endpoint the queries of this module are sent to.
wikidataRoute :: EndPoint
wikidataRoute = "https://query.wikidata.org/sparql"

-- | Text of the SPARQL query, limited to @n@ results.
--
-- The query selects @?cid@, @?title@, @?url@ and three optional years; the
-- column order is the one 'toWikiResult' relies on.
--
-- NOTE(review): @rdfs:@ and @xsd:@ are used without a @PREFIX@ line --
-- presumably the endpoint predefines them; confirm.
wikidataQuery :: Int -> String
wikidataQuery n =
  List.unlines $ List.concat [prefixes, projection, graphPattern, [limitClause]]
  where
    prefixes :: [String]
    prefixes =
      [ " PREFIX wd: <http://www.wikidata.org/entity/>"
      , " PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
      , " PREFIX schema: <http://schema.org/>"
      , " PREFIX wikibase: <http://wikiba.se/ontology#>"
      ]

    projection :: [String]
    projection =
      [ " SELECT DISTINCT "
      , " ?cid"
      , " ?title"
      , " ?url"
      , " (year(xsd:dateTime(?dateStart)) as ?yearStart)"
      , " (year(xsd:dateTime(?dateEnd)) as ?yearEnd)"
      , " (year(xsd:dateTime(?dateFlorish)) as ?yearFlorish) "
      ]

    graphPattern :: [String]
    graphPattern =
      [ " WHERE {"
      , " ?cid wdt:P31 wd:Q968159 ."
      , " ?cid rdfs:label ?title filter (lang(?title) = \"en\") ."
      , " "
      , " ?url schema:about ?cid ."
      , " ?url schema:inLanguage \"en\" ."
        -- Keeps only URLs starting with the 25-character English
        -- Wikipedia prefix.
      , " FILTER (SUBSTR(str(?url), 1, 25) = \"https://en.wikipedia.org/\")"
      , " OPTIONAL {?cid (wdt:P580) ?dateStart .}"
      , " OPTIONAL {?cid (wdt:P582) ?dateEnd .}"
      , " OPTIONAL {?cid (wdt:P571) ?dateFlorish .}"
      , " }"
      ]

    limitClause :: String
    limitClause = " LIMIT " <> (cs $ show n)
133