]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/Wikidata.hs
[FIX] list size limit
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / Wikidata.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata
Description : To query Wikidata
8 Copyright : (c) CNRS, 2019-Present
9 License : AGPL + CECILL v3
10 Maintainer : team@gargantext.org
11 Stability : experimental
12 Portability : POSIX
13
14 -}
15
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE ScopedTypeVariables #-}
18
19 module Gargantext.Core.Text.Corpus.Parsers.Wikidata where
20
21 import Control.Lens (makeLenses, (^.) )
22 import Data.Maybe (catMaybes)
23 import Data.Text (Text, concat)
24 import Database.HSparql.Connection
25 import Gargantext.Core (Lang(..))
26 import Gargantext.Core.Text.Corpus.Parsers.Isidore (unbound)
27 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
28 import Gargantext.Prelude
29 import Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
30 import Prelude (String)
31 import qualified Data.List as List
32 import Gargantext.Core.Text.Corpus.Parsers.Date (dateSplit)
33
34
35
-- | One row of the Wikidata SPARQL answer.  The fields follow the order in
-- which 'toWikiResult' fills them, which is the column order selected by
-- 'wikidataQuery'.  Every column may be unbound, hence the 'Maybe's.
data WikiResult = WikiResult
  { _wr_cid         :: Maybe Text -- ^ first column  (@?cid@)
  , _wr_title       :: Maybe Text -- ^ second column (@?title@)
  , _wr_url         :: Maybe Text -- ^ third column  (@?url@)
  , _wr_yearStart   :: Maybe Text -- ^ fourth column (@?yearStart@)
  , _wr_yearEnd     :: Maybe Text -- ^ fifth column  (@?yearEnd@)
  , _wr_yearFlorish :: Maybe Text -- ^ sixth column  (@?yearFlorish@)
  }
  deriving (Show, Eq)

-- Generates the lenses wr_cid, wr_title, wr_url, wr_yearStart, wr_yearEnd
-- and wr_yearFlorish from the underscore-prefixed fields above.
makeLenses ''WikiResult
44
-- | Number of crawled page sections that are kept for a document's abstract
-- (see 'wikiPageToDocument').
type NumberOfSections = Int

-- | Run the Wikidata query with result limit @n@, then turn every returned
-- row into a 'HyperdataDocument', keeping @m@ sections of each crawled page.
wikidataGet :: Int -> NumberOfSections -> IO [HyperdataDocument]
wikidataGet n m = do
  rows <- wikidataSelect n
  mapM toDocument rows
  where
    toDocument = wikiPageToDocument m
51
52
-- | Build a 'HyperdataDocument' from one 'WikiResult'.
--
-- When the row carries a URL the page is crawled with 'crawlPage'; the first
-- @m@ sections, concatenated, become the abstract.  Without a URL the
-- abstract is the empty text.
--
-- The date is obtained by handing to 'dateSplit' the first available value
-- among: start year, end year, \"florish\" year, first crawled section.
wikiPageToDocument :: NumberOfSections -> WikiResult -> IO HyperdataDocument
wikiPageToDocument m wr = do
  let pageUrl = wr ^. wr_url

  paragraphs <- case pageUrl of
    Just link -> crawlPage link
    Nothing   -> pure []

  -- Candidates in priority order.  'head' is used at a Maybe-returning type
  -- here: its result is itself an element of the list given to 'catMaybes'.
  let dateCandidates =
        catMaybes
          [ wr ^. wr_yearStart
          , wr ^. wr_yearEnd
          , wr ^. wr_yearFlorish
          , head paragraphs
          ]

  (date, (year, month, day)) <- dateSplit EN (head dateCandidates)

  -- Positional constructor: the trailing comment names the slot being filled,
  -- using the names of the bindings that previously held these values.
  pure $ HyperdataDocument
    (Just "wikidata")                     -- bdd
    Nothing                               -- doi
    pageUrl                               -- url
    Nothing                               -- uniqId
    Nothing                               -- uniqIdBdd
    Nothing                               -- page
    (wr ^. wr_title)                      -- title
    Nothing                               -- authors
    Nothing                               -- institutes
    Nothing                               -- source
    (Just (concat (take m paragraphs)))   -- abstract
    (cs . show <$> date)                  -- date, rendered via show
    year
    month
    day
    Nothing                               -- hour
    Nothing                               -- minute
    Nothing                               -- second
    (Just (cs (show EN)))                 -- iso2 (language)
88
89
-- | Send 'wikidataQuery' (with limit @n@) to 'wikidataRoute' and decode each
-- answer row into a 'WikiResult'.  When the query yields 'Nothing' the result
-- is the empty list.
wikidataSelect :: Int -> IO [WikiResult]
wikidataSelect n = do
  response <- selectQueryRaw wikidataRoute (wikidataQuery n)
  pure $ case response of
    Just rows -> map toWikiResult (unbound' EN rows)
    Nothing   -> []
96
97
-- | Apply 'unbound' (for the given language) to every cell of a table of
-- SPARQL binding values, keeping the row/column layout.
unbound' :: Lang -> [[BindingValue]] -> [[Maybe Text]]
unbound' l rows = [ [ unbound l cell | cell <- row ] | row <- rows ]
100
-- | Turn one decoded answer row into a 'WikiResult'.
--
-- The first six cells are used, in order; any further cells are ignored.
-- A row with fewer than six cells makes the function 'panic'.
toWikiResult :: [Maybe Text] -> WikiResult
toWikiResult row = case row of
  cid : label : link : yStart : yEnd : yFlorish : _ ->
    WikiResult cid label link yStart yEnd yFlorish
  _ ->
    panic "[G.C.T.C.Parsers.Wikidata.toWikiResult] error"
104
-- | SPARQL endpoint that 'wikidataSelect' sends its query to.
wikidataRoute :: EndPoint
wikidataRoute = "https://query.wikidata.org/sparql"
107
-- | Text of the SPARQL query, one clause per line, ending with @LIMIT n@.
--
-- The query selects @?cid@, @?title@, @?url@ and three years derived from
-- optional dates (properties P580, P582 and P571), for entities that are
-- @wdt:P31 wd:Q968159@, have an English label, and have an English
-- Wikipedia page.
--
-- NOTE(review): the @rdfs:@ and @xsd:@ prefixes are used without a PREFIX
-- declaration in this text; presumably the endpoint predefines them - confirm.
wikidataQuery :: Int -> String
wikidataQuery n =
  List.unlines (prefixes <> projection <> whereClause <> [limitClause])
  where
    prefixes, projection, whereClause :: [String]

    prefixes =
      [ " PREFIX wd: <http://www.wikidata.org/entity/>"
      , " PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
      , " PREFIX schema: <http://schema.org/>"
      , " PREFIX wikibase: <http://wikiba.se/ontology#>"
      ]

    -- Column order matters: 'toWikiResult' reads the cells positionally.
    projection =
      [ " SELECT DISTINCT "
      , " ?cid"
      , " ?title"
      , " ?url"
      , " (year(xsd:dateTime(?dateStart)) as ?yearStart)"
      , " (year(xsd:dateTime(?dateEnd)) as ?yearEnd)"
      , " (year(xsd:dateTime(?dateFlorish)) as ?yearFlorish) "
      ]

    whereClause =
      [ " WHERE {"
      , " ?cid wdt:P31 wd:Q968159 ."
      , " ?cid rdfs:label ?title filter (lang(?title) = \"en\") ."
      , " "
      , " ?url schema:about ?cid ."
      , " ?url schema:inLanguage \"en\" ."
      , " FILTER (SUBSTR(str(?url), 1, 25) = \"https://en.wikipedia.org/\")"
      , " OPTIONAL {?cid (wdt:P580) ?dateStart .}"
      , " OPTIONAL {?cid (wdt:P582) ?dateEnd .}"
      , " OPTIONAL {?cid (wdt:P571) ?dateFlorish .}"
      , " }"
      ]

    limitClause :: String
    limitClause = " LIMIT " <> (cs $ show n)
134