{-|
Module      : Gargantext.Text.Corpus.Parsers.Wikimedia
Description : Parser for Wikimedia dump
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

@Gargantext.Text.Corpus.Parsers.Wikimedia@:
This module provides a parser for Wikipedia dumps.
It includes an XML parser for Wikipedia's XML export
and a MediaWiki-to-plaintext converter for the Wikipedia text field.
-}
16 {-# LANGUAGE OverloadedStrings #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
19 module Gargantext.Text.Corpus.Parsers.Wikimedia
22 import Control.Monad.Catch
26 import Data.XML.Types (Event, Name)
27 import Gargantext.Prelude
29 import Text.XML.Stream.Parse
33 -- wikimediaFile <- BL.readFile "text.xml"
34 -- _ <- runConduit $ parseLBS def wikimediaFile
35 -- .| force "mediawiki required" parseMediawiki
36 -- .| CL.mapM mediawikiPageToPlain
-- | A simple "Page" type.
-- For the moment it only carries the markup format, the title and the text
-- (there is no abstract); other fields may be added later if they turn out
-- to be relevant.
data Page = Page
  { _markupFormat :: MarkupFormat  -- ^ markup in which '_title' and '_text' are written
  , _title        :: Maybe T.Text  -- ^ page title, when one was found
  , _text         :: Maybe T.Text  -- ^ page text, when one was found
  }
-- | Markup in which the textual fields of a 'Page' are expressed.
data MarkupFormat
  = Mediawiki  -- ^ format assigned by 'parsePage'
  | Plaintext  -- ^ format assigned by 'mediawikiPageToPlain'
-- | Parse a @revision@ element and return the content of its @text@ child.
--
-- The outer 'Maybe' is the result of 'tagNoAttr' on the @revision@ tag.
-- Inside the element the @text@ child is mandatory: it is wrapped in
-- 'force' with the message @"text is missing"@.
parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
parseRevision =
  tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
    text <- force "text is missing"
          $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
    -- Drain whatever follows the text element inside the revision.
    many_ ignoreAnyTreeContent
    -- Hand the extracted text back; without this the block would end in ()
    -- and not match the declared result type.
    return text
-- | Build a 'NameMatcher' that accepts every tag name except the given one.
tagUntil :: Name -> NameMatcher Name
tagUntil excluded = matching isOther
  where
    isOther candidate = candidate /= excluded
-- | Repeatedly skip elements (with their content) whose name differs from
-- the given one. Useful because every piece of data has to be consumed.
manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_ stopName = many_ (ignoreTreeContent (tagUntil stopName))
-- | Variant of 'manyTagsUntil_' that skips with 'ignoreEmptyTag' instead of
-- 'ignoreTreeContent'.
manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_' stopName = many_ $ ignoreEmptyTag $ tagUntil stopName
-- | Utility function that parses nothing but the tag given: every element
-- with a different name is skipped first, then the given tag is parsed
-- (its attributes being ignored) with the supplied parser.
-- Useful because every piece of data has to be consumed.
--
-- This is the same operation as 'manyTagsUntil', so it simply delegates to
-- it instead of repeating the implementation.
ignoreExcept :: MonadThrow m => Name
             -> ConduitT Event o m b
             -> ConduitT Event o m (Maybe b)
ignoreExcept name f = manyTagsUntil name f
-- TODO: remove 'ignoreExcept' in favour of:
-- many ignoreAnyTreeContentUntil "Article"

-- | Skip every element whose name differs from the given one, then parse
-- the element carrying that name (ignoring its attributes) with the
-- supplied parser.
manyTagsUntil :: MonadThrow m => Name
              -> ConduitT Event o m b
              -> ConduitT Event o m (Maybe b)
manyTagsUntil target parser = do
  manyTagsUntil_ target
  tagIgnoreAttrs wanted parser
  where
    -- Matches exactly the requested tag name.
    wanted = matching (== target)
-- | Parse one @page@ element into a 'Page' tagged 'Mediawiki'.
--
-- The title comes from the @title@ child and the text from the @revision@
-- child (see 'parseRevision'); both are kept as 'Maybe' values.
parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
parsePage =
  tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
    title <- tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
    -- Skip the sibling elements that come before the revision.
    _ <- manyTagsUntil_ "{http://www.mediawiki.org/xml/export-0.10/}revision"
    revision <- parseRevision
    -- Drain whatever is left inside the page element.
    many_ $ ignoreAnyTreeContent
    return $ Page Mediawiki title revision
-- | Parse the root @mediawiki@ element (attributes ignored) and yield a
-- 'Page' downstream for each page recognised by 'parsePage'.
parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki =
  tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
    $ manyYield' parsePage
107 -- | Convert a Mediawiki Page to a Plaintext Page.
108 -- Need to wrap the result in IO to parse and to combine it.
--
-- Both '_title' and '_text' of the input are run through the local helper
-- @mediaToPlain@, and the results are reassembled into a new 'Page'.
--
-- NOTE(review): the body of @mediaToPlain@ looks incomplete here: the
-- @case@ heads, the binding of @med@ and the expression producing the
-- 'Left'/'Right' value are not visible. Confirm against version control
-- before changing this function.
109 mediawikiPageToPlain :: Page -> IO Page
110 mediawikiPageToPlain page = do
111 title <- mediaToPlain $ _title page
112 revision <- mediaToPlain $ _text page
-- The result is tagged 'Plaintext' regardless of the input's '_markupFormat'.
113 return $ Page Plaintext title revision
114 where mediaToPlain media =
-- An absent field stays absent.
116 (Nothing) -> return Nothing
-- Read the field as MediaWiki markup with default options.
119 doc <- readMediaWiki def med
-- A 'Left' result becomes 'Nothing' (its payload is discarded);
-- a 'Right' payload is returned wrapped in 'Just'.
122 (Left _) -> return Nothing
123 (Right r) -> return $ Just r