2 Module : Gargantext.Text.Corpus.Parsers.Wikimedia
3 Description : Parser for Wikimedia dump
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 @Gargantext.Text.Corpus.Parsers.Wikimedia@:
11 This module provides a parser for Wikipedia dumps.
12 This includes an XML parser for Wikipedia's XML
13 and a Wikimedia-to-plaintext converter for the Wikipedia text field.
17 module Gargantext.Text.Corpus.Parsers.Wikimedia
20 import Control.Monad.Catch
24 import Data.XML.Types (Event, Name)
25 import Gargantext.Prelude
27 import Text.XML.Stream.Parse
31 -- wikimediaFile <- BL.readFile "text.xml"
32 -- _ <- runConduit $ parseLBS def wikimediaFile
33 -- .| force "mediawiki required" parseMediawiki
34 -- .| CL.mapM mediawikiPageToPlain
38 -- | A simple "Page" type.
39 -- For the moment it holds only the text and the title
40 -- (there is no abstract); we will see later whether other data are relevant.
42 Page { _markupFormat :: MarkupFormat -- ^ Markup format tag of this page.
43 , _title :: Maybe T.Text -- ^ Page title, when one is available.
44 , _text :: Maybe T.Text -- ^ Page text, when one is available.
-- | Tag stored in a 'Page' telling which markup its title and text use.
data MarkupFormat
  = Mediawiki -- ^ Set on pages produced by the dump parser.
  | Plaintext -- ^ Set on pages produced by the plaintext conversion.
-- | Parse a @revision@ element of the MediaWiki export-0.10 namespace
-- (matched with 'tagNoAttr') and read the content of its @text@ child.
51 parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
52 parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
-- 'ignoreExcept' skips the sibling elements that precede @text@; the
-- result is wrapped in 'force' with the message "text is missing", so the
-- @text@ element is treated as mandatory.
53 text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
-- Consume whatever is left inside the revision element.
54 many_ ignoreAnyTreeContent
-- | Build a 'NameMatcher' that accepts every tag name except the given one.
tagUntil :: Name -> NameMatcher Name
tagUntil stop = matching (\candidate -> candidate /= stop)
-- | Skip element trees, one after another, for as long as their tag name
-- differs from the given one.
-- Useful because all of the data has to be consumed.
manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_ stop = many_ (ignoreTreeContent (tagUntil stop))
-- | Variant of 'manyTagsUntil_' built on 'ignoreEmptyTag' instead of
-- 'ignoreTreeContent'.
manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_' stop = many_ (ignoreEmptyTag (tagUntil stop))
-- | Skip every element tree that precedes the given tag, then run the
-- supplied parser on that tag, ignoring its attributes.
-- Useful because all of the data has to be consumed.
-- The result is whatever 'tagIgnoreAttrs' returns for the tag.
ignoreExcept
  :: MonadThrow m
  => Name
  -> ConduitT Event o m b
  -> ConduitT Event o m (Maybe b)
ignoreExcept target parser = do
  manyTagsUntil_ target
  tagIgnoreAttrs (matching (== target)) parser
-- TODO: remove ignoreExcept to:
--   many ignoreAnyTreeContentUntil "Article"

-- | Discard the element trees ahead of the named tag, then parse that tag
-- (attributes ignored) with the supplied parser.
manyTagsUntil
  :: MonadThrow m
  => Name
  -> ConduitT Event o m b
  -> ConduitT Event o m (Maybe b)
manyTagsUntil target parser = do
  skipLeading
  parseTarget
  where
    -- Everything whose tag name is not the target gets consumed first.
    skipLeading = manyTagsUntil_ target
    -- Then the target tag itself is handed to the caller's parser.
    parseTarget = tagIgnoreAttrs (matching (== target)) parser
-- | Parse a @page@ element of the MediaWiki export-0.10 namespace into a
-- 'Page' tagged as 'Mediawiki'.
89 parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
91 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
-- Read the content of the @title@ element.
93 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
-- Skip every sibling element that precedes @revision@.
94 _ <- manyTagsUntil_ "{http://www.mediawiki.org/xml/export-0.10/}revision"
-- Consume whatever is left inside the page element.
97 many_ $ ignoreAnyTreeContent
-- NOTE(review): presumably `title` is the @title@ content read above and
-- `revision` the result of 'parseRevision' — confirm.
98 return $ Page Mediawiki title revision
-- | Parse the root @mediawiki@ element, ignoring its attributes, and run
-- 'parsePage' repeatedly over its children via 'manyYield''.
-- The conduit's output type is 'Page': pages are emitted downstream rather
-- than returned.
100 parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
102 tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
103 $ manyYield' parsePage
105 -- | Convert a Mediawiki Page to a Plaintext Page.
106 -- Need to wrap the result in IO to parse and to combine it.
-- The title and the text both go through the same local conversion
-- ('mediaToPlain'); the resulting page is tagged 'Plaintext'.
107 mediawikiPageToPlain :: Page -> IO Page
108 mediawikiPageToPlain page = do
109 title <- mediaToPlain $ _title page
110 revision <- mediaToPlain $ _text page
111 return $ Page Plaintext title revision
112 where mediaToPlain media =
-- A missing field stays missing.
114 (Nothing) -> return Nothing
-- Read the markup with 'readMediaWiki' using the default options ('def').
117 doc <- readMediaWiki def med
-- NOTE(review): a 'Left' result is discarded and turned into 'Nothing', so
-- conversion failures are silent — confirm this best-effort is intended.
120 (Left _) -> return Nothing
121 (Right r) -> return $ Just r