2 Module : Gargantext.Core.Text.Corpus.Parsers.Wikimedia
3 Description : Parser for Wikimedia dump
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 @Gargantext.Core.Text.Corpus.Parsers.Wikimedia@:
11 This module provide a parser for wikipedia dump.
12 This include an xml parser for wikipedia's xml
13 and an wikimedia to plaintext converter for the wikipedia text field
17 module Gargantext.Core.Text.Corpus.Parsers.Wikimedia
20 import Control.Monad.Catch
24 import Data.XML.Types (Event, Name)
25 import Gargantext.Prelude
27 import Text.XML.Stream.Parse
31 -- wikimediaFile <- BL.readFile "text.xml"
32 -- _ <- runConduit $ parseLBS def wikimediaFile
33 -- .| force "mediawiki required" parseMediawiki
34 -- .| CL.mapM mediawikiPageToPlain
38 -- | A simple "Page" type.
39 -- For the moment it takes only text and title
40 -- (since there is no abstract) will see if other data are relevant.
42 Page { _markupFormat :: MarkupFormat
43 , _title :: Maybe T.Text
44 , _text :: Maybe T.Text
48 data MarkupFormat = Mediawiki | Plaintext
51 parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
52 parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
53 text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
54 many_ ignoreAnyTreeContent
57 -- | Utility function that matches everything but the tag given
58 tagUntil :: Name -> NameMatcher Name
59 tagUntil name = matching (/= name)
61 -- | Utility function that consumes everything but the tag given
62 -- usefull because we have to consume every data.
63 manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
64 manyTagsUntil_ n = many_ (ignoreTree (tagUntil n) ignoreAttrs)
66 manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
67 manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
69 -- | Utility function that parses nothing but the tag given,
70 -- usefull because we have to consume every data.
71 ignoreExcept :: MonadThrow m => Name
72 -> ConduitT Event o m b
73 -> ConduitT Event o m (Maybe b)
74 ignoreExcept name f = do
75 _ <- manyTagsUntil_ name
76 tagIgnoreAttrs (matching (== name)) f
78 -- TODO: remove ignoreExcept to:
79 -- many ignoreAnyTreeContentUntil "Article"
80 manyTagsUntil :: MonadThrow m => Name
81 -> ConduitT Event o m b
82 -> ConduitT Event o m (Maybe b)
83 manyTagsUntil name f = do
84 _ <- manyTagsUntil_ name
85 tagIgnoreAttrs (matching (== name)) f
89 parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
91 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
93 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
94 _ <- manyTagsUntil_ "{http://www.mediawiki.org/xml/export-0.10/}revision"
97 many_ $ ignoreAnyTreeContent
98 return $ Page { _markupFormat = Mediawiki
102 parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
104 tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
105 $ manyYield' parsePage
107 -- | Convert a Mediawiki Page to a Plaintext Page.
108 -- Need to wrap the result in IO to parse and to combine it.
109 mediawikiPageToPlain :: Page -> IO Page
110 mediawikiPageToPlain page = do
111 title <- mediaToPlain $ _title page
112 revision <- mediaToPlain $ _text page
113 return $ Page { _markupFormat = Plaintext, _title = title, _text = revision }
114 where mediaToPlain media =
116 (Nothing) -> return Nothing
119 doc <- readMediaWiki def med
122 (Left _) -> return Nothing
123 (Right r) -> return $ Just r