2 Module : Gargantext.Text.Parsers.Wikimedia
3 Description : Parser for Wikimedia dump
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 @Gargantext.Text.Parsers.Wikimedia@:
11 This module provides a parser for Wikipedia dumps.
12 This includes an XML parser for Wikipedia's XML
13 and a Wikimedia-to-plaintext converter for the Wikipedia text field.
16 {-# LANGUAGE OverloadedStrings #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
19 module Gargantext.Text.Parsers.Wikimedia where
20 import Gargantext.Prelude
21 import Text.XML.Stream.Parse
22 import Control.Monad.Catch
24 import Data.XML.Types (Event, Name)
31 -- wikimediaFile <- BL.readFile "text.xml"
32 -- _ <- runConduit $ parseLBS def wikimediaFile
33 -- .| force "mediawiki required" parseMediawiki
34 -- .| CL.mapM mediawikiPageToPlain
38 -- | A simple "Page" type.
39 -- For the moment it holds only the text and the title
40 -- (since there is no abstract); other data may be added if they prove relevant.
-- NOTE(review): the @data Page = Page@ header and the closing of the record
-- are not visible in this excerpt.
43 _markupFormat :: MarkupFormat -- ^ markup in which '_title' and '_text' are written
44 , _title :: Maybe T.Text -- ^ page title, if one was found
45 , _text :: Maybe T.Text -- ^ page text, if one was found
-- | Markup in which the title and the text of a 'Page' are expressed.
data MarkupFormat
  = Mediawiki -- ^ tag used by 'parsePage' for freshly parsed pages
  | Plaintext -- ^ tag used by 'mediawikiPageToPlain' for converted pages
-- | Parse a @revision@ element of the MediaWiki export-0.10 namespace
-- (matched without attributes) and look inside it for the @text@ child,
-- whose content is read with 'content'.
-- 'force' is applied to that lookup with the message "text is missing", and
-- 'ignoreExcept' skips the siblings that precede the @text@ element.
-- NOTE(review): the binding of the text, the combinator applied to
-- 'ignoreAnyTreeContent' and the final return are not visible in this
-- excerpt; presumably the text content is what gets returned. Confirm
-- against the full file.
52 parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
54 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
56 force "text is missing" $ ignoreExcept
57 "{http://www.mediawiki.org/xml/export-0.10/}text" content
59 $ ignoreAnyTreeContent
-- | Build a 'NameMatcher' that accepts every element name except @name@.
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching isOther
  where
    -- Any name different from the excluded one is accepted.
    isOther candidate = candidate /= name
-- | Skip the leading trees whose tag differs from @name@ (via
-- 'consumeExcept'), then run @f@ inside the element called @name@,
-- ignoring that element's attributes.
-- Useful because every piece of data in the stream has to be consumed.
-- The result is whatever 'tagIgnoreAttrs' yields for that element.
ignoreExcept :: MonadThrow m
             => Name
             -> ConduitT Event o m b
             -> ConduitT Event o m (Maybe b)
ignoreExcept name f = do
  consumeExcept name
  tagIgnoreAttrs wanted f
  where
    -- Matches only the element we are actually interested in.
    wanted = matching (== name)
-- | Consume and discard consecutive trees for as long as their tag is not
-- @name@ (see 'tagUntil').
-- Useful because every piece of data in the stream has to be consumed.
consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
consumeExcept name = many_ (ignoreTreeContent (tagUntil name))
-- | Parse a @page@ element of the MediaWiki export-0.10 namespace (matched
-- without attributes) into a 'Page' tagged 'Mediawiki'.
-- The visible steps are: read the content of the @title@ child, skip every
-- tree up to the @revision@ element with 'consumeExcept', discard the
-- remaining trees, and build the 'Page' from @title@ and @revision@.
-- NOTE(review): the lines that bind @title@ and @revision@ are not visible in
-- this excerpt; presumably @revision@ comes from 'parseRevision'. Confirm
-- against the full file.
80 parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
82 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
84 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
86 consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
89 many_ $ ignoreAnyTreeContent
90 return $ Page Mediawiki title revision
-- | Match the @mediawiki@ element of the export-0.10 namespace, ignoring its
-- attributes, and run 'parsePage' repeatedly inside it through 'manyYield'',
-- so that the parsed 'Page' values are emitted downstream (the output type
-- of the conduit is 'Page').
92 parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
94 tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
95 $ manyYield' parsePage
97 -- | Convert a Mediawiki Page to a Plaintext Page.
98 -- The result is wrapped in IO because the conversion needs it to parse
-- the markup and to combine the converted fields.
-- Both '_title' and '_text' go through the local helper @mediaToPlain@; the
-- returned 'Page' is always tagged 'Plaintext'.
-- NOTE(review): the @case@ scaffolding and the step between 'readMediaWiki'
-- and the final @Left@/@Right@ match are not visible in this excerpt.
99 mediawikiPageToPlain :: Page -> IO Page
100 mediawikiPageToPlain page = do
101 title <- mediaToPlain $ _title page
102 revision <- mediaToPlain $ _text page
103 return $ Page Plaintext title revision
104 where mediaToPlain media =
-- An absent field stays absent.
106 (Nothing) -> return Nothing
-- Parse the field as MediaWiki markup with the default reader options.
109 doc <- readMediaWiki def med
-- A failed conversion is dropped silently: the field becomes Nothing.
112 (Left _) -> return Nothing
113 (Right r) -> return $ Just r