]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Corpus/Parsers/Wikimedia.hs
add a new trace
[gargantext.git] / src / Gargantext / Text / Corpus / Parsers / Wikimedia.hs
1 {-|
2 Module : Gargantext.Text.Corpus.Parsers.Wikimedia
3 Description : Parser for Wikimedia dump
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 @Gargantext.Text.Corpus.Parsers.Wikimedia@:
11 This module provide a parser for wikipedia dump.
12 This include an xml parser for wikipedia's xml
13 and an wikimedia to plaintext converter for the wikipedia text field
14 -}
15
16 {-# LANGUAGE OverloadedStrings #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18
19 module Gargantext.Text.Corpus.Parsers.Wikimedia
20 where
21
22 import Control.Monad.Catch
23 import Data.Conduit
24 import Data.Either
25 import Data.Text as T
26 import Data.XML.Types (Event, Name)
27 import Gargantext.Prelude
28 import Text.Pandoc
29 import Text.XML.Stream.Parse
30
31 -- | Use case
32 -- :{
33 -- wikimediaFile <- BL.readFile "text.xml"
34 -- _ <- runConduit $ parseLBS def wikimediaFile
35 -- .| force "mediawiki required" parseMediawiki
36 -- .| CL.mapM mediawikiPageToPlain
37 -- .| CL.mapM_ print
38 -- :}
39
40 -- | A simple "Page" type.
41 -- For the moment it takes only text and title
42 -- (since there is no abstract) will see if other data are relevant.
43 data Page =
44 Page { _markupFormat :: MarkupFormat
45 , _title :: Maybe T.Text
46 , _text :: Maybe T.Text
47 }
48 deriving (Show)
49
50 data MarkupFormat = Mediawiki | Plaintext
51 deriving (Show)
52
53 parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
54 parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
55 text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
56 many_ ignoreAnyTreeContent
57 return text
58
59 -- | Utility function that matches everything but the tag given
60 tagUntil :: Name -> NameMatcher Name
61 tagUntil name = matching (/= name)
62
63 -- | Utility function that consumes everything but the tag given
64 -- usefull because we have to consume every data.
65 manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
66 manyTagsUntil_ = many_ . ignoreTreeContent . tagUntil
67
68 manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
69 manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
70
71 -- | Utility function that parses nothing but the tag given,
72 -- usefull because we have to consume every data.
73 ignoreExcept :: MonadThrow m => Name
74 -> ConduitT Event o m b
75 -> ConduitT Event o m (Maybe b)
76 ignoreExcept name f = do
77 _ <- manyTagsUntil_ name
78 tagIgnoreAttrs (matching (== name)) f
79
80 -- TODO: remove ignoreExcept to:
81 -- many ignoreAnyTreeContentUntil "Article"
82 manyTagsUntil :: MonadThrow m => Name
83 -> ConduitT Event o m b
84 -> ConduitT Event o m (Maybe b)
85 manyTagsUntil name f = do
86 _ <- manyTagsUntil_ name
87 tagIgnoreAttrs (matching (== name)) f
88
89
90
91 parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
92 parsePage =
93 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
94 title <-
95 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
96 _ <- manyTagsUntil_ "{http://www.mediawiki.org/xml/export-0.10/}revision"
97 revision <-
98 parseRevision
99 many_ $ ignoreAnyTreeContent
100 return $ Page Mediawiki title revision
101
102 parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
103 parseMediawiki =
104 tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
105 $ manyYield' parsePage
106
107 -- | Convert a Mediawiki Page to a Plaintext Page.
108 -- Need to wrap the result in IO to parse and to combine it.
109 mediawikiPageToPlain :: Page -> IO Page
110 mediawikiPageToPlain page = do
111 title <- mediaToPlain $ _title page
112 revision <- mediaToPlain $ _text page
113 return $ Page Plaintext title revision
114 where mediaToPlain media =
115 case media of
116 (Nothing) -> return Nothing
117 (Just med) -> do
118 res <- runIO $ do
119 doc <- readMediaWiki def med
120 writePlain def doc
121 case res of
122 (Left _) -> return Nothing
123 (Right r) -> return $ Just r