]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/Wikimedia.hs
[FLOW][DB][Ngrams] group function.
[gargantext.git] / src / Gargantext / Text / Parsers / Wikimedia.hs
1 {-|
2 Module : Gargantext.Text.Parsers.WOS
3 Description : Parser for Wikimedia dump
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 @Gargantext.Text.Parsers.Wikimedia@:
11 This module provide a parser for wikipedia dump.
12 This include an xml parser for wikipedia's xml
13 and an wikimedia to plaintext converter for the wikipedia text field
14 -}
15
16 {-# LANGUAGE OverloadedStrings #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18
19 module Gargantext.Text.Parsers.Wikimedia where
20 import Gargantext.Prelude
21 import Text.XML.Stream.Parse
22 import Control.Monad.Catch
23 import Data.Conduit
24 import Data.XML.Types (Event, Name)
25 import Text.Pandoc
26 import Data.Text as T
27 import Data.Either
28
29 -- | Use case
30 -- :{
31 -- wikimediaFile <- BL.readFile "text.xml"
32 -- _ <- runConduit $ parseLBS def wikimediaFile
33 -- .| force "mediawiki required" parseMediawiki
34 -- .| CL.mapM mediawikiPageToPlain
35 -- .| CL.mapM_ print
36 -- :}
37
38 -- | A simple "Page" type.
39 -- For the moment it takes only text and title
40 -- (since there is no abstract) will see if other data are relevant.
41 data Page = Page
42 {
43 _markupFormat :: MarkupFormat
44 , _title :: Maybe T.Text
45 , _text :: Maybe T.Text
46 }
47 deriving (Show)
48
49 data MarkupFormat = Mediawiki | Plaintext
50 deriving (Show)
51
52 parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
53 parseRevision =
54 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
55 text <-
56 force "text is missing" $ ignoreExcept
57 "{http://www.mediawiki.org/xml/export-0.10/}text" content
58 many_
59 $ ignoreAnyTreeContent
60 return text
61
62 -- | Utility function that match everything but the tag given
63 tagUntil :: Name -> NameMatcher Name
64 tagUntil name = matching (/= name)
65
66 -- | Utility function that parse nothing but the tag given,
67 -- usefull because we have to consume every data.
68 ignoreExcept :: MonadThrow m => Name
69 -> ConduitT Event o m b
70 -> ConduitT Event o m (Maybe b)
71 ignoreExcept name f = do
72 _ <- consumeExcept name
73 tagIgnoreAttrs (matching (==name)) f
74
75 -- | Utility function that consume everything but the tag given
76 -- usefull because we have to consume every data.
77 consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
78 consumeExcept = many_ . ignoreTreeContent . tagUntil
79
80 parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
81 parsePage =
82 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
83 title <-
84 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
85 _ <-
86 consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
87 revision <-
88 parseRevision
89 many_ $ ignoreAnyTreeContent
90 return $ Page Mediawiki title revision
91
92 parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
93 parseMediawiki =
94 tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
95 $ manyYield' parsePage
96
97 -- | Convert a Mediawiki Page to a Plaintext Page.
98 -- Need to wrap the result in IO to parse and to combine it.
99 mediawikiPageToPlain :: Page -> IO Page
100 mediawikiPageToPlain page = do
101 title <- mediaToPlain $ _title page
102 revision <- mediaToPlain $ _text page
103 return $ Page Plaintext title revision
104 where mediaToPlain media =
105 case media of
106 (Nothing) -> return Nothing
107 (Just med) -> do
108 res <- runIO $ do
109 doc <- readMediaWiki def med
110 writePlain def doc
111 case res of
112 (Left _) -> return Nothing
113 (Right r) -> return $ Just r