]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/Wikimedia.hs
Merge branch '68-dev-garg-v3-csv-parser' of ssh://gitlab.iscpif.fr:20022/gargantext...
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / Wikimedia.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.Wikimedia
3 Description : Parser for Wikimedia dump
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 @Gargantext.Core.Text.Corpus.Parsers.Wikimedia@:
11 This module provide a parser for wikipedia dump.
12 This include an xml parser for wikipedia's xml
13 and an wikimedia to plaintext converter for the wikipedia text field
14 -}
15
16
17 module Gargantext.Core.Text.Corpus.Parsers.Wikimedia
18 where
19
20 import Control.Monad.Catch
21 import Data.Conduit
22 import Data.Either
23 import Data.Text as T
24 import Data.XML.Types (Event, Name)
25 import Gargantext.Prelude
26 import Text.Pandoc
27 import Text.XML.Stream.Parse
28
29 -- | Use case
30 -- :{
31 -- wikimediaFile <- BL.readFile "text.xml"
32 -- _ <- runConduit $ parseLBS def wikimediaFile
33 -- .| force "mediawiki required" parseMediawiki
34 -- .| CL.mapM mediawikiPageToPlain
35 -- .| CL.mapM_ print
36 -- :}
37
38 -- | A simple "Page" type.
39 -- For the moment it takes only text and title
40 -- (since there is no abstract) will see if other data are relevant.
41 data Page =
42 Page { _markupFormat :: MarkupFormat
43 , _title :: Maybe T.Text
44 , _text :: Maybe T.Text
45 }
46 deriving (Show)
47
48 data MarkupFormat = Mediawiki | Plaintext
49 deriving (Show)
50
51 parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
52 parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
53 text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
54 many_ ignoreAnyTreeContent
55 return text
56
57 -- | Utility function that matches everything but the tag given
58 tagUntil :: Name -> NameMatcher Name
59 tagUntil name = matching (/= name)
60
61 -- | Utility function that consumes everything but the tag given
62 -- usefull because we have to consume every data.
63 manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
64 manyTagsUntil_ n = many_ (ignoreTree (tagUntil n) ignoreAttrs)
65
66 manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
67 manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
68
69 -- | Utility function that parses nothing but the tag given,
70 -- usefull because we have to consume every data.
71 ignoreExcept :: MonadThrow m => Name
72 -> ConduitT Event o m b
73 -> ConduitT Event o m (Maybe b)
74 ignoreExcept name f = do
75 _ <- manyTagsUntil_ name
76 tagIgnoreAttrs (matching (== name)) f
77
78 -- TODO: remove ignoreExcept to:
79 -- many ignoreAnyTreeContentUntil "Article"
80 manyTagsUntil :: MonadThrow m => Name
81 -> ConduitT Event o m b
82 -> ConduitT Event o m (Maybe b)
83 manyTagsUntil name f = do
84 _ <- manyTagsUntil_ name
85 tagIgnoreAttrs (matching (== name)) f
86
87
88
89 parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
90 parsePage =
91 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
92 title <-
93 tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
94 _ <- manyTagsUntil_ "{http://www.mediawiki.org/xml/export-0.10/}revision"
95 revision <-
96 parseRevision
97 many_ $ ignoreAnyTreeContent
98 return $ Page Mediawiki title revision
99
100 parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
101 parseMediawiki =
102 tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
103 $ manyYield' parsePage
104
105 -- | Convert a Mediawiki Page to a Plaintext Page.
106 -- Need to wrap the result in IO to parse and to combine it.
107 mediawikiPageToPlain :: Page -> IO Page
108 mediawikiPageToPlain page = do
109 title <- mediaToPlain $ _title page
110 revision <- mediaToPlain $ _text page
111 return $ Page Plaintext title revision
112 where mediaToPlain media =
113 case media of
114 (Nothing) -> return Nothing
115 (Just med) -> do
116 res <- runIO $ do
117 doc <- readMediaWiki def med
118 writePlain def doc
119 case res of
120 (Left _) -> return Nothing
121 (Right r) -> return $ Just r