2 Module : Gargantext.Core.Text.Prepare
3 Description : Tools to clean text
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Clean texts before importing them.
12 For a given language, choose a great masterpiece of literature to analyze
13 with GarganText. Here is an example with a famous French writer
14 who could be the incarnation of the mythical Gargantua.
18 {-# LANGUAGE OverloadedStrings #-}
20 module Gargantext.Core.Text.Prepare
23 import Data.Text (Text)
24 import Gargantext.Core.Text (sentences)
25 import Gargantext.Prelude
26 import qualified Data.List as List
27 import qualified Data.Text as Text
30 ---------------------------------------------------------------------
-- | Clean a raw text and split it into blocks according to the given
-- 'Paragraph' strategy.
--
-- The visible clean-up steps, applied from the bottom up, are: dropping
-- carriage returns, removing "--" sequences and turning underscores into
-- spaces.  The cleaned text is then passed on towards 'groupText'.
--
-- NOTE(review): 'groupText' expects a list of 'Text'; the step that splits
-- the cleaned text into pieces is not visible in this excerpt - confirm
-- against the full file.
31 prepareText :: Paragraph -> Text -> [Text]
32 prepareText p txt = groupText p
36 $ Text.replace "_" " " -- some texts seem to use underscores as underlining; turn them into spaces
37 $ Text.replace "--" "" -- remove the double dashes used as bullets, e.g. in dialogues
38 $ Text.replace "\xd" "" txt -- drop carriage returns (U+000D)
40 ---------------------------------------------------------------------
-- | Pick the grouping function that matches the requested 'Paragraph'
-- strategy: 'groupUniform' (with the carried grain) for 'Uniform',
-- 'groupLines' for 'AuthorLike'.
groupText :: Paragraph -> [Text] -> [Text]
groupText strategy = case strategy of
  Uniform grain -> groupUniform grain
  AuthorLike    -> groupLines
46 ---------------------------------------------------------------------
-- | Strategy used to cut a text into paragraphs.
data Paragraph
  = Uniform Grain
    -- ^ Does not preserve the paragraphs of the author, but the length of
    -- the paragraphs is uniform.
  | AuthorLike
    -- ^ Preserves the paragraphs of the author, but the length of the
    -- paragraphs is not uniform.
-- | Regroup a list of texts into blocks; the pieces of each block are joined
-- with a single space.
--
-- NOTE(review): how the pieces are produced from @ts@ and sized by @g@ is not
-- visible in this excerpt; presumably @g@ is the number of sentences per
-- block, as described below - confirm against the full file.
--
51 -- Grain: number of sentences per block of text
52 -- Step : overlap, in sentences, between adjacent blocks of text
--        (no Step parameter appears in the signature below)
53 groupUniform :: Grain -> [Text] -> [Text]
54 groupUniform g ts = map (Text.intercalate " ")
-- | Greedily merge consecutive lines: a line (or a run of merged lines) is
-- emitted as a block once its length exceeds a threshold, @moyenne@
-- ("mean" in French).
--
-- NOTE(review): the definition of @moyenne@ is only partly visible here; it is
-- derived from the lengths (as 'Double's) of all elements of the list
-- currently being processed - presumably their mean.  Clauses for lists
-- shorter than two elements are not visible in this excerpt either.
59 groupLines :: [Text] -> [Text]
60 groupLines xxx@(a:b:xs) =
-- A line that is already longer than the threshold forms a block on its own.
61 if Text.length a > moyenne
62 then [a] <> (groupLines (b:xs))
-- Otherwise glue it to the next line with a single space ...
63 else let ab = a <> " " <> b in
-- ... and emit the merged line if it is now long enough,
64 if Text.length ab > moyenne
65 then [ab] <> (groupLines xs)
-- or keep accumulating by putting the merged line back at the head of the list.
66 else groupLines ([ab] <> xs)
-- Lengths of every element of the current list, feeding the threshold computation.
70 $ (map (fromIntegral . Text.length) xxx :: [Double])
-- | Sample run of 'groupLines' on a small hard-coded input.  The result is
-- only computed, not compared against an expected value.
74 groupLines_test :: [Text]
75 groupLines_test = groupLines theData
-- Fixture for the sample run above.
77 theData = ["abxxxx", "bc", "cxxx", "d"]
79 ---------------------------------------------------------------------
-- | Merge neighbouring lines into paragraphs, joining them with a single
-- space.
--
-- The example in 'toParagraphs_test' expects runs of non-empty lines to be
-- joined while empty strings are kept as separate elements.
--
-- NOTE(review): the conditions guarding the branches below, and any clause
-- for the empty list, are not visible in this excerpt - confirm against the
-- full file.
80 toParagraphs :: [Text] -> [Text]
81 toParagraphs (a:x:xs) =
-- Two branches emit @a@ unchanged and continue with the rest of the list ...
83 then [a] <> toParagraphs (x:xs)
85 then [a] <> toParagraphs (x:xs)
-- ... the remaining one glues @a@ and @x@ together and retries with the merged line in front.
86 else toParagraphs $ [a <> " " <> x ] <> xs
-- A single remaining line is returned as is.
87 toParagraphs [a] = [a]
-- | Example-based check of 'toParagraphs': consecutive non-empty strings are
-- expected to be joined with single spaces, and empty strings to be kept as
-- separate elements.
92 -- TODO for internships: add property tests
93 toParagraphs_test :: Bool
95 toParagraphs ["a","b","","c","d","d","","e","f","","g","h",""]
96 == [ "a b", "", "c d d", "", "e f", "", "g h", ""]