]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Prepare.hs
[FEAT] Backend NLP French tested
[gargantext.git] / src / Gargantext / Core / Text / Prepare.hs
1 {-|
2 Module : Gargantext.Core.Text.Prepare
3 Description : Tools to clean text
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Clean texts before importing them.
11
12 For a given language, choose a great masterpiece of literature to analyze
13 with GarganText. Here is an example with a famous French writer
14 who could be the incarnation of the mythical Gargantua.
15
16 -}
17
18 {-# LANGUAGE OverloadedStrings #-}
19
20 module Gargantext.Core.Text.Prepare
21 where
22
23 import Data.Text (Text)
24 import Gargantext.Core.Text (sentences)
25 import Gargantext.Prelude
26 import qualified Data.List as List
27 import qualified Data.Text as Text
28
29
30 ---------------------------------------------------------------------
-- | Normalise a raw text and cut it into blocks.
--
-- The text is scrubbed (carriage returns and @--@ removed, underscores
-- turned into spaces), split into lines, merged into paragraphs with
-- 'toParagraphs', stripped of empty entries, and finally regrouped by
-- 'groupText' according to the requested 'Paragraph' strategy.
prepareText :: Paragraph -> Text -> [Text]
prepareText p txt =
  (groupText p . dropBlank . toParagraphs . Text.lines . scrub) txt
  where
    -- Character-level clean-up; the right-most replacement runs first.
    scrub :: Text -> Text
    scrub =
        Text.replace "_" " "    -- some texts seem to be underlined
      . Text.replace "--" ""    -- removing bullets such as those of dialogs
      . Text.replace "\xd" ""   -- drop carriage returns

    -- Remove the empty entries left between paragraphs.
    dropBlank :: [Text] -> [Text]
    dropBlank = List.filter (/= "")
39
40 ---------------------------------------------------------------------
41
-- | Pick the grouping function matching the requested 'Paragraph' strategy:
-- 'groupUniform' (with the given block size) for 'Uniform', 'groupLines'
-- for 'AuthorLike'.
groupText :: Paragraph -> [Text] -> [Text]
groupText strategy = case strategy of
  Uniform blockSize -> groupUniform blockSize
  AuthorLike        -> groupLines
45
46 ---------------------------------------------------------------------
-- | How a text is cut into blocks.
data Paragraph
  = Uniform Grain
    -- ^ Does not preserve the paragraphs of the author, but the length of
    -- the resulting paragraphs is uniform.
  | AuthorLike
    -- ^ Preserves the paragraphs of the author, but the length of the
    -- resulting paragraphs is not uniform.
50
-- | Regroup a text into blocks of @g@ sentences, each block being the
-- space-separated concatenation of its sentences.
--
-- Grain: number of sentences per block of text.
-- Step : overlap of sentences between neighbouring blocks of text.
--        'chunkAlong' receives the grain for both of its arguments here
--        (presumably size and step, i.e. no overlap between blocks --
--        confirm against the definition of 'chunkAlong').
groupUniform :: Grain -> [Text] -> [Text]
groupUniform g ts = map Text.unwords
                  $ chunkAlong g g
                  $ sentences
                  -- Join the input pieces with a space: plain concatenation
                  -- glued the last word of one piece to the first word of
                  -- the next one before sentence splitting.
                  $ Text.unwords ts
58
-- | Merge short lines with their successor.
--
-- At each step the threshold is the rounded mean length of the lines still
-- to be processed (including the current one):
--
-- * a head line longer than the threshold is emitted as is;
-- * otherwise it is joined (with a space) to the next line; the result is
--   emitted if it exceeds the threshold, or pushed back to be merged further.
groupLines :: [Text] -> [Text]
groupLines []  = []
groupLines [l] = [l]
groupLines remaining@(l : next : rest)
  | Text.length l > threshold      = l : groupLines (next : rest)
  | Text.length merged > threshold = merged : groupLines rest
  | otherwise                      = groupLines (merged : rest)
  where
    merged    = l <> " " <> next
    threshold = round (mean lineLengths)

    lineLengths :: [Double]
    lineLengths = map (fromIntegral . Text.length) remaining
73
-- | Sample run of 'groupLines' on a small fixed input. It returns the
-- grouped lines; nothing is compared against an expected value here.
groupLines_test :: [Text]
groupLines_test = groupLines ["abxxxx", "bc", "cxxx", "d"]
78
79 ---------------------------------------------------------------------
-- | Merge consecutive non-empty lines into paragraphs.
--
-- Each maximal run of non-empty lines becomes a single 'Text' whose lines
-- are separated by one space. Empty lines are kept untouched, one output
-- entry per empty line, so they still mark the paragraph boundaries.
--
-- Every run is joined in one pass with 'Text.unwords' instead of
-- re-appending the accumulated paragraph for each additional line.
toParagraphs :: [Text] -> [Text]
toParagraphs = List.concatMap joinRun . List.groupBy sameKind
  where
    -- Two neighbouring lines belong to the same run when both are empty or
    -- both are non-empty.
    sameKind :: Text -> Text -> Bool
    sameKind l r = Text.null l == Text.null r

    -- A run of empty lines is passed through; a run of text lines is joined.
    joinRun :: [Text] -> [Text]
    joinRun [] = []
    joinRun run@(l : _)
      | Text.null l = run
      | otherwise   = [Text.unwords run]
89
90 -- Tests
91
92 -- TODO for internships: Property tests
-- | Example-based check of 'toParagraphs': runs of non-empty lines are
-- joined with spaces while the empty separator lines are kept.
toParagraphs_test :: Bool
toParagraphs_test = toParagraphs input == expected
  where
    input, expected :: [Text]
    input    = ["a","b","","c","d","d","","e","f","","g","h",""]
    expected = [ "a b", "", "c d d", "", "e f", "", "g h", ""]
97
98
99
100
101