2 Module : Gargantext.Text.Ngrams.Stem.En
3 Description : Porter Algorithm Implementation purely Haskell
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 - source: https://hackage.haskell.org/package/porter
13 - adding Types signatures
19 module Gargantext.Text.Terms.Mono.Stem.En (stemIt)
25 import Data.Text (Text(), pack, unpack)
27 import Data.List hiding (map, head)
29 import Gargantext.Prelude
32 vowels = ['a','e','i','o','u']
34 isConsonant :: [Char] -> Int -> Bool
36 | c `elem` vowels = False
37 | c == 'y' = i == 0 || isVowel str (i - 1)
42 isVowel :: [Char] -> Int -> Bool
43 isVowel = (not .) . isConsonant
45 byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
46 byIndex fun str = fun str [0..length str - 1]
48 containsVowel :: [Char] -> Bool
49 containsVowel = byIndex (any . isVowel)
51 -- | /!\ unsafe fromJust
52 measure :: [Char] -> Int
53 measure = length . filter not . init . (True:)
54 . map fromJust . map head
55 . group . byIndex (map . isConsonant)
58 endsWithDouble :: [Char] -> Bool
59 endsWithDouble = startsWithDouble . reverse
61 startsWithDouble l = case l of
62 (x:y:_) -> x == y && x `notElem` vowels
66 cvc word | length word < 3 = False
67 | otherwise = isConsonant word lastIndex &&
68 isVowel word (lastIndex - 1) &&
69 isConsonant word (lastIndex - 2) &&
70 last word `notElem` ['w','x','y']
71 where lastIndex = length word - 1
73 statefulReplace :: Eq a => ([a] -> Bool)
75 -> Maybe (Data.Either.Either [a] [a])
76 statefulReplace predicate str end replacement
77 | end `isSuffixOf` str = Just replaced
80 part = take (length str - length end) str
81 replaced | predicate part = Right (part ++ replacement)
82 | otherwise = Left str
84 replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
85 replaceEnd predicate str end replacement = do
86 result <- statefulReplace predicate str end replacement
87 return (either identity identity result)
90 :: (Foldable t, Functor t, Eq a) =>
91 ([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
92 findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
94 measureGT :: Int -> [Char] -> Bool
95 measureGT = flip ((>) . measure)
97 step1a :: [Char] -> [Char]
98 step1a word = fromMaybe word result
100 result = findStem (const True) word suffixes
101 suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
103 beforeStep1b :: [Char] -> Either [Char] [Char]
104 beforeStep1b word = fromMaybe (Left word) result
106 cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
107 cond1 x = do { v <- x; return (Left v) }
109 cond1 (replaceEnd (measureGT 0) word "eed" "ee") `mplus`
110 cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
111 cond23 (statefulReplace containsVowel word "ing" "" )
113 afterStep1b :: [Char] -> [Char]
114 afterStep1b word = fromMaybe word result
116 double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) ['l','s','z'])
117 mEq1AndCvc = measure word == 1 && cvc word
118 iif cond val = if cond then Just val else Nothing
119 result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
120 `mplus` iif double (init word)
121 `mplus` iif mEq1AndCvc (word ++ "e")
123 step1b :: [Char] -> [Char]
124 step1b = either identity afterStep1b . beforeStep1b
126 step1c :: [Char] -> [Char]
127 step1c word = fromMaybe word result
128 where result = replaceEnd containsVowel word "y" "i"
130 step1 :: [Char] -> [Char]
131 step1 = step1c . step1b . step1a
133 step2 :: [Char] -> [Char]
134 step2 word = fromMaybe word result
136 result = findStem (measureGT 0) word
137 [ ("ational", "ate" )
147 , ("ization", "ize" )
151 , ("iveness", "ive" )
152 , ("fulness", "ful" )
153 , ("ousness", "ous" )
159 step3 :: [Char] -> [Char]
160 step3 word = fromMaybe word result
162 result = findStem (measureGT 0) word
171 step4 :: [Char] -> [Char]
172 step4 word = fromMaybe word result
174 gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) ['s','t']
175 findGT1 = findStem (measureGT 1) word . map (flip (,) "")
176 result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
177 (findStem gt1andST word [("ion","")]) `mplus`
178 (findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
180 step5a :: [Char] -> [Char]
181 step5a word = fromMaybe word result
183 test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
184 result = replaceEnd test word "e" ""
186 step5b :: [Char] -> [Char]
187 step5b word = fromMaybe word result
189 cond s = last s == 'l' && measureGT 1 s
190 result = replaceEnd cond word "l" ""
192 step5 :: [Char] -> [Char]
193 step5 = step5b . step5a
195 allSteps :: [Char] -> [Char]
196 allSteps = step5 . step4 . step3 . step2 . step1
198 stemIt :: Text -> Text
199 stemIt s = pack (stem' $ unpack s)
201 stem' :: [Char] -> [Char]
202 stem' s | length s < 3 = s
203 | otherwise = allSteps s
205 --fixpoint :: Eq t => (t -> t) -> t -> t
206 --fixpoint f x = let fx = f x in
209 -- else fixpoint f fx
211 --fixstem :: [Char] -> [Char]
212 --fixstem = fixpoint stem'
219 content <- readFile "input.txt"
220 writeFile "output.txt" $ unlines $ map stem $ lines content