3 Description : Porter Algorithm Implementation purely Haskell
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 - source: https://hackage.haskell.org/package/porter
13 - adding Types signatures
18 {-# LANGUAGE NoImplicitPrelude #-}
20 module Gargantext.Ngrams.Stem.En
26 import Data.Text (Text(), pack, unpack)
28 import Data.List hiding (map, head)
30 import Gargantext.Prelude
33 vowels = ['a','e','i','o','u']
35 isConsonant :: [Char] -> Int -> Bool
37 | c `elem` vowels = False
38 | c == 'y' = i == 0 || isVowel str (i - 1)
43 isVowel :: [Char] -> Int -> Bool
44 isVowel = (not .) . isConsonant
46 byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
47 byIndex fun str = fun str [0..length str - 1]
49 containsVowel :: [Char] -> Bool
50 containsVowel = byIndex (any . isVowel)
52 -- | /!\ unsafe fromJust
53 measure :: [Char] -> Int
54 measure = length . filter not . init . (True:)
55 . map fromJust . map head
56 . group . byIndex (map . isConsonant)
59 endsWithDouble :: [Char] -> Bool
60 endsWithDouble = startsWithDouble . reverse
62 startsWithDouble l = case l of
63 (x:y:_) -> x == y && x `notElem` vowels
67 cvc word | length word < 3 = False
68 | otherwise = isConsonant word lastIndex &&
69 isVowel word (lastIndex - 1) &&
70 isConsonant word (lastIndex - 2) &&
71 last word `notElem` ['w','x','y']
72 where lastIndex = length word - 1
74 statefulReplace :: Eq a => ([a] -> Bool)
76 -> Maybe (Data.Either.Either [a] [a])
77 statefulReplace predicate str end replacement
78 | end `isSuffixOf` str = Just replaced
81 part = take (length str - length end) str
82 replaced | predicate part = Right (part ++ replacement)
83 | otherwise = Left str
85 replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
86 replaceEnd predicate str end replacement = do
87 result <- statefulReplace predicate str end replacement
88 return (either identity identity result)
91 :: (Foldable t, Functor t, Eq a) =>
92 ([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
93 findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
95 measureGT :: Int -> [Char] -> Bool
96 measureGT = flip ((>) . measure)
98 step1a :: [Char] -> [Char]
99 step1a word = fromMaybe word result
101 result = findStem (const True) word suffixes
102 suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
104 beforeStep1b :: [Char] -> Either [Char] [Char]
105 beforeStep1b word = fromMaybe (Left word) result
107 cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
108 cond1 x = do { v <- x; return (Left v) }
110 cond1 (replaceEnd (measureGT 0) word "eed" "ee") `mplus`
111 cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
112 cond23 (statefulReplace containsVowel word "ing" "" )
114 afterStep1b :: [Char] -> [Char]
115 afterStep1b word = fromMaybe word result
117 double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) ['l','s','z'])
118 mEq1AndCvc = measure word == 1 && cvc word
119 iif cond val = if cond then Just val else Nothing
120 result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
121 `mplus` iif double (init word)
122 `mplus` iif mEq1AndCvc (word ++ "e")
124 step1b :: [Char] -> [Char]
125 step1b = either identity afterStep1b . beforeStep1b
127 step1c :: [Char] -> [Char]
128 step1c word = fromMaybe word result
129 where result = replaceEnd containsVowel word "y" "i"
131 step1 :: [Char] -> [Char]
132 step1 = step1c . step1b . step1a
134 step2 :: [Char] -> [Char]
135 step2 word = fromMaybe word result
137 result = findStem (measureGT 0) word
138 [ ("ational", "ate" )
148 , ("ization", "ize" )
152 , ("iveness", "ive" )
153 , ("fulness", "ful" )
154 , ("ousness", "ous" )
160 step3 :: [Char] -> [Char]
161 step3 word = fromMaybe word result
163 result = findStem (measureGT 0) word
172 step4 :: [Char] -> [Char]
173 step4 word = fromMaybe word result
175 gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) ['s','t']
176 findGT1 = findStem (measureGT 1) word . map (flip (,) "")
177 result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
178 (findStem gt1andST word [("ion","")]) `mplus`
179 (findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
181 step5a :: [Char] -> [Char]
182 step5a word = fromMaybe word result
184 test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
185 result = replaceEnd test word "e" ""
187 step5b :: [Char] -> [Char]
188 step5b word = fromMaybe word result
190 cond s = last s == 'l' && measureGT 1 s
191 result = replaceEnd cond word "l" ""
193 step5 :: [Char] -> [Char]
194 step5 = step5b . step5a
196 allSteps :: [Char] -> [Char]
197 allSteps = step5 . step4 . step3 . step2 . step1
200 stem s = pack (stem' $ unpack s)
202 stem' :: [Char] -> [Char]
203 stem' s | length s < 3 = s
204 | otherwise = allSteps s
206 fixpoint :: Eq t => (t -> t) -> t -> t
207 fixpoint f x = let fx = f x in
212 fixstem :: [Char] -> [Char]
213 fixstem = fixpoint stem'
220 content <- readFile "input.txt"
221 writeFile "output.txt" $ unlines $ map stem $ lines content