2 Module : Gargantext.Core.Text.Ngrams.Stem.En
3 Description : Porter Algorithm Implementation purely Haskell
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 - source: https://hackage.haskell.org/package/porter
13 - adding Types signatures
19 module Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
24 import Data.List ((!!))
26 import Data.Text (Text(), pack, unpack)
27 import Gargantext.Prelude
28 import qualified Data.List as List hiding (map, head)
31 vowels = ['a','e','i','o','u']
33 isConsonant :: [Char] -> Int -> Bool
35 | c `elem` vowels = False
36 | c == 'y' = i == 0 || isVowel str (i - 1)
41 isVowel :: [Char] -> Int -> Bool
42 isVowel = (not .) . isConsonant
44 byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
45 byIndex fun str = fun str [0..length str - 1]
47 containsVowel :: [Char] -> Bool
48 containsVowel = byIndex (any . isVowel)
50 -- | /!\ unsafe fromJust
51 measure :: [Char] -> Int
52 measure = length . filter not . List.init . (True:)
53 . map fromJust . map head
54 . List.group . byIndex (map . isConsonant)
57 endsWithDouble :: [Char] -> Bool
58 endsWithDouble = startsWithDouble . reverse
60 startsWithDouble l = case l of
61 (x:y:_) -> x == y && x `List.notElem` vowels
65 cvc word | length word < 3 = False
66 | otherwise = isConsonant word lastIndex &&
67 isVowel word (lastIndex - 1) &&
68 isConsonant word (lastIndex - 2) &&
69 List.last word `List.notElem` ['w','x','y']
70 where lastIndex = length word - 1
72 statefulReplace :: Eq a => ([a] -> Bool)
74 -> Maybe (Data.Either.Either [a] [a])
75 statefulReplace predicate str end replacement
76 | end `List.isSuffixOf` str = Just replaced
79 part = take (length str - length end) str
80 replaced | predicate part = Right (part <> replacement)
81 | otherwise = Left str
83 replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
84 replaceEnd predicate str end replacement = do
85 result <- statefulReplace predicate str end replacement
86 return (either identity identity result)
89 :: (Foldable t, Functor t, Eq a) =>
90 ([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
91 findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
93 measureGT :: Int -> [Char] -> Bool
94 measureGT = flip ((>) . measure)
96 step1a :: [Char] -> [Char]
97 step1a word = fromMaybe word result
99 result = findStem (const True) word suffixes
100 suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
102 beforeStep1b :: [Char] -> Either [Char] [Char]
103 beforeStep1b word = fromMaybe (Left word) result
105 cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
106 cond1 x = do { v <- x; return (Left v) }
108 cond1 (replaceEnd (measureGT 0) word "eed" "ee") `mplus`
109 cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
110 cond23 (statefulReplace containsVowel word "ing" "" )
112 afterStep1b :: [Char] -> [Char]
113 afterStep1b word = fromMaybe word result
115 double = endsWithDouble word && not (any ((`List.isSuffixOf` word) . return) ['l','s','z'])
116 mEq1AndCvc = measure word == 1 && cvc word
117 iif cond val = if cond then Just val else Nothing
118 result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
119 `mplus` iif double (List.init word)
120 `mplus` iif mEq1AndCvc (word <> "e")
122 step1b :: [Char] -> [Char]
123 step1b = either identity afterStep1b . beforeStep1b
125 step1c :: [Char] -> [Char]
126 step1c word = fromMaybe word result
127 where result = replaceEnd containsVowel word "y" "i"
129 step1 :: [Char] -> [Char]
130 step1 = step1c . step1b . step1a
132 step2 :: [Char] -> [Char]
133 step2 word = fromMaybe word result
135 result = findStem (measureGT 0) word
136 [ ("ational", "ate" )
146 , ("ization", "ize" )
150 , ("iveness", "ive" )
151 , ("fulness", "ful" )
152 , ("ousness", "ous" )
158 step3 :: [Char] -> [Char]
159 step3 word = fromMaybe word result
161 result = findStem (measureGT 0) word
170 step4 :: [Char] -> [Char]
171 step4 word = fromMaybe word result
173 gt1andST str = (measureGT 1) str && any ((`List.isSuffixOf` str) . return) ['s','t']
174 findGT1 = findStem (measureGT 1) word . map (flip (,) "")
175 result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
176 (findStem gt1andST word [("ion","")]) `mplus`
177 (findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
179 step5a :: [Char] -> [Char]
180 step5a word = fromMaybe word result
182 test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
183 result = replaceEnd test word "e" ""
185 step5b :: [Char] -> [Char]
186 step5b word = fromMaybe word result
188 cond s = List.last s == 'l' && measureGT 1 s
189 result = replaceEnd cond word "l" ""
191 step5 :: [Char] -> [Char]
192 step5 = step5b . step5a
194 allSteps :: [Char] -> [Char]
195 allSteps = step5 . step4 . step3 . step2 . step1
197 stemIt :: Text -> Text
198 stemIt s = pack (stem' $ unpack s)
200 stem' :: [Char] -> [Char]
201 stem' s | length s < 3 = s
202 | otherwise = allSteps s
204 --fixpoint :: Eq t => (t -> t) -> t -> t
205 --fixpoint f x = let fx = f x in
208 -- else fixpoint f fx
210 --fixstem :: [Char] -> [Char]
211 --fixstem = fixpoint stem'
218 content <- readFile "input.txt"
219 writeFile "output.txt" $ unlines $ map stem $ lines content