]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams/Stem/En.hs
[FEAT/STEM] copying Porter lib.
[gargantext.git] / src / Gargantext / Ngrams / Stem / En.hs
1 module Language.Porter (stem, fixstem)
2 where
3
4 import Control.Monad
5 import Data.Maybe
6 import Data.List
7
8 isConsonant str i
9 | c `elem` "aeiou" = False
10 | c == 'y' = i == 0 || isVowel str (i - 1)
11 | otherwise = True
12 where
13 c = str !! i
14
15 isVowel = (not .) . isConsonant
16
17 byIndex fun str = fun str [0..length str - 1]
18
19 measure = length . filter not . init . (True:) . map head . group . byIndex (map . isConsonant)
20
21 containsVowel = byIndex (any . isVowel)
22
23 endsWithDouble = startsWithDouble . reverse
24 where
25 startsWithDouble l | length l < 2 = False
26 | otherwise = let (x:y:_) = l in x == y && x `notElem` "aeiou"
27
28 cvc word | length word < 3 = False
29 | otherwise = isConsonant word lastIndex &&
30 isVowel word (lastIndex - 1) &&
31 isConsonant word (lastIndex - 2) &&
32 last word `notElem` "wxy"
33 where lastIndex = length word - 1
34
35 statefulReplace predicate str end replacement
36 | end `isSuffixOf` str = Just replaced
37 | otherwise = Nothing
38 where
39 part = take (length str - length end) str
40 replaced | predicate part = Right (part ++ replacement)
41 | otherwise = Left str
42
43 replaceEnd predicate str end replacement = do
44 result <- statefulReplace predicate str end replacement
45 return (either id id result)
46
47 findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
48
49 measureGT = flip ((>) . measure)
50
51 step1a word = fromMaybe word result
52 where result = findStem (const True) word [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
53
54 beforeStep1b word = fromMaybe (Left word) result
55 where
56 cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
57 cond1 x = do { v <- x; return (Left v) }
58 result =
59 cond1 (replaceEnd (measureGT 0) word "eed" "ee") `mplus`
60 cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
61 cond23 (statefulReplace containsVowel word "ing" "" )
62
63 afterStep1b word = fromMaybe word result
64 where
65 double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) "lsz")
66 mEq1AndCvc = measure word == 1 && cvc word
67 iif cond val = if cond then Just val else Nothing
68 result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
69 `mplus` iif double (init word)
70 `mplus` iif mEq1AndCvc (word ++ "e")
71
72 step1b = either id afterStep1b . beforeStep1b
73
74 step1c word = fromMaybe word result
75 where result = replaceEnd containsVowel word "y" "i"
76
77 step1 = step1c . step1b . step1a
78
79 step2 word = fromMaybe word result
80 where
81 result = findStem (measureGT 0) word
82 [ ("ational", "ate" )
83 , ("tional", "tion")
84 , ("enci", "ence")
85 , ("anci", "ance")
86 , ("izer", "ize" )
87 , ("bli", "ble" )
88 , ("alli", "al" )
89 , ("entli", "ent" )
90 , ("eli", "e" )
91 , ("ousli", "ous" )
92 , ("ization", "ize" )
93 , ("ation", "ate" )
94 , ("ator", "ate" )
95 , ("alism", "al" )
96 , ("iveness", "ive" )
97 , ("fulness", "ful" )
98 , ("ousness", "ous" )
99 , ("aliti", "al" )
100 , ("iviti", "ive" )
101 , ("biliti", "ble" )
102 , ("logi", "log" ) ]
103
104 step3 word = fromMaybe word result
105 where
106 result = findStem (measureGT 0) word
107 [ ("icate", "ic")
108 , ("ative", "" )
109 , ("alize", "al")
110 , ("iciti", "ic")
111 , ("ical" , "ic")
112 , ("ful" , "" )
113 , ("ness" , "" ) ]
114
115 step4 word = fromMaybe word result
116 where
117 gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) "st"
118 findGT1 = findStem (measureGT 1) word . map (flip (,) "")
119 result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
120 (findStem gt1andST word [("ion","")]) `mplus`
121 (findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
122
123 step5a word = fromMaybe word result
124 where
125 test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
126 result = replaceEnd test word "e" ""
127
128 step5b word = fromMaybe word result
129 where
130 cond s = last s == 'l' && measureGT 1 s
131 result = replaceEnd cond word "l" ""
132
133 step5 = step5b . step5a
134
135 allSteps = step5 . step4 . step3 . step2 . step1
136
137 stem s | length s < 3 = s
138 | otherwise = allSteps s
139
140 fixpoint f x = let fx = f x in
141 if fx == x
142 then x
143 else fixpoint f fx
144
145 fixstem = fixpoint stem
146
147 {-
148
149 main :: IO ()
150 main = do
151 content <- readFile "input.txt"
152 writeFile "output.txt" $ unlines $ map stem $ lines content
153
154 -}