]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams/Stem/En.hs
[FEAT/STEM] implemenging Porter lib into Gargantext for English language.
[gargantext.git] / src / Gargantext / Ngrams / Stem / En.hs
1 {-|
2 Module : Gargantext.
3 Description : Porter Algorithm Implementation purely Haskell
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adapted from:
11 - source: https://hackage.haskell.org/package/porter
12 - [Char] -> [Text]
13 - adding Types signatures
14 - fixes unseen cases
15
16 -}
17
18 {-# LANGUAGE NoImplicitPrelude #-}
19
20 module Gargantext.Ngrams.Stem.En
21 where
22
23 import Control.Monad
24 import Data.Either
25 import Data.Maybe
26 import Data.Text (Text(), pack, unpack)
27
28 import Data.List hiding (map, head)
29
30 import Gargantext.Prelude
31
32 vowels :: [Char]
33 vowels = ['a','e','i','o','u']
34
35 isConsonant :: [Char] -> Int -> Bool
36 isConsonant str i
37 | c `elem` vowels = False
38 | c == 'y' = i == 0 || isVowel str (i - 1)
39 | otherwise = True
40 where
41 c = str !! i
42
43 isVowel :: [Char] -> Int -> Bool
44 isVowel = (not .) . isConsonant
45
46 byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
47 byIndex fun str = fun str [0..length str - 1]
48
49 containsVowel :: [Char] -> Bool
50 containsVowel = byIndex (any . isVowel)
51
52 -- | /!\ unsafe fromJust
53 measure :: [Char] -> Int
54 measure = length . filter not . init . (True:)
55 . map fromJust . map head
56 . group . byIndex (map . isConsonant)
57
58
59 endsWithDouble :: [Char] -> Bool
60 endsWithDouble = startsWithDouble . reverse
61 where
62 startsWithDouble l = case l of
63 (x:y:_) -> x == y && x `notElem` vowels
64 _ -> False
65
66 cvc :: [Char] -> Bool
67 cvc word | length word < 3 = False
68 | otherwise = isConsonant word lastIndex &&
69 isVowel word (lastIndex - 1) &&
70 isConsonant word (lastIndex - 2) &&
71 last word `notElem` ['w','x','y']
72 where lastIndex = length word - 1
73
74 statefulReplace :: Eq a => ([a] -> Bool)
75 -> [a] -> [a] -> [a]
76 -> Maybe (Data.Either.Either [a] [a])
77 statefulReplace predicate str end replacement
78 | end `isSuffixOf` str = Just replaced
79 | otherwise = Nothing
80 where
81 part = take (length str - length end) str
82 replaced | predicate part = Right (part ++ replacement)
83 | otherwise = Left str
84
85 replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
86 replaceEnd predicate str end replacement = do
87 result <- statefulReplace predicate str end replacement
88 return (either identity identity result)
89
90 findStem
91 :: (Foldable t, Functor t, Eq a) =>
92 ([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
93 findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
94
95 measureGT :: Int -> [Char] -> Bool
96 measureGT = flip ((>) . measure)
97
98 step1a :: [Char] -> [Char]
99 step1a word = fromMaybe word result
100 where
101 result = findStem (const True) word suffixes
102 suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
103
104 beforeStep1b :: [Char] -> Either [Char] [Char]
105 beforeStep1b word = fromMaybe (Left word) result
106 where
107 cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
108 cond1 x = do { v <- x; return (Left v) }
109 result =
110 cond1 (replaceEnd (measureGT 0) word "eed" "ee") `mplus`
111 cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
112 cond23 (statefulReplace containsVowel word "ing" "" )
113
114 afterStep1b :: [Char] -> [Char]
115 afterStep1b word = fromMaybe word result
116 where
117 double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) ['l','s','z'])
118 mEq1AndCvc = measure word == 1 && cvc word
119 iif cond val = if cond then Just val else Nothing
120 result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
121 `mplus` iif double (init word)
122 `mplus` iif mEq1AndCvc (word ++ "e")
123
124 step1b :: [Char] -> [Char]
125 step1b = either identity afterStep1b . beforeStep1b
126
127 step1c :: [Char] -> [Char]
128 step1c word = fromMaybe word result
129 where result = replaceEnd containsVowel word "y" "i"
130
131 step1 :: [Char] -> [Char]
132 step1 = step1c . step1b . step1a
133
134 step2 :: [Char] -> [Char]
135 step2 word = fromMaybe word result
136 where
137 result = findStem (measureGT 0) word
138 [ ("ational", "ate" )
139 , ("tional", "tion")
140 , ("enci", "ence")
141 , ("anci", "ance")
142 , ("izer", "ize" )
143 , ("bli", "ble" )
144 , ("alli", "al" )
145 , ("entli", "ent" )
146 , ("eli", "e" )
147 , ("ousli", "ous" )
148 , ("ization", "ize" )
149 , ("ation", "ate" )
150 , ("ator", "ate" )
151 , ("alism", "al" )
152 , ("iveness", "ive" )
153 , ("fulness", "ful" )
154 , ("ousness", "ous" )
155 , ("aliti", "al" )
156 , ("iviti", "ive" )
157 , ("biliti", "ble" )
158 , ("logi", "log" ) ]
159
160 step3 :: [Char] -> [Char]
161 step3 word = fromMaybe word result
162 where
163 result = findStem (measureGT 0) word
164 [ ("icate", "ic")
165 , ("ative", "" )
166 , ("alize", "al")
167 , ("iciti", "ic")
168 , ("ical" , "ic")
169 , ("ful" , "" )
170 , ("ness" , "" ) ]
171
172 step4 :: [Char] -> [Char]
173 step4 word = fromMaybe word result
174 where
175 gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) ['s','t']
176 findGT1 = findStem (measureGT 1) word . map (flip (,) "")
177 result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
178 (findStem gt1andST word [("ion","")]) `mplus`
179 (findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
180
181 step5a :: [Char] -> [Char]
182 step5a word = fromMaybe word result
183 where
184 test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
185 result = replaceEnd test word "e" ""
186
187 step5b :: [Char] -> [Char]
188 step5b word = fromMaybe word result
189 where
190 cond s = last s == 'l' && measureGT 1 s
191 result = replaceEnd cond word "l" ""
192
193 step5 :: [Char] -> [Char]
194 step5 = step5b . step5a
195
196 allSteps :: [Char] -> [Char]
197 allSteps = step5 . step4 . step3 . step2 . step1
198
199 stem :: Text -> Text
200 stem s = pack (stem' $ unpack s)
201
202 stem' :: [Char] -> [Char]
203 stem' s | length s < 3 = s
204 | otherwise = allSteps s
205
206 fixpoint :: Eq t => (t -> t) -> t -> t
207 fixpoint f x = let fx = f x in
208 if fx == x
209 then x
210 else fixpoint f fx
211
212 fixstem :: [Char] -> [Char]
213 fixstem = fixpoint stem'
214
215
216 {-
217
218 main :: IO ()
219 main = do
220 content <- readFile "input.txt"
221 writeFile "output.txt" $ unlines $ map stem $ lines content
222
223 -}