]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/Mono/Stem/En.hs
[FIX]
[gargantext.git] / src / Gargantext / Core / Text / Terms / Mono / Stem / En.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Stem.En
3 Description : Porter Algorithm Implementation purely Haskell
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adapted from:
11 - source: https://hackage.haskell.org/package/porter
12 - [Char] -> [Text]
13 - adding Types signatures
14 - fixes unseen cases
15
16 -}
17
18
19 module Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
20 where
21
22 import Control.Monad
23 import Data.Either
24 import Data.List ((!!))
25 import Data.Maybe
26 import Data.Text (Text(), pack, unpack)
27 import Gargantext.Prelude
28 import qualified Data.List as List hiding (map, head)
29
30 vowels :: [Char]
31 vowels = ['a','e','i','o','u']
32
33 isConsonant :: [Char] -> Int -> Bool
34 isConsonant str i
35 | c `elem` vowels = False
36 | c == 'y' = i == 0 || isVowel str (i - 1)
37 | otherwise = True
38 where
39 c = str !! i
40
41 isVowel :: [Char] -> Int -> Bool
42 isVowel = (not .) . isConsonant
43
44 byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
45 byIndex fun str = fun str [0..length str - 1]
46
47 containsVowel :: [Char] -> Bool
48 containsVowel = byIndex (any . isVowel)
49
50 -- | /!\ unsafe fromJust
51 measure :: [Char] -> Int
52 measure = length . filter not . List.init . (True:)
53 . map fromJust . map head
54 . List.group . byIndex (map . isConsonant)
55
56
57 endsWithDouble :: [Char] -> Bool
58 endsWithDouble = startsWithDouble . reverse
59 where
60 startsWithDouble l = case l of
61 (x:y:_) -> x == y && x `List.notElem` vowels
62 _ -> False
63
64 cvc :: [Char] -> Bool
65 cvc word | length word < 3 = False
66 | otherwise = isConsonant word lastIndex &&
67 isVowel word (lastIndex - 1) &&
68 isConsonant word (lastIndex - 2) &&
69 List.last word `List.notElem` ['w','x','y']
70 where lastIndex = length word - 1
71
72 statefulReplace :: Eq a => ([a] -> Bool)
73 -> [a] -> [a] -> [a]
74 -> Maybe (Data.Either.Either [a] [a])
75 statefulReplace predicate str end replacement
76 | end `List.isSuffixOf` str = Just replaced
77 | otherwise = Nothing
78 where
79 part = take (length str - length end) str
80 replaced | predicate part = Right (part <> replacement)
81 | otherwise = Left str
82
83 replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
84 replaceEnd predicate str end replacement = do
85 result <- statefulReplace predicate str end replacement
86 return (either identity identity result)
87
88 findStem
89 :: (Foldable t, Functor t, Eq a) =>
90 ([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
91 findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
92
93 measureGT :: Int -> [Char] -> Bool
94 measureGT = flip ((>) . measure)
95
96 step1a :: [Char] -> [Char]
97 step1a word = fromMaybe word result
98 where
99 result = findStem (const True) word suffixes
100 suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
101
102 beforeStep1b :: [Char] -> Either [Char] [Char]
103 beforeStep1b word = fromMaybe (Left word) result
104 where
105 cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
106 cond1 x = do { v <- x; return (Left v) }
107 result =
108 cond1 (replaceEnd (measureGT 0) word "eed" "ee") `mplus`
109 cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
110 cond23 (statefulReplace containsVowel word "ing" "" )
111
112 afterStep1b :: [Char] -> [Char]
113 afterStep1b word = fromMaybe word result
114 where
115 double = endsWithDouble word && not (any ((`List.isSuffixOf` word) . return) ['l','s','z'])
116 mEq1AndCvc = measure word == 1 && cvc word
117 iif cond val = if cond then Just val else Nothing
118 result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
119 `mplus` iif double (List.init word)
120 `mplus` iif mEq1AndCvc (word <> "e")
121
122 step1b :: [Char] -> [Char]
123 step1b = either identity afterStep1b . beforeStep1b
124
125 step1c :: [Char] -> [Char]
126 step1c word = fromMaybe word result
127 where result = replaceEnd containsVowel word "y" "i"
128
129 step1 :: [Char] -> [Char]
130 step1 = step1c . step1b . step1a
131
132 step2 :: [Char] -> [Char]
133 step2 word = fromMaybe word result
134 where
135 result = findStem (measureGT 0) word
136 [ ("ational", "ate" )
137 , ("tional", "tion")
138 , ("enci", "ence")
139 , ("anci", "ance")
140 , ("izer", "ize" )
141 , ("bli", "ble" )
142 , ("alli", "al" )
143 , ("entli", "ent" )
144 , ("eli", "e" )
145 , ("ousli", "ous" )
146 , ("ization", "ize" )
147 , ("ation", "ate" )
148 , ("ator", "ate" )
149 , ("alism", "al" )
150 , ("iveness", "ive" )
151 , ("fulness", "ful" )
152 , ("ousness", "ous" )
153 , ("aliti", "al" )
154 , ("iviti", "ive" )
155 , ("biliti", "ble" )
156 , ("logi", "log" ) ]
157
158 step3 :: [Char] -> [Char]
159 step3 word = fromMaybe word result
160 where
161 result = findStem (measureGT 0) word
162 [ ("icate", "ic")
163 , ("ative", "" )
164 , ("alize", "al")
165 , ("iciti", "ic")
166 , ("ical" , "ic")
167 , ("ful" , "" )
168 , ("ness" , "" ) ]
169
170 step4 :: [Char] -> [Char]
171 step4 word = fromMaybe word result
172 where
173 gt1andST str = (measureGT 1) str && any ((`List.isSuffixOf` str) . return) ['s','t']
174 findGT1 = findStem (measureGT 1) word . map (flip (,) "")
175 result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
176 (findStem gt1andST word [("ion","")]) `mplus`
177 (findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
178
179 step5a :: [Char] -> [Char]
180 step5a word = fromMaybe word result
181 where
182 test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
183 result = replaceEnd test word "e" ""
184
185 step5b :: [Char] -> [Char]
186 step5b word = fromMaybe word result
187 where
188 cond s = List.last s == 'l' && measureGT 1 s
189 result = replaceEnd cond word "l" ""
190
191 step5 :: [Char] -> [Char]
192 step5 = step5b . step5a
193
194 allSteps :: [Char] -> [Char]
195 allSteps = step5 . step4 . step3 . step2 . step1
196
197 stemIt :: Text -> Text
198 stemIt s = pack (stem' $ unpack s)
199
200 stem' :: [Char] -> [Char]
201 stem' s | length s < 3 = s
202 | otherwise = allSteps s
203
204 --fixpoint :: Eq t => (t -> t) -> t -> t
205 --fixpoint f x = let fx = f x in
206 -- if fx == x
207 -- then x
208 -- else fixpoint f fx
209 --
210 --fixstem :: [Char] -> [Char]
211 --fixstem = fixpoint stem'
212
213
214 {-
215
216 main :: IO ()
217 main = do
218 content <- readFile "input.txt"
219 writeFile "output.txt" $ unlines $ map stem $ lines content
220
221 -}