]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/Mono/Stem/En.hs
[ihaskell] some development towards codebook integration
[gargantext.git] / src / Gargantext / Core / Text / Terms / Mono / Stem / En.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Stem.En
3 Description : Porter Algorithm Implementation purely Haskell
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adapted from:
11 - source: https://hackage.haskell.org/package/porter
12 - [Char] -> [Text]
13 - adding Types signatures
14 - fixes unseen cases
15
16 -}
17
18
19 module Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
20 where
21
22 import Control.Monad
23 import Data.Either
24 import Data.Maybe
25 import Data.Text (Text(), pack, unpack)
26
27 import Data.List hiding (map, head)
28
29 import Gargantext.Prelude
30
31 vowels :: [Char]
32 vowels = ['a','e','i','o','u']
33
34 isConsonant :: [Char] -> Int -> Bool
35 isConsonant str i
36 | c `elem` vowels = False
37 | c == 'y' = i == 0 || isVowel str (i - 1)
38 | otherwise = True
39 where
40 c = str !! i
41
42 isVowel :: [Char] -> Int -> Bool
43 isVowel = (not .) . isConsonant
44
45 byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
46 byIndex fun str = fun str [0..length str - 1]
47
48 containsVowel :: [Char] -> Bool
49 containsVowel = byIndex (any . isVowel)
50
51 -- | /!\ unsafe fromJust
52 measure :: [Char] -> Int
53 measure = length . filter not . init . (True:)
54 . map fromJust . map head
55 . group . byIndex (map . isConsonant)
56
57
58 endsWithDouble :: [Char] -> Bool
59 endsWithDouble = startsWithDouble . reverse
60 where
61 startsWithDouble l = case l of
62 (x:y:_) -> x == y && x `notElem` vowels
63 _ -> False
64
65 cvc :: [Char] -> Bool
66 cvc word | length word < 3 = False
67 | otherwise = isConsonant word lastIndex &&
68 isVowel word (lastIndex - 1) &&
69 isConsonant word (lastIndex - 2) &&
70 last word `notElem` ['w','x','y']
71 where lastIndex = length word - 1
72
73 statefulReplace :: Eq a => ([a] -> Bool)
74 -> [a] -> [a] -> [a]
75 -> Maybe (Data.Either.Either [a] [a])
76 statefulReplace predicate str end replacement
77 | end `isSuffixOf` str = Just replaced
78 | otherwise = Nothing
79 where
80 part = take (length str - length end) str
81 replaced | predicate part = Right (part ++ replacement)
82 | otherwise = Left str
83
84 replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
85 replaceEnd predicate str end replacement = do
86 result <- statefulReplace predicate str end replacement
87 return (either identity identity result)
88
89 findStem
90 :: (Foldable t, Functor t, Eq a) =>
91 ([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
92 findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
93
94 measureGT :: Int -> [Char] -> Bool
95 measureGT = flip ((>) . measure)
96
97 step1a :: [Char] -> [Char]
98 step1a word = fromMaybe word result
99 where
100 result = findStem (const True) word suffixes
101 suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
102
103 beforeStep1b :: [Char] -> Either [Char] [Char]
104 beforeStep1b word = fromMaybe (Left word) result
105 where
106 cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
107 cond1 x = do { v <- x; return (Left v) }
108 result =
109 cond1 (replaceEnd (measureGT 0) word "eed" "ee") `mplus`
110 cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
111 cond23 (statefulReplace containsVowel word "ing" "" )
112
113 afterStep1b :: [Char] -> [Char]
114 afterStep1b word = fromMaybe word result
115 where
116 double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) ['l','s','z'])
117 mEq1AndCvc = measure word == 1 && cvc word
118 iif cond val = if cond then Just val else Nothing
119 result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
120 `mplus` iif double (init word)
121 `mplus` iif mEq1AndCvc (word ++ "e")
122
123 step1b :: [Char] -> [Char]
124 step1b = either identity afterStep1b . beforeStep1b
125
126 step1c :: [Char] -> [Char]
127 step1c word = fromMaybe word result
128 where result = replaceEnd containsVowel word "y" "i"
129
130 step1 :: [Char] -> [Char]
131 step1 = step1c . step1b . step1a
132
133 step2 :: [Char] -> [Char]
134 step2 word = fromMaybe word result
135 where
136 result = findStem (measureGT 0) word
137 [ ("ational", "ate" )
138 , ("tional", "tion")
139 , ("enci", "ence")
140 , ("anci", "ance")
141 , ("izer", "ize" )
142 , ("bli", "ble" )
143 , ("alli", "al" )
144 , ("entli", "ent" )
145 , ("eli", "e" )
146 , ("ousli", "ous" )
147 , ("ization", "ize" )
148 , ("ation", "ate" )
149 , ("ator", "ate" )
150 , ("alism", "al" )
151 , ("iveness", "ive" )
152 , ("fulness", "ful" )
153 , ("ousness", "ous" )
154 , ("aliti", "al" )
155 , ("iviti", "ive" )
156 , ("biliti", "ble" )
157 , ("logi", "log" ) ]
158
159 step3 :: [Char] -> [Char]
160 step3 word = fromMaybe word result
161 where
162 result = findStem (measureGT 0) word
163 [ ("icate", "ic")
164 , ("ative", "" )
165 , ("alize", "al")
166 , ("iciti", "ic")
167 , ("ical" , "ic")
168 , ("ful" , "" )
169 , ("ness" , "" ) ]
170
171 step4 :: [Char] -> [Char]
172 step4 word = fromMaybe word result
173 where
174 gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) ['s','t']
175 findGT1 = findStem (measureGT 1) word . map (flip (,) "")
176 result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
177 (findStem gt1andST word [("ion","")]) `mplus`
178 (findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
179
180 step5a :: [Char] -> [Char]
181 step5a word = fromMaybe word result
182 where
183 test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
184 result = replaceEnd test word "e" ""
185
186 step5b :: [Char] -> [Char]
187 step5b word = fromMaybe word result
188 where
189 cond s = last s == 'l' && measureGT 1 s
190 result = replaceEnd cond word "l" ""
191
192 step5 :: [Char] -> [Char]
193 step5 = step5b . step5a
194
195 allSteps :: [Char] -> [Char]
196 allSteps = step5 . step4 . step3 . step2 . step1
197
198 stemIt :: Text -> Text
199 stemIt s = pack (stem' $ unpack s)
200
201 stem' :: [Char] -> [Char]
202 stem' s | length s < 3 = s
203 | otherwise = allSteps s
204
205 --fixpoint :: Eq t => (t -> t) -> t -> t
206 --fixpoint f x = let fx = f x in
207 -- if fx == x
208 -- then x
209 -- else fixpoint f fx
210 --
211 --fixstem :: [Char] -> [Char]
212 --fixstem = fixpoint stem'
213
214
215 {-
216
217 main :: IO ()
218 main = do
219 content <- readFile "input.txt"
220 writeFile "output.txt" $ unlines $ map stem $ lines content
221
222 -}