]> Git — Sourcephile - gargantext.git/blob - src/Data/Gargantext/Ngrams/Utils.hs
[NGRAMS] improving ngrams extraction with prep (of/de) respectively in eng/fr.
[gargantext.git] / src / Data / Gargantext / Ngrams / Utils.hs
1 -- module Data.Gargantext.Ngrams.Utils where
2
3
4 -- calculate levenshtein distance between two strings
5 levenshtein::[Char] -> [Char] -> Int
6 levenshtein "" "" = 0
7 levenshtein "" s2 = length s2
8 levenshtein s1 "" = length s1
9 levenshtein s1 s2
10 | last s1 == last s2 = levenshtein (init s1) (init s2)
11 | otherwise = minimum [
12 1 + levenshtein (init s1) s2,
13 1 + levenshtein s1 (init s2),
14 1 + levenshtein (init s1) (init s2)
15 ]
16
17
18 -- calculate levenshtein distance between two strings
19 levenshtein::[Char] -> [Char] -> Int
20 -- this part is mostly a speed optimiziation
21 levenshtein' s1 s2
22 | length s1 > length s2 = levenshtein s2 s1
23 | length s1 < length s2 =
24 let d = length s2 - length s1
25 in d + levenshtein s1 (take (length s2 - d) s2)
26 -- the meat of the algorithm
27 levenshtein' "" "" = 0
28 levenshtein' s1 s2
29 | last s1 == last s2 = levenshtein (init s1) (init s2)
30 | otherwise = minimum [1 + levenshtein (init s1) s2,
31 1 + levenshtein s1 (init s2),
32 1 + levenshtein (init s1) (init s2)]