]> Git — Sourcephile - gargantext.git/blob - src/Data/Gargantext/Ngrams/Metrics.hs
[FIX] NLP Ngrams parser works for French _and_ English.
[gargantext.git] / src / Data / Gargantext / Ngrams / Metrics.hs
1 module Data.Gargantext.Ngrams.Metrics (levenshtein
2 , levenshteinNorm
3 , damerauLevenshtein
4 , damerauLevenshteinNorm
5 , overlap
6 , jaccard
7 , hamming
8 ) where
9
10 import Data.Text (Text)
11 import GHC.Real (Ratio)
12 import qualified Data.Text.Metrics as DTM
13
14 -- | This module provides metrics to compare 'Text' values.
15 -- It starts as an API re-exporting the main functions of the great
16 -- text-metrics library by Mark Karpov.
17
18 -- | Levenshtein distance between two 'Text' values.
19 -- In information theory, linguistics and computer science,
20 -- the Levenshtein distance is a string metric for measuring
21 -- the difference between two sequences. Delegates to 'DTM.levenshtein'.
22 -- See: <https://en.wikipedia.org/wiki/Levenshtein_distance>
23
24 levenshtein :: Text -> Text -> Int
25 levenshtein = DTM.levenshtein
26
27 -- | Return normalized Levenshtein distance between two 'Text' values.
28 -- Result is a non-negative rational number (represented as @'Ratio'
29 -- 'Int'@, per the signature), where 0 signifies no similarity between the
30 -- strings, while 1 means exact match. Delegates to 'DTM.levenshteinNorm'.
31
32 levenshteinNorm :: Text -> Text -> Ratio Int
33 levenshteinNorm = DTM.levenshteinNorm
34
35 -- | Return Damerau-Levenshtein distance between two 'Text' values. The
36 -- function works like 'levenshtein', but the set of allowed operations
37 -- also includes transposition of two /adjacent/ characters.
38 -- Delegates to 'DTM.damerauLevenshtein'. See also:
39 -- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>
40
41 damerauLevenshtein :: Text -> Text -> Int
42 damerauLevenshtein = DTM.damerauLevenshtein
43
44 -- | Normalized Damerau-Levenshtein distance (delegates to 'DTM.damerauLevenshteinNorm'); presumably same 0..1 scale as 'levenshteinNorm' - TODO confirm.
45
46 damerauLevenshteinNorm :: Text -> Text -> Ratio Int
47 damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
48
49 -- Treating inputs like sets
50
51 -- | Return overlap coefficient for two 'Text' values, treated like sets.
52 -- The returned value is in the range from 0 (no similarity) to 1 (exact
53 -- match). Return 1 if both 'Text' values are empty.
54 -- Delegates to 'DTM.overlap'.
55 -- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
56 overlap :: Text -> Text -> Ratio Int
57 overlap = DTM.overlap
58
59
60 -- | Jaccard similarity coefficient (delegates to 'DTM.jaccard'): 0 = no similarity, 1 = exact
61 -- match. NOTE(review): upstream documents a similarity, not a distance; confirm for the pinned version.
62 jaccard :: Text -> Text -> Ratio Int
63 jaccard = DTM.jaccard
64
65 -- | Hamming distance between two 'Text' values.
66 -- For two strings of equal length, this is the number of positions at
67 -- which the corresponding symbols differ, i.e. the minimum number of
68 -- substitutions needed to turn one string into the other. The result is a
69 -- 'Maybe': text-metrics documents 'Nothing' for inputs of unequal length.
70 -- See: <https://en.wikipedia.org/wiki/Hamming_distance>
71
72 hamming :: Text -> Text -> Maybe Int
73 hamming = DTM.hamming
74
75