]> Git — Sourcephile - gargantext.git/blob - src/Data/Gargantext/Ngrams/Metrics.hs
Add comments
[gargantext.git] / src / Data / Gargantext / Ngrams / Metrics.hs
1 {-|
2 Module : Data.Gargantext.Ngrams.Metrics
3 Description : Short description
4 Copyright : (c) Some Guy, 2013
5 Someone Else, 2014
6 License : GPL-3
7 Maintainer : sample@email.com
8 Stability : experimental
9 Portability : POSIX
10
11 Here is a longer description of this module, containing some
12 commentary with @some markup@.
13 -}
14
15 module Data.Gargantext.Ngrams.Metrics (levenshtein
16 , levenshteinNorm
17 , damerauLevenshtein
18 , damerauLevenshteinNorm
19 , overlap
20 , jaccard
21 , hamming
22 ) where
23
24 import Data.Text (Text)
25 import GHC.Real (Ratio)
26 import qualified Data.Text.Metrics as DTM
27
{- * Example title
-}
30
-- This module provides metrics to compare 'Text' values.
-- It starts as an API re-exporting the main functions of the great
-- text-metrics library by Mark Karpov.
34
-- | Levenshtein distance between two 'Text' values.
--
-- In information theory, linguistics and computer science, the Levenshtein
-- distance is a string metric for measuring the difference between two
-- sequences.
--
-- Thin re-export of 'Data.Text.Metrics.levenshtein'.
--
-- See: <https://en.wikipedia.org/wiki/Levenshtein_distance>
levenshtein
  :: Text -- ^ first text to compare
  -> Text -- ^ second text to compare
  -> Int  -- ^ distance as computed by text-metrics
levenshtein = DTM.levenshtein
43
-- | Normalized Levenshtein distance between two 'Text' values.
--
-- The result is a non-negative rational number (represented as
-- @'Ratio' 'Int'@, matching the signature below), where 0 signifies no
-- similarity between the texts, while 1 means an exact match.
--
-- Thin re-export of 'Data.Text.Metrics.levenshteinNorm'.
levenshteinNorm
  :: Text      -- ^ first text to compare
  -> Text      -- ^ second text to compare
  -> Ratio Int -- ^ score from 0 (no similarity) to 1 (exact match)
levenshteinNorm = DTM.levenshteinNorm
51
-- | Damerau-Levenshtein distance between two 'Text' values.
--
-- Works like 'levenshtein', except that the set of allowed operations
-- also includes the transposition of two /adjacent/ characters.
--
-- Thin re-export of 'Data.Text.Metrics.damerauLevenshtein'.
--
-- See also:
-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>
damerauLevenshtein
  :: Text -- ^ first text to compare
  -> Text -- ^ second text to compare
  -> Int  -- ^ distance as computed by text-metrics
damerauLevenshtein = DTM.damerauLevenshtein
60
-- | Normalized Damerau-Levenshtein distance between two 'Text' values.
--
-- This is the normalized counterpart of 'damerauLevenshtein', in the same
-- way that 'levenshteinNorm' is the normalized counterpart of
-- 'levenshtein'. According to the text-metrics documentation, 0 signifies
-- no similarity between the texts, while 1 means an exact match.
--
-- Thin re-export of 'Data.Text.Metrics.damerauLevenshteinNorm'.
damerauLevenshteinNorm
  :: Text      -- ^ first text to compare
  -> Text      -- ^ second text to compare
  -> Ratio Int -- ^ normalized score as computed by text-metrics
damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
65
66 -- Treating inputs like sets
67
-- | Overlap coefficient of two 'Text' values.
--
-- The returned value lies in the range from 0 (no similarity) to 1 (exact
-- match). When both 'Text' values are empty the result is 1.
--
-- Thin re-export of 'Data.Text.Metrics.overlap'.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
overlap
  :: Text      -- ^ first text to compare
  -> Text      -- ^ second text to compare
  -> Ratio Int -- ^ coefficient from 0 (no similarity) to 1 (exact match)
overlap = DTM.overlap
75
76
-- | Jaccard similarity coefficient of two 'Text' values.
--
-- Note that this is a /similarity/ measure, not a distance: according to
-- the text-metrics documentation the returned value is in the range from
-- 0 (no similarity) to 1 (exact match), and 1 is returned when both 'Text'
-- values are empty.
--
-- Thin re-export of 'Data.Text.Metrics.jaccard'.
--
-- See also: <https://en.wikipedia.org/wiki/Jaccard_index>
jaccard
  :: Text      -- ^ first text to compare
  -> Text      -- ^ second text to compare
  -> Ratio Int -- ^ coefficient from 0 (no similarity) to 1 (exact match)
jaccard = DTM.jaccard
81
-- | Hamming distance between two 'Text' values.
--
-- In information theory, the Hamming distance between two strings of
-- equal length is the number of positions at which the corresponding
-- symbols are different. In other words, it measures the minimum number
-- of substitutions required to change one string into the other.
--
-- The result is wrapped in 'Maybe'; per the text-metrics documentation,
-- 'Nothing' is returned when the two inputs have different lengths.
--
-- Thin re-export of 'Data.Text.Metrics.hamming'.
--
-- See: <https://en.wikipedia.org/wiki/Hamming_distance>
hamming
  :: Text      -- ^ first text to compare
  -> Text      -- ^ second text to compare
  -> Maybe Int -- ^ distance as computed by text-metrics, if defined
hamming = DTM.hamming
91
92