]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams/Metrics.hs
[TEXT-MINING] adding first functions/datatypes.
[gargantext.git] / src / Gargantext / Ngrams / Metrics.hs
1 {-|
2 Module : Gargantext.Ngrams.Metrics
3 Description : Short description
4 Copyright : (c) Some Guy, 2013
5 Someone Else, 2014
6 License : GPL-3
7 Maintainer : sample@email.com
8 Stability : experimental
9 Portability : POSIX
10
Mainly re-exports functions from "Data.Text.Metrics".
12 -}
13
14 module Gargantext.Ngrams.Metrics (levenshtein
15 , levenshteinNorm
16 , damerauLevenshtein
17 , damerauLevenshteinNorm
18 , overlap
19 , jaccard
20 , hamming
21 ) where
22
23 import Gargantext.Prelude
24
25 import Data.Text (Text)
26 import GHC.Real (Ratio)
27 import qualified Data.Text.Metrics as DTM
28
{- * Example section title
-}
31
-- This module provides metrics to compare 'Text' values.
-- It starts as an API re-exporting the main functions of the great
-- text-metrics library by Mark Karpov.
35
-- | Levenshtein distance between two 'Text' values.
--
-- The Levenshtein distance is a string metric, used in information
-- theory, linguistics and computer science, that measures the
-- difference between two sequences.
--
-- See: <https://en.wikipedia.org/wiki/Levenshtein_distance>
--
-- Delegates directly to 'DTM.levenshtein'.
levenshtein :: Text -> Text -> Int
levenshtein source target = DTM.levenshtein source target
44
-- | Return the normalized Levenshtein distance between two 'Text' values.
--
-- The result is a non-negative rational number (represented as
-- @'Ratio' 'Int'@, matching the signature below), where 0 signifies no
-- similarity between the strings, while 1 means exact match.
--
-- Delegates directly to 'DTM.levenshteinNorm'.
levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm = DTM.levenshteinNorm
52
-- | Damerau-Levenshtein distance between two 'Text' values.
--
-- Behaves like 'levenshtein', except that the set of allowed edit
-- operations additionally contains the transposition of two /adjacent/
-- characters.
--
-- See also:
-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>
--
-- Delegates directly to 'DTM.damerauLevenshtein'.
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein source target = DTM.damerauLevenshtein source target
61
-- | Return the normalized Damerau-Levenshtein distance between two
-- 'Text' values.
--
-- Works like 'levenshteinNorm', but is based on 'damerauLevenshtein':
-- the result is a non-negative rational number where 0 signifies no
-- similarity between the strings, while 1 means exact match.
--
-- Delegates directly to 'DTM.damerauLevenshteinNorm'.
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
66
67 -- Treating inputs like sets
68
-- | Overlap coefficient of two 'Text' values.
--
-- The result lies between 0 (no similarity) and 1 (exact match). When
-- both 'Text' values are empty the result is 1.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
--
-- Delegates directly to 'DTM.overlap'.
overlap :: Text -> Text -> Ratio Int
overlap left right = DTM.overlap left right
76
77
-- | Return the Jaccard similarity coefficient for two 'Text' values,
-- treating the inputs like sets.
--
-- Note that this is a /similarity/, not a distance: the returned value
-- is in the range from 0 (no similarity) to 1 (exact match). Returns 1
-- if both 'Text' values are empty.
--
-- See also: <https://en.wikipedia.org/wiki/Jaccard_index>
--
-- Delegates directly to 'DTM.jaccard'.
jaccard :: Text -> Text -> Ratio Int
jaccard = DTM.jaccard
82
-- | Hamming distance between two 'Text' values.
--
-- In information theory, the Hamming distance between two strings of
-- equal length counts the positions at which the corresponding symbols
-- differ, i.e. the minimum number of substitutions needed to turn one
-- string into the other.
--
-- The distance is only defined for inputs of equal length, hence the
-- 'Maybe' result: the underlying 'DTM.hamming' yields 'Nothing' when
-- the two values have different lengths.
--
-- See: <https://en.wikipedia.org/wiki/Hamming_distance>
hamming :: Text -> Text -> Maybe Int
hamming left right = DTM.hamming left right
92
93