]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics/CharByChar.hs
Merge branch 'dev-ngrams-repo' of ssh://delanoe.org/haskell-gargantext into dev-ngram...
[gargantext.git] / src / Gargantext / Text / Metrics / CharByChar.hs
1 {-|
2 Module : Gargantext.Text.Metrics.CharByChar
3 Description : Character-level distance and similarity metrics on Text.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Mainly re-exports functions from @Data.Text.Metrics@.
11 -}
12
13 {-# LANGUAGE NoImplicitPrelude #-}
14
15
16 module Gargantext.Text.Metrics.CharByChar (levenshtein
17 , levenshteinNorm
18 , damerauLevenshtein
19 , damerauLevenshteinNorm
20 , overlap
21 , jaccard
22 , hamming
23 ) where
24
25
26 import Data.Text (Text)
27 import GHC.Real (Ratio)
28 import qualified Data.Text.Metrics as DTM
29
30 import Gargantext.Prelude
31
32 --noApax :: Ord a => Map a Occ -> Map a Occ
33 --noApax m = M.filter (>1) m
34
35
36 {- * Example de titre
37 -}
38
39 -- This module provides metrics to compare 'Text' values.
40 -- It starts out as an API re-exporting the main functions of the great
41 -- library text-metrics by Mark Karpov.
42
-- | Levenshtein distance between two 'Text' values.
--
-- In information theory, linguistics and computer science, the
-- Levenshtein distance is a string metric that measures the difference
-- between two sequences.
--
-- This is a plain re-export of 'DTM.levenshtein'.
--
-- See: <https://en.wikipedia.org/wiki/Levenshtein_distance>
levenshtein :: Text -> Text -> Int
levenshtein source target = DTM.levenshtein source target
51
-- | Normalized Levenshtein distance between two 'Text' values.
--
-- The result is a non-negative rational number, represented as
-- @'Ratio' 'Int'@. Despite the word \"distance\" in the name, the scale
-- is a similarity scale: 0 signifies no similarity between the strings,
-- while 1 means an exact match.
--
-- This is a plain re-export of 'DTM.levenshteinNorm'.
levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm source target = DTM.levenshteinNorm source target
59
-- | Damerau-Levenshtein distance between two 'Text' values.
--
-- Works like 'levenshtein', except that the set of allowed edit
-- operations additionally contains the transposition of two /adjacent/
-- characters.
--
-- This is a plain re-export of 'DTM.damerauLevenshtein'.
--
-- See also:
-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein source target = DTM.damerauLevenshtein source target
68
-- | Normalized Damerau-Levenshtein distance between two 'Text' values.
--
-- This is a plain re-export of 'DTM.damerauLevenshteinNorm'; the result
-- is a @'Ratio' 'Int'@. Per the text-metrics documentation it is scaled
-- like 'levenshteinNorm': 0 means no similarity, 1 means exact match.
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm source target =
  DTM.damerauLevenshteinNorm source target
73
74 -- Treating inputs like sets
75
-- | Overlap coefficient of two 'Text' values, treating each input as a
-- set of characters.
--
-- The returned value ranges from 0 (no similarity) to 1 (exact match).
-- When both 'Text' values are empty the result is 1.
--
-- This is a plain re-export of 'DTM.overlap'.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
overlap :: Text -> Text -> Ratio Int
overlap left right = DTM.overlap left right
83
84
-- | Jaccard similarity coefficient of two 'Text' values, treating each
-- input as a set of characters.
--
-- Note that this is a /similarity/, not a distance: per the text-metrics
-- documentation of 'DTM.jaccard', the returned value ranges from 0 (no
-- similarity) to 1 (exact match). Callers that need the Jaccard
-- distance (dissimilarity) must compute @1 - jaccard a b@ themselves.
--
-- This is a plain re-export of 'DTM.jaccard'.
--
-- See also: <https://en.wikipedia.org/wiki/Jaccard_index>.
jaccard :: Text -> Text -> Ratio Int
jaccard left right = DTM.jaccard left right
89
-- | Hamming distance between two 'Text' values.
--
-- In information theory, the Hamming distance between two strings of
-- equal length is the number of positions at which the corresponding
-- symbols differ; in other words, the minimum number of substitutions
-- needed to turn one string into the other.
--
-- The result is wrapped in 'Maybe': per the text-metrics documentation
-- of 'DTM.hamming', 'Nothing' is returned when the two inputs do not
-- have the same length.
--
-- This is a plain re-export of 'DTM.hamming'.
--
-- See: <https://en.wikipedia.org/wiki/Hamming_distance>
hamming :: Text -> Text -> Maybe Int
hamming left right = DTM.hamming left right
99
100