]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics/CharByChar.hs
Merge branch 'dev' into dev-doc-table-optimization
[gargantext.git] / src / Gargantext / Text / Metrics / CharByChar.hs
{-|
Module      : Gargantext.Text.Metrics.CharByChar
Description : Character-level distance and similarity metrics on Text.
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Mainly re-exports functions from @Data.Text.Metrics@.
-}
12
13
14
15 module Gargantext.Text.Metrics.CharByChar (levenshtein
16 , levenshteinNorm
17 , damerauLevenshtein
18 , damerauLevenshteinNorm
19 , overlap
20 , jaccard
21 , hamming
22 ) where
23
24
25 import Data.Text (Text)
26 import GHC.Real (Ratio)
27 import qualified Data.Text.Metrics as DTM
28
29 import Gargantext.Prelude
30
31 --noApax :: Ord a => Map a Occ -> Map a Occ
32 --noApax m = M.filter (>1) m
33
34
35 {- * Example de titre
36 -}
37
-- | This module provides metrics to compare 'Text' values.
-- It starts out as an API re-exporting the main functions of the great
-- library text-metrics by Mark Karpov.
41
-- | Levenshtein (edit) distance between two 'Text' values.
--
-- In information theory, linguistics and computer science, the
-- Levenshtein distance is a string metric for measuring the difference
-- between two sequences.
--
-- Thin wrapper that delegates to @Data.Text.Metrics.levenshtein@.
--
-- See: <https://en.wikipedia.org/wiki/Levenshtein_distance>
levenshtein :: Text -> Text -> Int
levenshtein source target = DTM.levenshtein source target
50
-- | Normalized Levenshtein distance between two 'Text' values.
--
-- The result is a rational number (represented as @'Ratio' 'Int'@),
-- where 0 signifies no similarity between the strings, while 1 means
-- exact match.
--
-- Thin wrapper that delegates to @Data.Text.Metrics.levenshteinNorm@.
levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm source target = DTM.levenshteinNorm source target
58
-- | Damerau-Levenshtein distance between two 'Text' values.
--
-- Works like 'levenshtein', except that the set of allowed edit
-- operations additionally contains the transposition of two /adjacent/
-- characters.
--
-- Thin wrapper that delegates to @Data.Text.Metrics.damerauLevenshtein@.
--
-- See also:
-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein source target = DTM.damerauLevenshtein source target
67
-- | Normalized Damerau-Levenshtein distance between two 'Text' values,
-- returned as a @'Ratio' 'Int'@.
--
-- Thin wrapper that delegates to
-- @Data.Text.Metrics.damerauLevenshteinNorm@.
--
-- NOTE(review): presumably uses the same scale as 'levenshteinNorm'
-- (0 = no similarity, 1 = exact match) -- confirm against text-metrics.
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm source target =
  DTM.damerauLevenshteinNorm source target
72
73 -- Treating inputs like sets
74
-- | Overlap coefficient of two 'Text' values.
--
-- The returned value ranges from 0 (no similarity) to 1 (exact match);
-- it is 1 when both 'Text' values are empty.
--
-- Thin wrapper that delegates to @Data.Text.Metrics.overlap@.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
overlap :: Text -> Text -> Ratio Int
overlap left right = DTM.overlap left right
82
83
-- | Jaccard measure between two 'Text' values treated as sample sets,
-- returned as a @'Ratio' 'Int'@.
--
-- Thin wrapper that delegates to @Data.Text.Metrics.jaccard@.
--
-- NOTE(review): this was previously described as the Jaccard /distance/
-- (a dissimilarity), but the upstream text-metrics function appears to
-- return the Jaccard /similarity coefficient/ (0 = no similarity,
-- 1 = exact match) -- confirm against the pinned text-metrics version.
jaccard :: Text -> Text -> Ratio Int
jaccard left right = DTM.jaccard left right
88
-- | Hamming distance between two 'Text' values.
--
-- In information theory, the Hamming distance between two strings of
-- equal length is the number of positions at which the corresponding
-- symbols differ, i.e. the minimum number of substitutions required to
-- change one string into the other.
--
-- Thin wrapper that delegates to @Data.Text.Metrics.hamming@.
--
-- NOTE(review): the 'Maybe' result is presumably 'Nothing' when the two
-- inputs have different lengths -- confirm against text-metrics.
--
-- See: <https://en.wikipedia.org/wiki/Hamming_distance>
hamming :: Text -> Text -> Maybe Int
hamming left right = DTM.hamming left right
98
99