]> Git — Sourcephile - literate-phylomemy.git/blob - src/Phylomemy/Similarity.hs
init
[literate-phylomemy.git] / src / Phylomemy / Similarity.hs
1 module Phylomemy.Similarity (
2 type RootTuple (..),
3 rootTuple,
4 type RootMatrix,
5 type DocumentCoOccurences (),
6 type CoOccurences (..),
7 documentCoOccurences,
8 type Confidences (),
9 type Similarities,
10 confidences,
11 ) where
12
13 import Control.Applicative (Applicative (..))
14 import Data.Bool
15 import Data.Eq (Eq)
16 import Data.Function (($))
17 import Data.Functor ((<$>))
18 import Data.Map.Strict qualified as Map
19 import Data.Maybe (Maybe (..))
20 import Data.Monoid (Monoid (..))
21 import Data.Ord (Ord (..))
22 import Data.Ratio ((%))
23 import Data.Semigroup (Semigroup (..), (<>))
24 import Data.Validity (Validity (..), declare)
25 import GHC.Generics (Generic)
26 import Logic
27 import Numeric.Decimal (MonadThrow)
28 import Numeric.Natural (Natural)
29 import Text.Show (Show)
30 import Prelude (fromIntegral, (+))
31
32 import Numeric.Probability
33 import Phylomemy.Indexation
34
35 -- | root-to-root co-occurrence matrix
36 type Count = Natural
37
38 -- | Orderered Tuple
39 newtype RootTuple = RootTuple (Root, Root)
40
41 rootTuple :: Root -> Root -> RootTuple
42 rootTuple i j = RootTuple (if i <= j then (i, j) else (j, i))
43
44 type RootMatrix a = Map.Map Root (Map.Map Root a)
45
46 -- Encode a symmetric matrix of co-occurences of roots inside 'Document's.
47 -- by mapping each root 'i', to a map of each root 'j' greater or equal to 'i',
48 -- to the number of documents containing both 'i' and 'j'.
49 --
50 -- The special case @(i == j)@ is thus the number of document containing 'i'.
51 --
52 -- When 'j' does not appear in any document containing 'i',
53 -- there is no entry for 'j' in the map under 'i'.
54 --
55 -- When 'i' > 'j',
56 -- there is no entry for 'j' in the map under 'i',
57 -- because it belongs to the map under 'j'.
58 newtype CoOccurences = CoOccurences (RootMatrix Count)
59 deriving (Eq, Show, Generic)
60
61 instance Semigroup CoOccurences where
62 (<>) (CoOccurences x) (CoOccurences y) =
63 CoOccurences (Map.unionWith (Map.unionWith (+)) x y)
64 instance Monoid CoOccurences where
65 mempty = CoOccurences Map.empty
66 instance Validity CoOccurences where
67 validate (CoOccurences is) =
68 mconcat
69 [ declare "The CoOccurences is valid" $
70 Map.foldrWithKey
71 ( \i js iAcc ->
72 iAcc
73 && Map.foldrWithKey
74 ( \j c jAcc ->
75 jAcc && i <= j && 1 <= c
76 )
77 True
78 js
79 )
80 True
81 is
82 ]
83
84 data DocumentCoOccurences document = DocumentCoOccurences
85
86 -- | @(documentCoOccurences document)@ returns the trivial case of `documentCoOccurences`
87 -- for a single @(document)@ where all its `documentRoots`
88 -- co-occurs with each others, in one @(document)@ (this one).
89 documentCoOccurences :: document ::: Document pos -> DocumentCoOccurences document ::: CoOccurences
90 documentCoOccurences (Named doc) = (DocumentCoOccurences ...) $
91 CoOccurences $
92 (`Map.mapWithKey` documentRoots doc) $ \i () ->
93 (`Map.mapMaybeWithKey` documentRoots doc) $ \j () ->
94 if i <= j then Just 1 else Nothing
95
96 -- | A similarity defines what is considered to be meaningful relation between root terms.
97 type Similarities similarity = RootMatrix similarity
98
99 data Confidences document = Confidences
100
101 -- | First order / syntagmatic axis.
102 -- A confidence is a weak logic implication.
103 --
104 -- Definition: in `Phylomemy.References.RefDrawMeScience`, « C.2 Similarity measures (²Φ) »
105 --
106 -- > The similarity measure P(x, y) between n-grams x and y
107 -- > is a function of the number of documents that mention both of them.
108 -- > [Here] the similarity measure i[s] the confidence
109 -- > defined as the maximum of the two probabilities
110 -- > of having a term knowing the presence of the other in the same contextual unit
111 -- > @P(x, y) = max(P(x|y), P(y|x))@.
112 confidences ::
113 MonadThrow m =>
114 DocumentCoOccurences document ::: CoOccurences ->
115 m (Confidences document ::: Similarities Probability)
116 confidences (Named (CoOccurences i2j2c)) =
117 (Confidences ...)
118 <$> Map.traverseWithKey
119 ( \i j2c ->
120 let ii = fromIntegral (j2c Map.! i)
121 in Map.traverseWithKey
122 ( \j c -> do
123 let ij = fromIntegral c
124 let jj = fromIntegral $ i2j2c Map.! j Map.! j
125 max <$> probability (ij % ii) <*> probability (ij % jj)
126 )
127 j2c
128 )
129 i2j2c