1 module Phylomemy.Similarity (
 
   5   type DocumentCoOccurences (),
 
   6   type CoOccurences (..),
 
  13 import Control.Applicative (Applicative (..))
 
  16 import Data.Function (($))
 
  17 import Data.Functor ((<$>))
 
  18 import Data.Map.Strict qualified as Map
 
  19 import Data.Maybe (Maybe (..))
 
  20 import Data.Monoid (Monoid (..))
 
  21 import Data.Ord (Ord (..))
 
  22 import Data.Ratio ((%))
 
  23 import Data.Semigroup (Semigroup (..), (<>))
 
  24 import Data.Validity (Validity (..), declare)
 
  25 import GHC.Generics (Generic)
 
  27 import Numeric.Decimal (MonadThrow)
 
  28 import Numeric.Natural (Natural)
 
  29 import Text.Show (Show)
 
  30 import Prelude (fromIntegral, (+))
 
  32 import Numeric.Probability
 
  33 import Phylomemy.Indexation
 
  35 -- | root-to-root co-occurrence matrix
 
  39 newtype RootTuple = RootTuple (Root, Root)
 
  41 rootTuple :: Root -> Root -> RootTuple
 
  42 rootTuple i j = RootTuple (if i <= j then (i, j) else (j, i))
 
  44 type RootMatrix a = Map.Map Root (Map.Map Root a)
 
  46 -- Encode a symmetric matrix of co-occurences of roots inside 'Document's.
 
  47 -- by mapping each root 'i', to a map of each root 'j' greater or equal to 'i',
 
  48 -- to the number of documents containing both 'i' and 'j'.
 
  50 -- The special case @(i == j)@ is thus the number of document containing 'i'.
 
  52 -- When 'j' does not appear in any document containing 'i',
 
  53 -- there is no entry for 'j' in the map under 'i'.
 
  56 -- there is no entry for 'j' in the map under 'i',
 
  57 -- because it belongs to the map under 'j'.
 
  58 newtype CoOccurences = CoOccurences (RootMatrix Count)
 
  59   deriving (Eq, Show, Generic)
 
  61 instance Semigroup CoOccurences where
 
  62   (<>) (CoOccurences x) (CoOccurences y) =
 
  63     CoOccurences (Map.unionWith (Map.unionWith (+)) x y)
 
  64 instance Monoid CoOccurences where
 
  65   mempty = CoOccurences Map.empty
 
  66 instance Validity CoOccurences where
 
  67   validate (CoOccurences is) =
 
  69       [ declare "The CoOccurences is valid" $
 
  75                         jAcc && i <= j && 1 <= c
 
  84 data DocumentCoOccurences document = DocumentCoOccurences
 
  86 -- | @(documentCoOccurences document)@ returns the trivial case of `documentCoOccurences`
 
  87 -- for a single @(document)@ where all its `documentRoots`
 
  88 -- co-occurs with each others, in one @(document)@ (this one).
 
  89 documentCoOccurences :: document ::: Document pos -> DocumentCoOccurences document ::: CoOccurences
 
  90 documentCoOccurences (Named doc) = (DocumentCoOccurences ...) $
 
  92     (`Map.mapWithKey` documentRoots doc) $ \i () ->
 
  93       (`Map.mapMaybeWithKey` documentRoots doc) $ \j () ->
 
  94         if i <= j then Just 1 else Nothing
 
  96 -- | A similarity defines what is considered to be meaningful relation between root terms.
 
  97 type Similarities similarity = RootMatrix similarity
 
  99 data Confidences document = ConfidencesAxiom
 
 101 -- | First order / syntagmatic axis.
 
 102 -- A confidence is a weak logic implication.
 
 104 -- Definition: in `Phylomemy.References.RefDrawMeScience`, « C.2 Similarity measures (²Φ) »
 
 106 -- > The similarity measure P(x, y) between n-grams x and y
 
 107 -- > is a function of the number of documents that mention both of them.
 
 108 -- > [Here] the similarity measure i[s] the confidence
 
 109 -- > defined as the maximum of the two probabilities
 
 110 -- > of having a term knowing the presence of the other in the same contextual unit
 
 111 -- > @P(x, y) = max(P(x|y), P(y|x))@.
 
 114   DocumentCoOccurences document ::: CoOccurences ->
 
 115   m (Confidences document ::: Similarities Probability)
 
 116 confidences (Named (CoOccurences i2j2c)) =
 
 117   (ConfidencesAxiom ...)
 
 118     <$> Map.traverseWithKey
 
 120           let ii = fromIntegral (j2c Map.! i)
 
 121           in Map.traverseWithKey
 
 123                   let ij = fromIntegral c
 
 124                   let jj = fromIntegral $ i2j2c Map.! j Map.! j
 
 125                   max <$> probability (ij % ii) <*> probability (ij % jj)