module Phylomemy.Similarity ( type RootTuple (..), rootTuple, type RootMatrix, type DocumentCoOccurences (), type CoOccurences (..), documentCoOccurences, type Confidences (), type Similarities, confidences, ) where import Control.Applicative (Applicative (..)) import Data.Bool import Data.Eq (Eq) import Data.Function (($)) import Data.Functor ((<$>)) import Data.Map.Strict qualified as Map import Data.Maybe (Maybe (..)) import Data.Monoid (Monoid (..)) import Data.Ord (Ord (..)) import Data.Ratio ((%)) import Data.Semigroup (Semigroup (..), (<>)) import Data.Validity (Validity (..), declare) import GHC.Generics (Generic) import Logic import Numeric.Decimal (MonadThrow) import Numeric.Natural (Natural) import Text.Show (Show) import Prelude (fromIntegral, (+)) import Numeric.Probability import Phylomemy.Indexation -- | root-to-root co-occurrence matrix type Count = Natural -- | Orderered Tuple newtype RootTuple = RootTuple (Root, Root) rootTuple :: Root -> Root -> RootTuple rootTuple i j = RootTuple (if i <= j then (i, j) else (j, i)) type RootMatrix a = Map.Map Root (Map.Map Root a) -- Encode a symmetric matrix of co-occurences of roots inside 'Document's. -- by mapping each root 'i', to a map of each root 'j' greater or equal to 'i', -- to the number of documents containing both 'i' and 'j'. -- -- The special case @(i == j)@ is thus the number of document containing 'i'. -- -- When 'j' does not appear in any document containing 'i', -- there is no entry for 'j' in the map under 'i'. -- -- When 'i' > 'j', -- there is no entry for 'j' in the map under 'i', -- because it belongs to the map under 'j'. newtype CoOccurences = CoOccurences (RootMatrix Count) deriving (Eq, Show, Generic) instance Semigroup CoOccurences where (<>) (CoOccurences x) (CoOccurences y) = CoOccurences (Map.unionWith (Map.unionWith (+)) x y) instance Monoid CoOccurences where mempty = CoOccurences Map.empty instance Validity CoOccurences where validate (CoOccurences is) = mconcat [ declare "The CoOccurences is valid" $ Map.foldrWithKey ( \i js iAcc -> iAcc && Map.foldrWithKey ( \j c jAcc -> jAcc && i <= j && 1 <= c ) True js ) True is ] data DocumentCoOccurences document = DocumentCoOccurences -- | @(documentCoOccurences document)@ returns the trivial case of `documentCoOccurences` -- for a single @(document)@ where all its `documentRoots` -- co-occurs with each others, in one @(document)@ (this one). documentCoOccurences :: document ::: Document pos -> DocumentCoOccurences document ::: CoOccurences documentCoOccurences (Named doc) = (DocumentCoOccurences ...) $ CoOccurences $ (`Map.mapWithKey` documentRoots doc) $ \i () -> (`Map.mapMaybeWithKey` documentRoots doc) $ \j () -> if i <= j then Just 1 else Nothing -- | A similarity defines what is considered to be meaningful relation between root terms. type Similarities similarity = RootMatrix similarity data Confidences document = Confidences -- | First order / syntagmatic axis. -- A confidence is a weak logic implication. -- -- Definition: in `Phylomemy.References.RefDrawMeScience`, « C.2 Similarity measures (²Φ) » -- -- > The similarity measure P(x, y) between n-grams x and y -- > is a function of the number of documents that mention both of them. -- > [Here] the similarity measure i[s] the confidence -- > defined as the maximum of the two probabilities -- > of having a term knowing the presence of the other in the same contextual unit -- > @P(x, y) = max(P(x|y), P(y|x))@. confidences :: MonadThrow m => DocumentCoOccurences document ::: CoOccurences -> m (Confidences document ::: Similarities Probability) confidences (Named (CoOccurences i2j2c)) = (Confidences ...) <$> Map.traverseWithKey ( \i j2c -> let ii = fromIntegral (j2c Map.! i) in Map.traverseWithKey ( \j c -> do let ij = fromIntegral c let jj = fromIntegral $ i2j2c Map.! j Map.! j max <$> probability (ij % ii) <*> probability (ij % jj) ) j2c ) i2j2c