{-# OPTIONS_GHC -Wno-orphans #-} module Phylomemy.Indexation where -- TODO: ( … ) import Data.Eq (Eq (..)) import Data.Foldable (toList) import Data.Function ((&)) import Data.Functor ((<&>)) import Data.Int (Int) import Data.Map.Strict qualified as Map import Data.Monoid (Monoid (..)) import Data.Ord (Ord (..)) import Data.Semigroup (Semigroup (..)) import Data.Sequence qualified as Seq import Data.Set qualified as Set import Data.String (IsString (..)) import Data.Text.Short (ShortText) import Data.Validity (Validity (..), declare, delve, trivialValidation) import Data.Validity.Map () import Data.Validity.Set () import Data.Validity.Time () import GHC.Generics (Generic) import Logic import Logic.Theory.Arithmetic (Zero) import Logic.Theory.Ord (type (>)) -- import Numeric.Probability (Probability) import Text.Show (Show) import Clustering.FrequentItemSet.LCM qualified as Clustering -- | A contiguous sequence of n terms newtype Ngram = Ngram {unNgram :: ShortText} deriving (Eq, Generic, Ord, IsString) deriving newtype (Show) instance Validity Ngram where validate = trivialValidation -- | A 'Root' is a set of `Ngram`s conveying the same meaning -- (according to the analyst). data Root = Root { rootLabel :: Ngram , rootSynonyms :: Set.Set Ngram } deriving (Eq, Generic, Ord, Show) instance IsString Root where fromString s = Root { rootLabel = fromString s , rootSynonyms = Set.empty } instance Validity Root where validate r = mconcat [ delve "rootLabel" (rootLabel r) , declare "The rootLabel is not a member of the rootSynonyms" (Set.notMember (rootLabel r) (rootSynonyms r)) , delve "rootSynonyms" (rootSynonyms r) ] type Roots = Clustering.ItemSet Root type Foundations = Set.Set Root data Document pos = Document { documentPosition :: pos -- ^ A position could be a date, a section, a page, an IP address, … , documentRoots :: Map.Map Root () -- , documentContent :: a } deriving (Eq, Generic, Show) instance Validity pos => Validity (Document pos) type DocumentByRange range pos = Map.Map range (Seq.Seq (Document pos)) documentsByRange :: Ord range => (pos -> range) -> [Document pos] -> DocumentByRange range pos documentsByRange mapKey docs = Map.fromListWith (<>) [ (mapKey (documentPosition doc), Seq.singleton doc) | doc <- docs ] -- | "Clustering.FrequentItemSet.BruteForce" -- and [the BF]("Clustering.FrequentItemSet.BruteForce") -- and [the Doc]("Document") data RootsOf docs = RootsOfAxiom -- @ -- `clusterize` roots minSupp minSize docs -- @ -- returns for each range the clusters of `Document` -- according to the frequent item set similarity. -- -- TODO: If a given period eventually ends up without any FIS, -- we lower both the support and the size until we succeed in repopulating it. clusterize :: Show pos => roots ::: Roots -> minSupp ::: Int / minSupp > Zero -> minSize ::: Int / minSize > Zero -> docsByRange ::: (range :-> Seq.Seq (Document pos)) -> range :-> -- Clustering.ClosedFrequentItemSets roots (RootsOf docsByRange) minSupp minSize ::: Cluster :-> Seq.Seq (Clustering.Transaction Root (Document pos)) clusterize roots minSupp minSize (Named docsByRange) = -- TODO: currently `Clustering.closedFrequentItemSets` does not return the `Transaction`s -- supporting the closed FIS, maybe it should do it. -- In the meantime, collect those after running `closedFrequentItemSets`. docsByRange <&> \docs -> let closedFISs :: [(Clustering.ItemSupport, Cluster)] = Clustering.closedFrequentItemSets minSupp minSize roots ( RootsOfAxiom ... [ documentRoots doc & Map.keys & Set.fromList | doc <- docs & toList ] ) & unName in Map.fromListWith (<>) [ ( c , Seq.singleton Clustering.Transaction { transactionData = doc , transactionItems = documentRoots doc & Map.keys & Set.fromList } ) | doc <- docs & toList , (_supp, c :: Cluster) <- closedFISs , Set.isSubsetOf c (documentRoots doc & Map.keys & Set.fromList) ] type MapList k a = [(k, a)] type (:->) = Map.Map type Cluster = Clustering.ItemSet Root infixr 0 :-> data Range pos = Range { rangeMin :: pos , rangeMax :: pos -- , periodScales :: [Scale] } deriving (Eq, Ord, Generic, Show) instance Validity pos => Validity (Range pos) type Vocabulary = Map.Map Root ()