1 {-# OPTIONS_GHC -Wno-orphans #-}
 
   3 module Phylomemy.Indexation
 
   8 import Data.Eq (Eq (..))
 
   9 import Data.Foldable (toList)
 
  10 import Data.Function ((&))
 
  11 import Data.Functor ((<&>))
 
  13 import Data.Map.Strict qualified as Map
 
  14 import Data.Monoid (Monoid (..))
 
  15 import Data.Ord (Ord (..))
 
  16 import Data.Semigroup (Semigroup (..))
 
  17 import Data.Sequence qualified as Seq
 
  18 import Data.Set qualified as Set
 
  19 import Data.String (IsString (..))
 
  20 import Data.Text.Short (ShortText)
 
  21 import Data.Validity (Validity (..), declare, delve, trivialValidation)
 
  22 import Data.Validity.Map ()
 
  23 import Data.Validity.Set ()
 
  24 import Data.Validity.Time ()
 
  25 import GHC.Generics (Generic)
 
  27 import Logic.Theory.Arithmetic (Zero)
 
  28 import Logic.Theory.Ord (type (>))
 
  30 -- import Numeric.Probability (Probability)
 
  31 import Text.Show (Show)
 
  33 import Clustering.FrequentItemSet.LCM qualified as Clustering
 
  35 -- | A contiguous sequence of n terms
 
  36 newtype Ngram = Ngram {unNgram :: ShortText}
 
  37   deriving (Eq, Generic, Ord, IsString)
 
  38   deriving newtype (Show)
 
  40 instance Validity Ngram where
 
  41   validate = trivialValidation
 
  43 -- | A 'Root' is a set of `Ngram`s conveying the same meaning
 
  44 -- (according to the analyst).
 
  47   , rootSynonyms :: Set.Set Ngram
 
  49   deriving (Eq, Generic, Ord, Show)
 
  51 instance IsString Root where
 
  54       { rootLabel = fromString s
 
  55       , rootSynonyms = Set.empty
 
  58 instance Validity Root where
 
  61       [ delve "rootLabel" (rootLabel r)
 
  63           "The rootLabel is not a member of the rootSynonyms"
 
  64           (Set.notMember (rootLabel r) (rootSynonyms r))
 
  65       , delve "rootSynonyms" (rootSynonyms r)
 
  68 type Roots = Clustering.ItemSet Root
 
  69 type Foundations = Set.Set Root
 
  71 data Document pos = Document
 
  72   { documentPosition :: pos
 
  73   -- ^ A position could be a date, a section, a page, an IP address, …
 
  74   , documentRoots :: Map.Map Root ()
 
  75   -- , documentContent :: a
 
  77   deriving (Eq, Generic, Show)
 
  78 instance Validity pos => Validity (Document pos)
 
  80 type DocumentByRange range pos = Map.Map range (Seq.Seq (Document pos))
 
  82 documentsByRange :: Ord range => (pos -> range) -> [Document pos] -> DocumentByRange range pos
 
  83 documentsByRange mapKey docs =
 
  86     [ (mapKey (documentPosition doc), Seq.singleton doc)
 
  90 -- | "Clustering.FrequentItemSet.BruteForce"
 
  91 -- and [the BF]("Clustering.FrequentItemSet.BruteForce")
 
  92 -- and [the Doc]("Document")
 
  93 data RootsOf docs = RootsOfAxiom
 
  96 -- `clusterize` roots minSupp minSize docs
 
  98 -- returns for each range the clusters of `Document`
 
  99 -- according to the frequent item set similarity.
 
 101 -- TODO: If a given period eventually ends up without any FIS,
 
 102 -- we lower both the support and the size until we succeed in repopulating it.
 
 106   minSupp ::: Int / minSupp > Zero ->
 
 107   minSize ::: Int / minSize > Zero ->
 
 108   docsByRange ::: (range :-> Seq.Seq (Document pos)) ->
 
 111     -- Clustering.ClosedFrequentItemSets roots (RootsOf docsByRange) minSupp minSize :::
 
 113     :-> Seq.Seq (Clustering.Transaction Root (Document pos))
 
 114 clusterize roots minSupp minSize (Named docsByRange) =
 
 115   -- TODO: currently `Clustering.closedFrequentItemSets` does not return the `Transaction`s
 
 116   -- supporting the closed FIS, maybe it should do it.
 
 117   -- In the meantime, collect those after running `closedFrequentItemSets`.
 
 118   docsByRange <&> \docs ->
 
 119     let closedFISs :: [(Clustering.ItemSupport, Cluster)] =
 
 120           Clustering.closedFrequentItemSets
 
 125                 [ documentRoots doc & Map.keys & Set.fromList
 
 126                 | doc <- docs & toList
 
 134               Clustering.Transaction
 
 135                 { transactionData = doc
 
 136                 , transactionItems = documentRoots doc & Map.keys & Set.fromList
 
 139         | doc <- docs & toList
 
 140         , (_supp, c :: Cluster) <- closedFISs
 
 141         , Set.isSubsetOf c (documentRoots doc & Map.keys & Set.fromList)
 
 144 type MapList k a = [(k, a)]
 
 146 type Cluster = Clustering.ItemSet Root
 
 149 data Range pos = Range
 
 152   -- , periodScales :: [Scale]
 
 154   deriving (Eq, Ord, Generic, Show)
 
 155 instance Validity pos => Validity (Range pos)
 
 157 type Vocabulary = Map.Map Root ()