1 {-# OPTIONS_GHC -Wno-orphans #-}
3 module Phylomemy.Indexation
8 import Data.Eq (Eq (..))
9 import Data.Foldable (toList)
10 import Data.Function ((&))
11 import Data.Functor ((<&>))
13 import Data.Map.Strict qualified as Map
14 import Data.Monoid (Monoid (..))
15 import Data.Ord (Ord (..))
16 import Data.Semigroup (Semigroup (..))
17 import Data.Sequence qualified as Seq
18 import Data.Set qualified as Set
19 import Data.String (IsString (..))
20 import Data.Text.Short (ShortText)
21 import Data.Validity (Validity (..), declare, delve, trivialValidation)
22 import Data.Validity.Map ()
23 import Data.Validity.Set ()
24 import Data.Validity.Time ()
25 import GHC.Generics (Generic)
27 import Logic.Theory.Arithmetic (Zero)
28 import Logic.Theory.Ord (type (>))
30 -- import Numeric.Probability (Probability)
31 import Text.Show (Show)
33 import Clustering.FrequentItemSet.LCM qualified as Clustering
35 -- | A contiguous sequence of n terms
36 newtype Ngram = Ngram {unNgram :: ShortText}
37 deriving (Eq, Generic, Ord, IsString)
38 deriving newtype (Show)
40 instance Validity Ngram where
41 validate = trivialValidation
43 -- | A 'Root' is a set of `Ngram`s conveying the same meaning
44 -- (according to the analyst).
47 , rootSynonyms :: Set.Set Ngram
49 deriving (Eq, Generic, Ord, Show)
51 instance IsString Root where
54 { rootLabel = fromString s
55 , rootSynonyms = Set.empty
58 instance Validity Root where
61 [ delve "rootLabel" (rootLabel r)
63 "The rootLabel is not a member of the rootSynonyms"
64 (Set.notMember (rootLabel r) (rootSynonyms r))
65 , delve "rootSynonyms" (rootSynonyms r)
68 type Roots = Clustering.ItemSet Root
69 type Foundations = Set.Set Root
71 data Document pos = Document
72 { documentPosition :: pos
73 -- ^ A position could be a date, a section, a page, an IP address, …
74 , documentRoots :: Map.Map Root ()
75 -- , documentContent :: a
77 deriving (Eq, Generic, Show)
78 instance Validity pos => Validity (Document pos)
80 type DocumentByRange range pos = Map.Map range (Seq.Seq (Document pos))
82 documentsByRange :: Ord range => (pos -> range) -> [Document pos] -> DocumentByRange range pos
83 documentsByRange mapKey docs =
86 [ (mapKey (documentPosition doc), Seq.singleton doc)
90 -- | "Clustering.FrequentItemSet.BruteForce"
91 -- and [the BF]("Clustering.FrequentItemSet.BruteForce")
92 -- and [the Doc]("Document")
93 data RootsOf docs = RootsOfAxiom
96 -- `clusterize` roots minSupp minSize docs
98 -- returns for each range the clusters of `Document`
99 -- according to the frequent item set similarity.
101 -- TODO: If a given period eventually ends up without any FIS,
102 -- we lower both the support and the size until we succeed in repopulating it.
106 minSupp ::: Int / minSupp > Zero ->
107 minSize ::: Int / minSize > Zero ->
108 docsByRange ::: (range :-> Seq.Seq (Document pos)) ->
111 -- Clustering.ClosedFrequentItemSets roots (RootsOf docsByRange) minSupp minSize :::
113 :-> Seq.Seq (Clustering.Transaction Root (Document pos))
114 clusterize roots minSupp minSize (Named docsByRange) =
115 -- TODO: currently `Clustering.closedFrequentItemSets` does not return the `Transaction`s
116 -- supporting the closed FIS, maybe it should do it.
117 -- In the meantime, collect those after running `closedFrequentItemSets`.
118 docsByRange <&> \docs ->
119 let closedFISs :: [(Clustering.ItemSupport, Cluster)] =
120 Clustering.closedFrequentItemSets
125 [ documentRoots doc & Map.keys & Set.fromList
126 | doc <- docs & toList
134 Clustering.Transaction
135 { transactionData = doc
136 , transactionItems = documentRoots doc & Map.keys & Set.fromList
139 | doc <- docs & toList
140 , (_supp, c :: Cluster) <- closedFISs
141 , Set.isSubsetOf c (documentRoots doc & Map.keys & Set.fromList)
144 type MapList k a = [(k, a)]
146 type Cluster = Clustering.ItemSet Root
149 data Range pos = Range
152 -- , periodScales :: [Scale]
154 deriving (Eq, Ord, Generic, Show)
155 instance Validity pos => Validity (Range pos)
157 type Vocabulary = Map.Map Root ()