1 {-# OPTIONS_GHC -Wno-orphans #-}
3 module Phylomemy.Indexation
8 import Data.Eq (Eq (..))
9 import Data.Foldable (toList)
10 import Data.Function ((&))
11 import Data.Functor ((<&>))
13 import Data.Map.Strict qualified as Map
14 import Data.Monoid (Monoid (..))
15 import Data.Ord (Ord (..))
16 import Data.Semigroup (Semigroup (..))
17 import Data.Sequence qualified as Seq
18 import Data.Set qualified as Set
19 import Data.String (IsString (..))
20 import Data.Text.Short (ShortText)
21 import Data.Validity (Validity (..), declare, delve, trivialValidation)
22 import Data.Validity.Map ()
23 import Data.Validity.Set ()
24 import Data.Validity.Time ()
25 import GHC.Generics (Generic)
27 import Logic.Theory.Arithmetic (Zero)
28 import Logic.Theory.Ord (type (>))
30 -- import Numeric.Probability (Probability)
31 import Text.Show (Show)
33 import Clustering.FrequentItemSet.LCM qualified as Clustering
35 -- | A contiguous sequence of n terms
36 newtype Ngram = Ngram {unNgram :: ShortText}
37 deriving (Eq, Generic, Ord, IsString)
38 deriving newtype (Show)
40 instance Validity Ngram where
41 validate = trivialValidation
43 -- | A 'Root' is a set of `Ngram`s conveying the same meaning
44 -- (according to the analyst).
47 , rootSynonyms :: Set.Set Ngram
49 deriving (Eq, Generic, Ord, Show)
51 instance IsString Root where
54 { rootLabel = fromString s
55 , rootSynonyms = Set.empty
58 instance Validity Root where
61 [ delve "rootLabel" (rootLabel r)
63 "The rootLabel is not a member of the rootSynonyms"
64 (Set.notMember (rootLabel r) (rootSynonyms r))
65 , delve "rootSynonyms" (rootSynonyms r)
68 type Roots = Clustering.ItemSet Root
69 type Foundations = Set.Set Root
71 data Document pos = Document
72 { documentPosition :: pos
73 -- ^ A position could be a date, a section, a page, an IP address, …
74 , documentRoots :: Map.Map Root ()
75 -- , documentContent :: a
77 deriving (Eq, Generic, Show)
78 instance Validity pos => Validity (Document pos)
80 type DocumentByRange range pos = Map.Map range (Seq.Seq (Document pos))
82 documentsByRange :: Ord range => (pos -> range) -> [Document pos] -> DocumentByRange range pos
83 documentsByRange mapKey docs =
86 [ (mapKey (documentPosition doc), Seq.singleton doc)
90 -- | "Clustering.FrequentItemSet.BruteForce"
91 -- and [the BF]("Clustering.FrequentItemSet.BruteForce")
92 -- and [the Doc]("Document")
93 data RootsOf docs = RootsOfAxiom
98 -- `clusterize` roots minSupp minSize docs
100 -- returns for each range the clusters of `Document`
101 -- according to the frequent item set similarity.
103 -- TODO: If a given period eventually ends up without any FIS,
104 -- we lower both the support and the size until we succeed in repopulating it.
108 minSupp ::: Int / minSupp > Zero ->
109 minSize ::: Int / minSize > Zero ->
110 docsByRange ::: (range :-> Seq.Seq (Document pos)) ->
113 -- Clustering.ClosedFrequentItemSets roots (RootsOf docsByRange) minSupp minSize :::
114 Cluster :-> Seq.Seq (Clustering.Transaction Root (Document pos))
115 clusterize roots minSupp minSize (Named docsByRange) =
116 -- TODO: currently `Clustering.closedFrequentItemSets` does not return the `Transaction`s
117 -- supporting the closed FIS, maybe it should do it.
118 -- In the meantime, collect those after running `closedFrequentItemSets`.
119 docsByRange <&> \docs ->
120 let closedFISs :: [(Clustering.ItemSupport, Cluster)] =
121 Clustering.closedFrequentItemSets minSupp minSize roots (
123 [ documentRoots doc & Map.keys & Set.fromList
124 | doc <- docs & toList
126 Map.fromListWith (<>)
127 [ (c, Seq.singleton Clustering.Transaction
128 { transactionData = doc
129 , transactionItems = documentRoots doc & Map.keys & Set.fromList
131 | doc <- docs & toList
132 , (_supp, c:: Cluster) <- closedFISs
133 , Set.isSubsetOf c (documentRoots doc & Map.keys & Set.fromList)
136 type MapList k a = [(k, a)]
138 type Cluster = Clustering.ItemSet Root
141 data Range pos = Range
144 -- , periodScales :: [Scale]
146 deriving (Eq, Ord, Generic, Show)
147 instance Validity pos => Validity (Range pos)
149 type Vocabulary = Map.Map Root ()