]> Git — Sourcephile - literate-phylomemy.git/blob - src/Phylomemy/Indexation.hs
init
[literate-phylomemy.git] / src / Phylomemy / Indexation.hs
1 {-# OPTIONS_GHC -Wno-orphans #-}
2
3 module Phylomemy.Indexation
4 where
5
6 -- TODO: ( … )
7
8 import Data.Eq (Eq (..))
9 import Data.Foldable (toList)
10 import Data.Function ((&))
11 import Data.Functor ((<&>))
12 import Data.Int (Int)
13 import Data.Map.Strict qualified as Map
14 import Data.Monoid (Monoid (..))
15 import Data.Ord (Ord (..))
16 import Data.Semigroup (Semigroup (..))
17 import Data.Sequence qualified as Seq
18 import Data.Set qualified as Set
19 import Data.String (IsString (..))
20 import Data.Text.Short (ShortText)
21 import Data.Validity (Validity (..), declare, delve, trivialValidation)
22 import Data.Validity.Map ()
23 import Data.Validity.Set ()
24 import Data.Validity.Time ()
25 import GHC.Generics (Generic)
26 import Logic
27 import Logic.Theory.Arithmetic (Zero)
28 import Logic.Theory.Ord (type (>))
29
30 -- import Numeric.Probability (Probability)
31 import Text.Show (Show)
32
33 import Clustering.FrequentItemSet.LCM qualified as Clustering
34
35 -- | A contiguous sequence of n terms
36 newtype Ngram = Ngram {unNgram :: ShortText}
37 deriving (Eq, Generic, Ord, IsString)
38 deriving newtype (Show)
39
40 instance Validity Ngram where
41 validate = trivialValidation
42
43 -- | A 'Root' is a set of `Ngram`s conveying the same meaning
44 -- (according to the analyst).
45 data Root = Root
46 { rootLabel :: Ngram
47 , rootSynonyms :: Set.Set Ngram
48 }
49 deriving (Eq, Generic, Ord, Show)
50
51 instance IsString Root where
52 fromString s =
53 Root
54 { rootLabel = fromString s
55 , rootSynonyms = Set.empty
56 }
57
58 instance Validity Root where
59 validate r =
60 mconcat
61 [ delve "rootLabel" (rootLabel r)
62 , declare
63 "The rootLabel is not a member of the rootSynonyms"
64 (Set.notMember (rootLabel r) (rootSynonyms r))
65 , delve "rootSynonyms" (rootSynonyms r)
66 ]
67
68 type Roots = Clustering.ItemSet Root
69 type Foundations = Set.Set Root
70
71 data Document pos = Document
72 { documentPosition :: pos
73 -- ^ A position could be a date, a section, a page, an IP address, …
74 , documentRoots :: Map.Map Root ()
75 -- , documentContent :: a
76 }
77 deriving (Eq, Generic, Show)
78 instance Validity pos => Validity (Document pos)
79
80 type DocumentByRange range pos = Map.Map range (Seq.Seq (Document pos))
81
82 documentsByRange :: Ord range => (pos -> range) -> [Document pos] -> DocumentByRange range pos
83 documentsByRange mapKey docs =
84 Map.fromListWith
85 (<>)
86 [ (mapKey (documentPosition doc), Seq.singleton doc)
87 | doc <- docs
88 ]
89
90 -- | "Clustering.FrequentItemSet.BruteForce"
91 -- and [the BF]("Clustering.FrequentItemSet.BruteForce")
92 -- and [the Doc]("Document")
93 data RootsOf docs = RootsOfAxiom
94
95 -- appendGroups
96 -- 3Φ
97 -- @
98 -- `clusterize` roots minSupp minSize docs
99 -- @
100 -- returns for each range the clusters of `Document`
101 -- according to the frequent item set similarity.
102 --
103 -- TODO: If a given period eventually ends up without any FIS,
104 -- we lower both the support and the size until we succeed in repopulating it.
105 clusterize ::
106 Show pos =>
107 roots ::: Roots ->
108 minSupp ::: Int / minSupp > Zero ->
109 minSize ::: Int / minSize > Zero ->
110 docsByRange ::: (range :-> Seq.Seq (Document pos)) ->
111 range
112 :->
113 -- Clustering.ClosedFrequentItemSets roots (RootsOf docsByRange) minSupp minSize :::
114 Cluster :-> Seq.Seq (Clustering.Transaction Root (Document pos))
115 clusterize roots minSupp minSize (Named docsByRange) =
116 -- TODO: currently `Clustering.closedFrequentItemSets` does not return the `Transaction`s
117 -- supporting the closed FIS, maybe it should do it.
118 -- In the meantime, collect those after running `closedFrequentItemSets`.
119 docsByRange <&> \docs ->
120 let closedFISs :: [(Clustering.ItemSupport, Cluster)] =
121 Clustering.closedFrequentItemSets minSupp minSize roots (
122 RootsOfAxiom ...
123 [ documentRoots doc & Map.keys & Set.fromList
124 | doc <- docs & toList
125 ]) & unName in
126 Map.fromListWith (<>)
127 [ (c, Seq.singleton Clustering.Transaction
128 { transactionData = doc
129 , transactionItems = documentRoots doc & Map.keys & Set.fromList
130 } )
131 | doc <- docs & toList
132 , (_supp, c:: Cluster) <- closedFISs
133 , Set.isSubsetOf c (documentRoots doc & Map.keys & Set.fromList)
134 ]
135
136 type MapList k a = [(k, a)]
137 type (:->) = Map.Map
138 type Cluster = Clustering.ItemSet Root
139 infixr 0 :->
140
141 data Range pos = Range
142 { rangeMin :: pos
143 , rangeMax :: pos
144 -- , periodScales :: [Scale]
145 }
146 deriving (Eq, Ord, Generic, Show)
147 instance Validity pos => Validity (Range pos)
148
149 type Vocabulary = Map.Map Root ()