]> Git — Sourcephile - literate-phylomemy.git/blob - src/Phylomemy/Indexation.hs
init
[literate-phylomemy.git] / src / Phylomemy / Indexation.hs
1 {-# OPTIONS_GHC -Wno-orphans #-}
2
3 module Phylomemy.Indexation
4 where
5
6 -- TODO: ( … )
7
8 import Data.Eq (Eq (..))
9 import Data.Foldable (toList)
10 import Data.Function ((&))
11 import Data.Functor ((<&>))
12 import Data.Int (Int)
13 import Data.Map.Strict qualified as Map
14 import Data.Monoid (Monoid (..))
15 import Data.Ord (Ord (..))
16 import Data.Semigroup (Semigroup (..))
17 import Data.Sequence qualified as Seq
18 import Data.Set qualified as Set
19 import Data.String (IsString (..))
20 import Data.Text.Short (ShortText)
21 import Data.Validity (Validity (..), declare, delve, trivialValidation)
22 import Data.Validity.Map ()
23 import Data.Validity.Set ()
24 import Data.Validity.Time ()
25 import GHC.Generics (Generic)
26 import Logic
27 import Logic.Theory.Arithmetic (Zero)
28 import Logic.Theory.Ord (type (>))
29
30 -- import Numeric.Probability (Probability)
31 import Text.Show (Show)
32
33 import Clustering.FrequentItemSet.LCM qualified as Clustering
34
35 -- | A contiguous sequence of n terms
36 newtype Ngram = Ngram {unNgram :: ShortText}
37 deriving (Eq, Generic, Ord, IsString)
38 deriving newtype (Show)
39
40 instance Validity Ngram where
41 validate = trivialValidation
42
43 -- | A 'Root' is a set of `Ngram`s conveying the same meaning
44 -- (according to the analyst).
45 data Root = Root
46 { rootLabel :: Ngram
47 , rootSynonyms :: Set.Set Ngram
48 }
49 deriving (Eq, Generic, Ord, Show)
50
51 instance IsString Root where
52 fromString s =
53 Root
54 { rootLabel = fromString s
55 , rootSynonyms = Set.empty
56 }
57
58 instance Validity Root where
59 validate r =
60 mconcat
61 [ delve "rootLabel" (rootLabel r)
62 , declare
63 "The rootLabel is not a member of the rootSynonyms"
64 (Set.notMember (rootLabel r) (rootSynonyms r))
65 , delve "rootSynonyms" (rootSynonyms r)
66 ]
67
68 type Roots = Clustering.ItemSet Root
69 type Foundations = Set.Set Root
70
71 data Document pos = Document
72 { documentPosition :: pos
73 -- ^ A position could be a date, a section, a page, an IP address, …
74 , documentRoots :: Map.Map Root ()
75 -- , documentContent :: a
76 }
77 deriving (Eq, Generic, Show)
78 instance Validity pos => Validity (Document pos)
79
80 type DocumentByRange range pos = Map.Map range (Seq.Seq (Document pos))
81
82 documentsByRange :: Ord range => (pos -> range) -> [Document pos] -> DocumentByRange range pos
83 documentsByRange mapKey docs =
84 Map.fromListWith
85 (<>)
86 [ (mapKey (documentPosition doc), Seq.singleton doc)
87 | doc <- docs
88 ]
89
90 -- | "Clustering.FrequentItemSet.BruteForce"
91 -- and [the BF]("Clustering.FrequentItemSet.BruteForce")
92 -- and [the Doc]("Document")
93 data RootsOf docs = RootsOfAxiom
94
95 -- @
96 -- `clusterize` roots minSupp minSize docs
97 -- @
98 -- returns for each range the clusters of `Document`
99 -- according to the frequent item set similarity.
100 --
101 -- TODO: If a given period eventually ends up without any FIS,
102 -- we lower both the support and the size until we succeed in repopulating it.
103 clusterize ::
104 Show pos =>
105 roots ::: Roots ->
106 minSupp ::: Int / minSupp > Zero ->
107 minSize ::: Int / minSize > Zero ->
108 docsByRange ::: (range :-> Seq.Seq (Document pos)) ->
109 range
110 :->
111 -- Clustering.ClosedFrequentItemSets roots (RootsOf docsByRange) minSupp minSize :::
112 Cluster
113 :-> Seq.Seq (Clustering.Transaction Root (Document pos))
114 clusterize roots minSupp minSize (Named docsByRange) =
115 -- TODO: currently `Clustering.closedFrequentItemSets` does not return the `Transaction`s
116 -- supporting the closed FIS, maybe it should do it.
117 -- In the meantime, collect those after running `closedFrequentItemSets`.
118 docsByRange <&> \docs ->
119 let closedFISs :: [(Clustering.ItemSupport, Cluster)] =
120 Clustering.closedFrequentItemSets
121 minSupp
122 minSize
123 roots
124 ( RootsOfAxiom ...
125 [ documentRoots doc & Map.keys & Set.fromList
126 | doc <- docs & toList
127 ]
128 )
129 & unName
130 in Map.fromListWith
131 (<>)
132 [ ( c
133 , Seq.singleton
134 Clustering.Transaction
135 { transactionData = doc
136 , transactionItems = documentRoots doc & Map.keys & Set.fromList
137 }
138 )
139 | doc <- docs & toList
140 , (_supp, c :: Cluster) <- closedFISs
141 , Set.isSubsetOf c (documentRoots doc & Map.keys & Set.fromList)
142 ]
143
144 type MapList k a = [(k, a)]
145 type (:->) = Map.Map
146 type Cluster = Clustering.ItemSet Root
147 infixr 0 :->
148
149 data Range pos = Range
150 { rangeMin :: pos
151 , rangeMax :: pos
152 -- , periodScales :: [Scale]
153 }
154 deriving (Eq, Ord, Generic, Show)
155 instance Validity pos => Validity (Range pos)
156
157 type Vocabulary = Map.Map Root ()