]> Git — Sourcephile - literate-phylomemy.git/blob - src/Phylomemy/TemporalMatching.hs
init
[literate-phylomemy.git] / src / Phylomemy / TemporalMatching.hs
1 {-# LANGUAGE OverloadedStrings #-}
2
3 module Phylomemy.TemporalMatching where
4
5 import Clustering.FrequentItemSet.BruteForce qualified as Clustering
6 import Clustering.UnionFind.ST qualified as UF
7 import Control.Monad (Monad (..), foldM, forM_, mapM_, unless, when, zipWithM_)
8 import Control.Monad.ST qualified as ST
9 import Control.Monad.Trans.Class qualified as MT
10 import Control.Monad.Trans.Reader qualified as MT
11 import Control.Monad.Trans.Writer.CPS qualified as MT
12 import Data.Bool (Bool (..), otherwise, (||))
13 import Data.ByteString.Builder qualified as BS
14 import Data.ByteString.Short qualified as BSh
15 import Data.Either (either)
16 import Data.Eq qualified as Eq
17 import Data.Foldable (any, foldMap, toList)
18 import Data.Function (id, ($), (&), (.))
19 import Data.Functor ((<$>), (<&>))
20 import Data.List qualified as List
21 import Data.Map.Strict qualified as Map
22 import Data.Maybe (Maybe (..), fromJust, maybe)
23 import Data.Monoid (Monoid (..))
24 import Data.Ord (Ord (..))
25 import Data.Ord qualified as Ord
26 import Data.Ratio ((%))
27 import Data.Scientific (toBoundedRealFloat)
28 import Data.Semigroup (Semigroup (..))
29 import Data.Sequence qualified as Seq
30 import Data.Set (Set)
31 import Data.Set qualified as Set
32 import Data.String (IsString (..), String)
33 import Data.Text.Short qualified as TS
34 import Logic
35 import Numeric (showFFloat)
36 import Numeric.Decimal (MonadThrow (..))
37 import Numeric.Decimal qualified as Decimal
38 import Numeric.Probability
39 import Phylomemy.Indexation
40 import Text.Show (Show (..))
41 import Prelude (Double, Num (..), mod, fromIntegral, fromRational, pi, tan, toRational, undefined, (/))
42 import Data.Int (Int)
43
44 -- import Data.DisjointMap qualified as DisMap
45
46 type MST range = Probability :-> (range, Cluster) :-> Seq.Seq (range, Cluster)
47
48 -- | Kruskal algorithm to find the maximal spanning trees.
49 --
50 -- We know in advance the edges which will be deleted,
51 -- they go from the lower weights to the higher weights
52 -- (tie breaking with an order on the vertices)
53 -- hence we can compute a maximal spanning tree (MST) wrt. weights
54 -- such that the next weight causing a split
55 -- is the minimal weight inside this MST.
56 branchesMSF ::
57 forall range.
58 Ord range =>
59 Phylomemy range ->
60 -- range :-> Cluster :-> Probability :-> Seq.Seq (range, Cluster) ->
61 -- Phy branches, each as a maximal spanning tree
62 ((range, Cluster) :-> MST range)
63 branchesMSF phy = ST.runST do
64 -- Create a Point for all given nodes
65 rangeToClusterToPoint :: (range :-> Cluster :-> UF.Point s ((range, Cluster), MST range)) <-
66 (`Map.traverseWithKey` phy) \range ->
67 Map.traverseWithKey \cluster _weightToEdges ->
68 UF.fresh ((range, cluster), Map.empty)
69 -- Iterate from the heavier to the lighter edge
70 forM_ (weightToEdges & Map.toDescList) \(weight, links) -> do
71 -- Iterate through all the links of that weight
72 forM_ (links & toList) \(src@(srcR, srcC), dst@(dstR, dstC)) -> do
73 let srcP = rangeToClusterToPoint Map.! srcR Map.! srcC
74 let dstP = rangeToClusterToPoint Map.! dstR Map.! dstC
75 alreadyInSameMST <- UF.equivalent srcP dstP
76 unless alreadyInSameMST do
77 -- This edge (src -> dst) belongs to an MST
78 UF.union' srcP dstP \(srcD, srcB) (dstD, dstB) -> do
79 let root = min srcD dstD
80 return $
81 (root,) $
82 Map.insertWith (Map.unionWith (<>)) weight (Map.singleton src (Seq.singleton dst)) $
83 Map.unionWith (Map.unionWith (<>)) srcB dstB
84 let points ::
85 [UF.Point s ((range, Cluster), MST range)] =
86 rangeToClusterToPoint
87 & Map.elems
88 & List.concatMap Map.elems
89 rootTrees :: [((range, Cluster), MST range)] <-
90 foldM
91 ( \acc p ->
92 UF.redundant p >>= \case
93 True -> return acc
94 False -> UF.descriptor p <&> (: acc)
95 )
96 []
97 points
98 return $ Map.fromList rootTrees
99 where
100 weightToEdges :: Probability :-> Seq.Seq ((range, Cluster), (range, Cluster))
101 weightToEdges =
102 Map.unionsWith
103 (<>)
104 [ Map.map (\dst -> (\dstC -> (src, (dstR, dstC))) <$> dst) dstWC
105 | (srcR, srcCRWC) <- phy & Map.toList
106 , (srcC, dstRWC) <- srcCRWC & Map.toList
107 , let src = (srcR, srcC)
108 , (dstR, dstWC) <- dstRWC & Map.toList
109 ]
110
111 nodeBranchIndex :: Eq.Eq range => ((range, Cluster) :-> MST range) -> (range, Cluster) -> Maybe Int
112 nodeBranchIndex rootToMST n =
113 List.findIndex
114 (\ (root, mst) ->
115 root Eq.== n ||
116 any
117 ( \links ->
118 any ( \(src, dsts) -> src Eq.== n || any (Eq.== n) dsts)
119 (links & Map.toList)
120 ) mst
121 )
122 (rootToMST & Map.toList)
123
124 -- TODO: branches
125 -- TODO: order clusters of a range by their similarity
126 -- An On-Line Edge-Deletion Problem
127 -- https://dl.acm.org/doi/pdf/10.1145/322234.322235
128 -- type Branches range = DisSet.DisjointSet (range, Cluster)
129 type Phylomemy range = range :-> Cluster :-> range :-> Probability :-> Seq.Seq Cluster
130
131 -- TODO: remove Transaction wrapping in favor of type class giving `itemSet`
132 -- TODO: threshold
133 phylomemy ::
134 Ord range =>
135 {-similarityMeasure :::-} (Cluster -> Cluster -> Probability) ->
136 range :-> Cluster :-> Seq.Seq (Clustering.Transaction Root (Document pos)) ->
137 -- The second range is greater than the first.
138 Phylomemy range
139 phylomemy similarity rangeToClusterToDocs =
140 (`Map.mapWithKey` rangeToClusterToDocs) \range clusterToDocs ->
141 (`Map.mapWithKey` clusterToDocs) \cluster _txs ->
142 let (_, rangeToClusterToDocsGT) = Map.split range rangeToClusterToDocs
143 in (`Map.mapMaybeWithKey` rangeToClusterToDocsGT) \_rangeGT clusterToDocsGT ->
144 let childWC =
145 Map.fromListWith
146 (<>)
147 [ (weight, Seq.singleton clusterGT)
148 | clusterGT <- clusterToDocsGT & Map.keys
149 , let weight = similarity cluster clusterGT
150 , proba0 < weight
151 ]
152 in if Map.null childWC
153 then Nothing
154 else Just childWC
155
156 type PhylomemyDownstream range =
157 -- everything
158 range
159 :-> Cluster
160 :->
161 -- children
162 Probability
163 :-> Seq.Seq (range, Cluster)
164 phylomemyDownstream ::
165 Ord range =>
166 (Cluster -> Cluster -> Probability) ->
167 range :-> Cluster :-> Seq.Seq (Clustering.Transaction Root (Document pos)) ->
168 PhylomemyDownstream range
169 phylomemyDownstream similarity rangeToClusterToDocs =
170 (`Map.mapWithKey` rangeToClusterToDocs) \range clusterToDocs ->
171 (`Map.mapWithKey` clusterToDocs) \cluster _txs ->
172 -- TODO: avoid split
173 let (_, rangeToClusterToDocsGT) = Map.split range rangeToClusterToDocs
174 in Map.fromListWith
175 (<>)
176 [ (similarity cluster clusterGT, Seq.singleton (rangeGT, clusterGT))
177 | (rangeGT, clusterToDocsGT) <- rangeToClusterToDocsGT & Map.toList
178 , clusterGT <- clusterToDocsGT & Map.keys
179 ]
180
181 -- | @
182 -- `phylomemyWeights` phy
183 -- @
184 -- returns the set of weights used by the given `Phylomemy`.
185 phylomemyWeights :: Phylomemy range -> Set Probability
186 phylomemyWeights =
187 Map.foldMapWithKey \_ ->
188 Map.foldMapWithKey \_ ->
189 Map.foldMapWithKey \_ ->
190 Set.fromList . Map.keys
191
192 -- | @
193 -- `phylomemyRaise` minWeight phy
194 -- @
195 -- TODO: remove clusters not linked to
196 phylomemyRaise :: Probability -> Phylomemy range -> Phylomemy range
197 phylomemyRaise minWeight =
198 Map.map $ Map.map $ Map.mapMaybe $ (deleteEmptyMap .) $ \childWC ->
199 let (_lt, eqM, gt) = Map.splitLookup minWeight childWC
200 in maybe gt (\eq -> Map.insert minWeight eq gt) eqM
201 where
202 deleteEmptyMap m
203 | Map.null m = Nothing
204 | otherwise = Just m
205
206 -- | @
207 -- `phylomemyDOT` phy
208 -- @
209 -- returns a graph of the given `Phylomemy` in [DOT](https://graphviz.org/doc/info/lang.html) format.
210 phylomemyDOT :: Ord range => Show range => Phylomemy range -> BS.Builder
211 phylomemyDOT parentRCchildRWC = run do
212 let rootToMST = branchesMSF parentRCchildRWC
213 let branchesNum = Map.size rootToMST
214 comments ["Num of branches: " <> fromString (show branchesNum)]
215 line "digraph g"
216 block do
217 line "splines=\"ortho\""
218 -- line "splines=false"
219 indexFrom1M_
220 (parentRCchildRWC & Map.toList)
221 \(parentR, parentCchildRWC) parentRI -> do
222 let parentRB = "r" <> BS.intDec parentRI
223 line $ "subgraph cluster_" <> parentRB
224 block do
225 comments ["Create a node for the range " <> parentRB]
226 node
227 parentRB
228 [ ("shape", "box")
229 , ("label", builderQuotedString (show parentR))
230 , ("color", "gray")
231 , ("style", "filled")
232 , ("fillcolor", "gray")
233 ]
234 line "color=gray"
235 block do
236 line "rank=same"
237 comments ["Create cluster nodes within the range " <> parentRB]
238 indexFrom1M_
239 (parentCchildRWC & Map.toList)
240 \(parentC, _childRWC) parentCI -> do
241 let parentCL = parentC & Set.toList <&> unNgram . rootLabel
242 let parentBI = fromJust $ nodeBranchIndex rootToMST (parentR, parentC)
243 node
244 (parentRB <> "c" <> BS.intDec parentCI)
245 [ ("label", builderQuotedString (mconcat (List.intersperse ", " (parentCL <&> TS.unpack))))
246 , -- , ("pos", BS.intDec parentCI <> "," <> BS.intDec parentRI <> "!"
247 ("style", "filled")
248 , ("fillcolor", builderQuotedString (show ((parentBI + 1) `mod` 12)))
249 , ("colorscheme", "paired12")
250 ]
251 comments ["Horizontally align nodes within the same range"]
252 when (1 < Map.size parentCchildRWC) do
253 edges
254 [parentRB, parentRB <> "c" <> BS.intDec 1]
255 [("style", "invis")]
256 forM_ (List.zip [1 .. Map.size parentCchildRWC - 1] [2 .. Map.size parentCchildRWC]) \(i, j) ->
257 edges
258 [ parentRB <> "c" <> BS.intDec i
259 , parentRB <> "c" <> BS.intDec j
260 ]
261 [ ("weight", "10")
262 , ("style", "invis")
263 ]
264 -- let parentRCAlignment =
265 -- List.intersperse " -> " $
266 -- "r" <> BS.intDec parentRI :
267 -- [ "r" <> BS.intDec parentRI <> "c" <> BS.intDec parentCI <> ":e -> " <> ""
268 -- | (_parentC, _childRWC) <- parentCchildRWC & Map.toList -- TODO: use Map.keys
269 -- | parentCI <- [1 ..]
270 -- ]
271 -- when (1 < List.length parentRCAlignment) do
272 -- line $ mconcat parentRCAlignment <> "[weight=10,style=dotted]"
273 comments
274 [ "Create edges from clusters of the range " <> parentRB
275 , "to clusters within subsequent ranges"
276 ]
277 indexFrom1M_
278 (parentCchildRWC & Map.toList) -- TODO: use Map.keys
279 \(_parentC, childRWC) parentCI -> do
280 -- TODO: se what to do here, using lookupMin, or not
281 case Map.lookupMin childRWC of
282 Nothing -> return ()
283 Just (childR, childWC) -> do
284 let childRI = 1 + Map.findIndex childR parentRCchildRWC
285 forM_ (childWC & Map.toList) \(weight, childCs) -> do
286 forM_ childCs \childC -> do
287 let childCI = 1 + Map.findIndex childC (parentRCchildRWC Map.! childR)
288 let parentRCB = mconcat [parentRB, "c", BS.intDec parentCI]
289 let childRCB = mconcat ["r", BS.intDec childRI, "c", BS.intDec childCI]
290 let labelB = builderQuotedString (showFFloat @Double (Just 2) (fromRational (runProbability weight)) "")
291 let weightB = BS.doubleDec (fromRational (runProbability weight))
292 edges
293 [parentRCB, childRCB]
294 [ ("weight", weightB)
295 , ("label", labelB)
296 , ("fontcolor", "gray60")
297 , ("constraint", "false")
298 ]
299 comments ["Vertically align range nodes"]
300 let rangeLinks =
301 [ "r" <> BS.intDec parentRI
302 | parentRI <- [1 .. Map.size parentRCchildRWC]
303 ]
304 when (1 < List.length rangeLinks) do
305 edges rangeLinks [("weight", "10"), ("style", "invis")]
306 where
307 run :: MT.ReaderT BSh.ShortByteString (MT.Writer BS.Builder) () -> BS.Builder
308 run = MT.execWriter . (`MT.runReaderT` {-indent-} "")
309 block s = do
310 line "{"
311 () <- MT.withReaderT (" " <>) s
312 line "}"
313 line s = do
314 indent <- MT.ask
315 MT.lift $ MT.tell (BS.shortByteString indent <> s <> "\n")
316 indexFrom1M_ xs f = zipWithM_ f xs [1 ..]
317 comments = mapM_ \c -> line $ "// " <> c
318 edges :: [BS.Builder] -> [(BS.Builder, BS.Builder)] -> MT.ReaderT BSh.ShortByteString (MT.Writer BS.Builder) ()
319 edges names as = line $ mconcat (List.intersperse " -> " names) <> attrs as
320 node name as = line $ name <> attrs as
321 attrs as
322 | List.null as = ""
323 | otherwise = "[" <> mconcat (List.intersperse "," [k <> "=" <> v | (k, v) <- as]) <> "]"
324
325
326 builderQuotedString :: String -> BS.Builder
327 builderQuotedString cs = BS.charUtf8 '"' <> foldMap escape cs <> BS.charUtf8 '"'
328 where
329 escape '\\' = BS.charUtf8 '\\' <> BS.charUtf8 '\\'
330 escape '\"' = BS.charUtf8 '\\' <> BS.charUtf8 '\"'
331 escape c = BS.charUtf8 c
332
333 -- newtype Transaction pos = Transaction (Document pos)
334 -- instance Eq (Transaction pos) where
335 -- Transaction x == Transaction y =
336 -- documentRoots x == documentRoots y
337 -- instance Ord (Transaction pos) where
338 -- Transaction x `compare` Transaction y =
339 -- documentRoots x `compare` documentRoots y
340
341 data Foliation = Foliation
342 {
343 }
344
345 data Group date = Group
346 { groupRoots :: Roots
347 , groupPeriod :: (date, date)
348 }
349 type Branch date = [Group date]
350
351 jaccardSimilarity :: Ord.Ord a => Set a -> Set a -> Probability
352 jaccardSimilarity x y =
353 fromJust $
354 probability $
355 fromIntegral (Set.size (Set.intersection x y))
356 % fromIntegral (Set.size (Set.union x y))
357
358 -- data FScore = FScoreAxiom
359
360 -- | https://en.wikipedia.org/wiki/F-score
361 -- L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes
362 -- prédictions de la classe pertinente (bonne précision) en suffisamment grand
363 -- nombre (bon rappel) sur un jeu de données cible. Tout comme la précision et
364 -- le rappel, la F-mesure varie de 0 (plus mauvaise valeur) à 1 (meilleure valeur possible).
365 --
366 -- @(`fScore` lambda)@
367 -- determines whether a given branch should continue to be divided or not at
368 -- the current value of @(delta)@. To that end, the quality score is set up by a
369 -- parameter @(lambda)@.
370 fScore ::
371 Eq.Eq date =>
372 MonadThrow m =>
373 -- | Trade-off between `precision` and `recall`.
374 -- See https://en.wikipedia.org/wiki/Precision_and_recall
375 --
376 -- > [It] predetermine[s] the desired shape of the phylomemy: a continent
377 -- > (i.e., one large branch) or an archipelago of elements of knowledge
378 -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s
379 -- > discretion in light of her own expertise and research questions, which makes
380 -- > any phylomemy an artifact of the researcher’s perception
381 --
382 -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts.
383 lambda ::: Probability ->
384 x ::: Root ->
385 -- | periods
386 [(date, date)] ->
387 -- | the set of all the fields of the branch bk
388 bk ::: [Group date] ->
389 [[Group date]] ->
390 m Probability
391 fScore (Named lambda) (Named x) periods (Named bk) bx
392 | lambda Eq.== proba0 = precision x periods bk
393 | lambda Eq.== proba1 = recall x bk bx
394 | otherwise = do
395 reca <- runProbability <$> recall x bk bx
396 prec <- runProbability <$> precision x periods bk
397 let
398 denom = reca + fl * fl * prec
399 l = either undefined id $ toBoundedRealFloat @Double $ Decimal.toScientificDecimal lambda
400 fl = toRational (tan (l * pi Prelude./ 2))
401 if denom Eq.== 0
402 then return proba0
403 else probability $ ((1 + fl * fl) * prec * reca) Prelude./ denom
404
405 -- | @(`precision` x ps bk)@ returns the probability to observe @(x)@
406 -- by choosing at random a cluster in @(bk)@ within the periods @(ps)@.
407 -- precision = TP / (TP+FP)
408 -- Precision is the fraction of true positive examples among the examples that
409 -- the model classified as positive.
410 -- In other words, the number of true positives divided by
411 -- the number of false positives plus true positives.
412 precision :: Eq.Eq date => MonadThrow m => Root -> [(date, date)] -> Branch date -> m Probability
413 precision x periods bk =
414 probability $
415 fromIntegral (List.length (List.filter (\g -> List.elem x (groupRoots g)) bk'))
416 % fromIntegral (List.length bk')
417 where
418 bk' = List.filter (\g -> List.elem (groupPeriod g) periods) bk
419
420 -- | sensitivity.
421 -- The recall 𝜌𝑘𝑥 = | | 𝑘 of 𝐵𝑘 against 𝑥. It is related to the probability to be in 𝐵𝑘 when choosing a cluster 𝑥 about 𝑥 at random in 𝜙.
422 -- recall = TP / (TP+FN)
423 -- Recall, also known as sensitivity, is the fraction of examples classified as positive,
424 -- among the total number of positive examples.
425 -- In other words, the number of true positives divided by
426 -- the number of true positives plus false negatives.
427 recall :: MonadThrow m => Root -> Branch date -> [Branch date] -> m Probability
428 recall x bk bx =
429 probability $
430 fromIntegral (List.length (List.filter (\g -> Set.member x (groupRoots g)) bk))
431 % fromIntegral (List.length (List.filter (\g -> Set.member x (groupRoots g)) (List.concat bx)))