]> Git — Sourcephile - julm/worksheets.git/blob - src/Language/Chinese.hs
update
[julm/worksheets.git] / src / Language / Chinese.hs
1 {-# LANGUAGE OverloadedLists #-}
2 {-# LANGUAGE OverloadedStrings #-}
3
4 module Language.Chinese where
5
6 import Control.Applicative (Alternative (..))
7 import Control.Monad.ST qualified as ST
8 import Control.Monad.Trans.State qualified as MT
9 import Data.Aeson qualified as JSON
10 import Data.Aeson.Key qualified as JSON.DictKey
11 import Data.Aeson.KeyMap qualified as JSON
12 import Data.ByteString (ByteString)
13 import Data.ByteString qualified as ByteString
14 import Data.ByteString.Builder qualified as BS
15 import Data.ByteString.Char8 qualified as ByteString.Char8
16 import Data.Char qualified as Char
17 import Data.Csv ((.!))
18 import Data.Csv qualified as CSV
19 import Data.Csv.Incremental qualified as CSV.Incremental
20 import Data.List qualified as List
21 import Data.Map.Strict qualified as Map
22 import Data.STRef qualified as ST
23 import Data.Set qualified as Set
24 import Data.Text qualified as Text
25 import Data.Text.Short qualified as ShortText
26 import Graph.DOT qualified as DOT
27 import Numeric.Decimal.BoundedArithmetic (arithError, arithM)
28 import Paths_worksheets qualified as Self
29 import System.Exit qualified as Sys
30 import System.FilePath qualified as Sys
31 import System.FilePath.Posix ((</>))
32 import System.FilePath.Posix qualified as File
33 import System.IO qualified as Sys
34 import Text.Read (read, readMaybe)
35 import Worksheets.Utils.HTML (className, classes, styles, (!))
36 import Worksheets.Utils.HTML qualified as HTML
37 import Prelude (error, fromIntegral, undefined, (/))
38
39 -- import Text.Blaze
40 import Text.Blaze.Html5 qualified as HTML
41 import Text.Blaze.Html5.Attributes qualified as HA
42 import Text.Blaze.Renderer.Utf8 qualified as Blaze
43
44 import Language.Pronunciation qualified as Pron
45 import Worksheets.Utils.Char qualified as Char
46 import Worksheets.Utils.JSON
47 import Worksheets.Utils.Prelude
48 import Worksheets.Utils.Probability
49
-- | A Chinese dictionary: maps a key (a single character or a
-- multi-character word, as 'ShortText') to everything known about it.
newtype ChineseDict = ChineseDict (Map ShortText ChineseDictEntries)
  deriving (Show)

-- | Unwrap the underlying 'Map'.
unChineseDict (ChineseDict x) = x

-- | Dictionaries merge key-wise; entries sharing a key are merged with
-- the 'Semigroup' instance of 'ChineseDictEntries'.
instance Semigroup ChineseDict where
  ChineseDict x <> ChineseDict y =
    ChineseDict (Map.unionWithKey merge x y)
    where
      -- Bang patterns force both entries before combining them.
      merge _k !xV !yV = xV <> yV
58
-- | Histogram of key lengths: maps each key length (in characters)
-- to the number of dictionary entries whose key has that length.
keyLengthToSumOfEntries :: ChineseDict -> Map Natural Natural
keyLengthToSumOfEntries (ChineseDict d) =
  Map.fromListWith (+) (countOne <$> Map.keys d)
  where
    -- Each key contributes 1 to the bucket of its own length.
    countOne k = (fromIntegral (ShortText.length k), 1)
67
-- | Total of all recorded frequencies in the dictionary, as a 'Double'.
-- Entries without a recorded frequency are skipped.
chineseFrequencySum :: ChineseDict -> Double
chineseFrequencySum (ChineseDict d) =
  fromRational @Double $
    runProbability $
      arithError $
        sum
          [ pure freq
          | entry <- Map.elems d
          , freq <- maybeToList (chineseFrequency entry)
          ]
78
-- | The empty dictionary is the unit of the key-wise merge.
instance Monoid ChineseDict where
  mempty = ChineseDict mempty
81
-- FIXME: use a sum type Char/Word
-- | Everything gathered about one dictionary key, merged from the
-- various data sources.
data ChineseDictEntries = ChineseDictEntries
  { chinesePinyins :: ![ShortText]
  -- ^ Numbered pinyin syllables (e.g. @"ma3"@), one per character.
  , -- TODO: , chineseIPA :: !IPA.IPA
    chineseEnglish :: ![ShortText]
  -- ^ English glosses.
  , chineseFrequency :: !(Maybe Probability)
  -- ^ Usage frequency, when a source provides one.
  , chineseComponents :: !(Maybe [Char])
  -- ^ Graphical decomposition into component characters, when known.
  , chineseStrokes :: !(Maybe Natural)
  -- ^ FIXME: use Natural
  }
  deriving (Generic, Show)

-- | Decomposition of @key@: its recorded 'chineseComponents' when
-- present; otherwise a multi-character key decomposes into its
-- characters, and a lone character decomposes to nothing.
chineseDecomp :: ShortText -> ChineseDictEntries -> [Char]
chineseDecomp key ChineseDictEntries{chineseComponents} =
  chineseComponents
    & fromMaybe (if 1 < ShortText.length key then ShortText.unpack key else [])
97
-- | Merge two entries for the same key: pinyins are left-biased,
-- glosses concatenate, frequency takes the first available, and the
-- decomposition/stroke sources must agree when both are present.
instance Semigroup ChineseDictEntries where
  x <> y =
    ChineseDictEntries
      { chinesePinyins = mergePinyins (chinesePinyins x) (chinesePinyins y)
      , chineseEnglish = chineseEnglish x <> chineseEnglish y
      , chineseFrequency = chineseFrequency x <|> chineseFrequency y
      , chineseComponents = mergeAgreeing "chineseComponents" (chineseComponents x) (chineseComponents y)
      , chineseStrokes = mergeAgreeing "chineseStrokes" (chineseStrokes x) (chineseStrokes y)
      }
    where
      -- Prefer the left operand's pinyins; fall back to the right's
      -- only when the left has none (mismatches keep the left).
      mergePinyins ps qs
        | null ps = qs
        | otherwise = ps
      -- Combine two optional values that must agree when both known.
      mergeAgreeing fieldName mA mB = case (mA, mB) of
        (Nothing, _) -> mB
        (_, Nothing) -> mA
        (Just a, Just b)
          | a == b -> mA
          | otherwise ->
              error $ "Semigroup: " <> fieldName <> ": mismatch: " <> show a <> " <> " <> show b
-- | The neutral entry: no information recorded at all.
instance Monoid ChineseDictEntries where
  mempty =
    ChineseDictEntries
      { chinesePinyins = mempty
      , chineseEnglish = mempty
      , chineseFrequency = Nothing
      , chineseComponents = Nothing
      , chineseStrokes = Nothing
      }
148
-- | Read per-word frequencies from the LoachWang JSON data file:
-- each key of the JSON object becomes a dictionary key whose
-- 'chineseFrequency' is its associated number.
readLoachWangFreq :: IO ChineseDict
readLoachWangFreq = do
  readJSON "data/langs/mandarin/LoachWang/word_freq.json" $
    JSON.withObject "WordFreq" \kv ->
      ChineseDict
        . Map.fromAscList
        <$> forM (kv & JSON.toMap & Map.toList) \(k, v) ->
          v & JSON.withScientific "Frequency" \freqSci -> do
            -- NOTE(review): 'arithError' turns an out-of-range
            -- frequency into a runtime error — confirm inputs are in [0,1].
            let freq = freqSci & toRational & probability & arithError
            return
              ( k & JSON.DictKey.toShortText
              , mempty{chineseFrequency = Just freq}
              )
162
-- | Read per-character stroke counts from the LoachWang JSON data file.
-- NOTE(review): currently unused by 'readChineseDict' (commented out
-- there); the CHISE data is used for strokes instead.
readLoachWangStrokes :: IO ChineseDict
readLoachWangStrokes =
  readJSON ("data/langs/mandarin/LoachWang/char_strokes.json") $
    JSON.withObject "WordFreq" \kv ->
      ChineseDict
        . Map.fromAscList
        <$> forM (kv & JSON.toMap & Map.toList) \(k, v) -> do
          strokes <- JSON.parseJSON v
          return
            ( k & JSON.DictKey.toShortText
            , mempty{chineseStrokes = Just strokes}
            )
175
-- | Circled-number characters ① to ⑳, used by the CHISE IDS data to
-- stand for unencoded components; they get special benefit/cost
-- treatment in 'dictToWeights'.
circledNumbers :: Set ShortText
circledNumbers = ["①", "②", "③", "④", "⑤", "⑥", "⑦", "⑧", "⑨", "⑩", "⑪", "⑫", "⑬", "⑭", "⑮", "⑯", "⑰", "⑱", "⑲", "⑳"]
178
-- | Read per-character stroke counts from the CHISE @ucs-strokes.txt@
-- data file. Each line is tab-separated:
-- @U+XXXX \<char\> \<count\>[,\<count\>…]@; lines not matching that shape
-- are skipped.
readChiseStrokes :: IO ChineseDict
readChiseStrokes =
  withDataFile "data/langs/mandarin/cjkvi-ids/ucs-strokes.txt" \dataHandle -> do
    let loop accDict@(ChineseDict !acc) = do
          isEOF <- dataHandle & Sys.hIsEOF
          if isEOF
            then return accDict
            else do
              line <- ByteString.Char8.hGetLine dataHandle
              let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
              let splitOnChar c = ByteString.split (fromIntegral (fromEnum c))
              case line & splitOnChar '\t' of
                [unicodeLit, decodeUtf8 -> unicodeChar, chineseStrokesLit]
                  | unicodeLit & ByteString.isPrefixOf "U+" ->
                      loop $
                        ChineseDict $
                          acc & Map.insert unicodeChar mempty{chineseStrokes}
                  where
                    chineseStrokes :: Maybe Natural =
                      chineseStrokesLit
                        & splitOnChar ','
                        <&> ( decodeUtf8
                                >>> ShortText.unpack
                                >>> readMaybe @Natural
                            )
                        -- CorrectnessWarning: conservative bias toward the greatest number of strokes.
                        -- Fixed: 'List.maximum' is partial and crashed on an
                        -- empty strokes field; a total fold yields Nothing instead
                        -- ('Nothing' is the minimum of 'Maybe', so nonempty
                        -- inputs give the same result as before).
                        & foldr max Nothing
                _ -> loop accDict
    loop mempty
209
-- | Read character decompositions from the CHISE IDS (Ideographic
-- Description Sequence) data file into 'chineseComponents'.
-- Each line is tab-separated: @U+XXXX \<char\> \<decomp\>[\[codes\]]…@;
-- lines not matching that shape are skipped.
readChiseDecomp :: IO ChineseDict
readChiseDecomp =
  withDataFile "data/langs/mandarin/cjkvi-ids/ids.txt" $ (`loop` mempty)
  where
    loop dataHandle accDict@(ChineseDict !acc) = do
      isEOF <- dataHandle & Sys.hIsEOF
      if isEOF
        then return accDict
        else do
          line <- ByteString.Char8.hGetLine dataHandle
          let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
          let splitOnChar c = ByteString.split (fromIntegral (fromEnum c))
          case line & splitOnChar '\t' of
            unicodeLit : (decodeUtf8 -> unicodeChar) : decomps
              | unicodeLit & ByteString.isPrefixOf "U+" ->
                  let
                    -- Each decomposition, paired with its optional
                    -- trailing source codes (the "[…]"-suffixed letters).
                    decompWithCodes :: [([Char], Set Char)] =
                      decomps <&> \decomp ->
                        case decomp & ByteString.stripSuffix "]" of
                          Nothing ->
                            ( decomp & decodeUtf8 & ShortText.unpack
                            , mempty
                            )
                          Just decompWithCode ->
                            ( decompChars
                                & ShortText.unpack
                                -- ExplanationNote: remove the final '['
                                & List.init
                            , decompCodes & ShortText.unpack & fromList
                            )
                            where
                              (decompChars, decompCodes) =
                                decompWithCode
                                  & decodeUtf8
                                  & ShortText.breakEnd (== '[')
                    chineseComponents =
                      -- FIXME: CorrectnessWarning: maybe select using the Codes
                      -- instead of just taking the first.
                      -- Fixed: 'List.head' was partial; a line with no
                      -- decomposition now yields no components instead of crashing.
                      case decompWithCodes of
                        [] -> Nothing
                        (firstDecomp, _codes) : _ ->
                          firstDecomp
                            -- ExplanationNote: remove IDCs (Ideographic Description Characters)
                            -- namely "⿰" (U+2FF0) to "⿻" (U+2FFB).
                            & List.filter (\c -> not ('\x2FF0' <= c && c <= '\x2FFB'))
                            -- Drop self-referential components.
                            & List.filter (\c -> ShortText.singleton c /= unicodeChar)
                            & Just
                  in
                    loop dataHandle $
                      ChineseDict $
                        acc & Map.insert unicodeChar mempty{chineseComponents}
            _ -> loop dataHandle accDict
261
-- | Assemble the full dictionary by reading and merging all enabled
-- data sources; 'fold' sequences the 'IO' actions and merges their
-- results with the key-wise 'Semigroup' of 'ChineseDict'.
readChineseDict :: IO ChineseDict
readChineseDict =
  fold $
    list
      [ return extraEntries
      , readLoachWangFreq
      , readChiseStrokes
      , readChiseDecomp
      , -- , readChineseLexicalDatabase
        readCEDICT
        -- , readLoachWangStrokes
        -- , readOutlierDecomp
      ]
-- | Hand-maintained entries supplementing the external data sources:
-- the middle dot, a few species/botany words, the ASCII digits
-- (with pinyins of their Chinese readings), and single Latin letters
-- and symbols given a nominal stroke count of 1.
extraEntries :: ChineseDict
extraEntries =
  ChineseDict $
    [
      [
        ( "·"
        , mempty
            { chineseStrokes = Just 1
            , chineseComponents = Just ['·']
            }
        )
      ,
        ( "库蠓属"
        , mempty
            { chinesePinyins = ["ku4", "meng3", "shi4"]
            , chineseEnglish = ["culicoides"]
            }
        )
      ,
        ( "瘦果"
        , mempty
            { chinesePinyins = ["shou4", "guo3"]
            , chineseEnglish = ["achene"]
            }
        )
      ,
        ( "美洲野马"
        , mempty
            { chinesePinyins = ["mei3", "zhou1", "ye3", "ma3"]
            , chineseEnglish = ["mustang horse"]
            }
        )
      ,
        ( "钩粉蝶"
        , mempty
            { chinesePinyins = ["gou1", "fen3", "die2"]
            , chineseEnglish = ["brimstone butterfly"]
            }
        )
      ,
        ( "核果"
        , mempty
            { chinesePinyins = ["he2", "guo3"]
            , chineseEnglish = ["drupe"]
            }
        )
      ,
        ( "0"
        , mempty
            { chineseStrokes = Just 14
            , chinesePinyins = ["ling2"] -- 零
            }
        )
      ,
        ( "1"
        , mempty
            { chineseStrokes = Just 2
            , chinesePinyins = ["yi1"]
            }
        )
      ,
        ( "2"
        , mempty
            { chineseStrokes = Just 2
            , chinesePinyins = ["er4"]
            }
        )
      ,
        ( "3"
        , mempty
            { chineseStrokes = Just 2
            , chinesePinyins = ["san1"]
            }
        )
      ,
        ( "4"
        , mempty
            { chineseStrokes = Just 2
            , chinesePinyins = ["si4"]
            }
        )
      ,
        ( "5"
        , mempty
            { chineseStrokes = Just 2
            , chinesePinyins = ["wu3"]
            }
        )
      ,
        ( "6"
        , mempty
            { chineseStrokes = Just 1
            , chinesePinyins = ["liu4"]
            }
        )
      ,
        ( "7"
        , mempty
            { chineseStrokes = Just 1
            , chinesePinyins = ["qi1"]
            }
        )
      ,
        ( "8"
        , mempty
            { chineseStrokes = Just 2
            , chinesePinyins = ["ba1"]
            }
        )
      ,
        ( "9"
        , mempty
            { chineseStrokes = Just 1
            , chinesePinyins = ["jiu3"]
            }
        )
      ]
    , -- Single Latin letters and a few symbols, so they never block a
      -- lookup: nominal stroke count 1, no pinyin.
      [ ( letter & ShortText.singleton
        , mempty
            { chineseStrokes = Just 1
            , chinesePinyins = []
            }
        )
      | letter <-
          [ ['A' .. 'Z']
          , ['a' .. 'z']
          , ['π', '□', '○']
          ]
            & mconcat
      ]
    ]
      & mconcat
      & Map.fromList
408
-- | Read the LoachWang word-order JSON file as a raw JSON array.
readLoachWangWordOrder :: IO JSON.Array
readLoachWangWordOrder =
  readJSON "data/langs/mandarin/LoachWang/loach_word_order.json" $
    JSON.withArray "LoachWordOrder" return
413
-- | Read Outlier-style character decompositions from JSON: each key
-- maps to an array of component strings, concatenated into
-- 'chineseComponents'.
-- NOTE(review): currently unused by 'readChineseDict' (commented out there).
readOutlierDecomp :: IO ChineseDict
readOutlierDecomp =
  readJSON "data/langs/mandarin/LoachWang/outlier_decomp.json" $
    JSON.withObject "OutlierDecomp" \kv ->
      ChineseDict
        . Map.fromAscList
        <$> forM (kv & JSON.toMap & Map.toList) \(k, v) ->
          v & JSON.withArray "Decomp" \decomp ->
            forM
              (decomp & toList)
              ( \sub ->
                  sub & JSON.withText "Atom" \atom ->
                    return $ atom & Text.unpack
              )
              <&> \atoms ->
                ( k & JSON.DictKey.toShortText
                , mempty{chineseComponents = Just $ atoms & mconcat}
                )
432
-- | Invert the decomposition relation: map each component character
-- to the dictionary keys that use it as a component.
reverseDecomp :: ChineseDict -> Map {-def-} ShortText {-refs-} [ShortText]
reverseDecomp (ChineseDict dict) =
  Map.fromListWith
    (<>)
    [ ShortText.singleton component := [key]
    | (key, entry) <- Map.toList dict
    , component <- chineseDecomp key entry
    ]
440
-- | Learning benefit of knowing a key (a probability-derived gain).
type Benefit = Probability
-- | Learning cost of a key (roughly, an accumulated stroke count).
type Cost = Natural
-- | Ordering weight: benefit divided by cost.
type Weight = Probability
type DictKey = ShortText

-- | Benefit and cost computed per dictionary key.
data Weights = Weights
  { weightsMap :: Map DictKey (Benefit, Cost)
  }
  deriving (Show)
449
-- |
-- Compute a learning benefit and cost for every dictionary key,
-- following:
--
-- Zhao, K., zhao, D., Yang, J., Ma, W., & Yu, B. (2019).
-- Efficient Learning Strategy of Chinese Characters Based on
-- Usage Frequency and Structural Relationship.
-- 2019 IEEE 4th International Conference on Big Data Analytics (ICBDA).
-- doi:10.1109/icbda.2019.8713245
-- https://sci-hub.se/10.1109/icbda.2019.8713245
--
-- Benefit is the key's own frequency plus the (cost-scaled) benefits
-- of the keys that use it as a component; cost is the stroke count,
-- summed over components for composite characters and words.
-- NOTE(review): recursion follows the decomposition graph — assumes it
-- is acyclic after the self-reference deletions; a cycle would loop.
dictToWeights :: ChineseDict -> Weights
dictToWeights cn@(ChineseDict dict) = ST.runST do
  -- Explanation: memo tables, to avoid recomputing benefits and costs
  keyToBenefitST :: ST.STRef st (Map DictKey Benefit) <- ST.newSTRef Map.empty
  keyToCostST :: ST.STRef st (Map DictKey Cost) <- ST.newSTRef Map.empty
  -- component -> keys that contain it
  let refsMap :: Map ShortText [ShortText] = reverseDecomp cn
  let
    -- Memoized benefit of one key.
    keyToBenefitLoop :: DictKey -> ST.ST st Benefit
    keyToBenefitLoop key = do
      -- traceShowM ("keyToBenefitLoop"::String, key)
      keyToBenefitMap <- keyToBenefitST & ST.readSTRef
      if Set.member key circledNumbers
        then
          -- Explanation: circledNumbers induce a cost but zero benefit
          return proba0
        else
          keyToBenefitMap
            & Map.lookup key
            & \case
              Just bene -> return bene
              Nothing -> do
                -- Keys referencing this one (excluding itself).
                let refs :: [ShortText] =
                      refsMap
                        & Map.lookup key
                        & fromMaybe []
                        -- & fromMaybe []
                        & List.delete key
                -- traceShowM ("keyToBenefitLoop"::String, key, refs)
                -- Own frequency; absent entries count as zero.
                let freq =
                      dict
                        & Map.lookup key
                        & maybe proba0 (chineseFrequency >>> fromMaybe proba0)
                refsBC <- forM refs \ref -> do
                  bene <- keyToBenefitLoop ref
                  cost <- keyToCostLoop ref
                  return (bene, cost)
                cost <- keyToCostLoop key
                -- benefit = freq + cost * sum(refBenefit / refCost),
                -- skipping zero-cost referrers to avoid division by zero.
                bene <-
                  arithM $
                    probability $
                      runProbability freq
                        + toRational cost
                          * sum
                            [ runProbability b / toRational c
                            | (b, c) <- refsBC
                            , c /= 0
                            ]
                ST.modifySTRef keyToBenefitST $ Map.insert key bene
                return bene
    -- Memoized cost of one key.
    keyToCostLoop :: DictKey -> ST.ST st Cost
    keyToCostLoop key = do
      -- traceShowM ("keyToCostLoop"::String, key)
      keyToCostMap <- keyToCostST & ST.readSTRef
      if Set.member key circledNumbers
        then
          -- Explanation: circledNumbers induce a cost given by their number
          return $ dict & Map.lookup key & maybe 1 (chineseStrokes >>> fromMaybe 1)
        else
          keyToCostMap
            & Map.lookup key
            & \case
              Just cost -> return cost
              Nothing -> do
                let entry = dict & Map.lookup key & fromMaybe mempty
                cost <-
                  -- single char
                  if ShortText.length key == 1
                    then do
                      let deps =
                            entry
                              & chineseDecomp key
                              & List.delete (ShortText.unpack key & List.head)
                      -- & (`Set.difference` (ShortText.unpack key & Set.fromList))
                      if null deps
                        then -- atom char: its own stroke count (default 1)
                          return $
                            entry
                              & chineseStrokes
                              & fromMaybe 1
                        else -- & fromMaybe (error $ "dictToWeights: missing chineseStrokes for: " <> ShortText.unpack key)
                        do
                          -- composite char: sum of component costs
                          -- traceShowM ("composite"::String, show key, show <$> Set.toList deps
                          -- , deps & (`Set.difference` (ShortText.unpack key & Set.fromList))
                          -- )
                          deps
                            & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton))
                            <&> getSum
                    else -- word: sum of per-character costs
                    do
                      -- traceShowM ("word"::String, key)
                      key
                        & ShortText.unpack
                        & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton))
                        <&> getSum
                ST.modifySTRef keyToCostST $ Map.insert key cost
                return cost
  weightsMap <-
    dict & Map.traverseWithKey \key _entry -> do
      -- traceShowM ("dictToWeights"::String, key)
      bene <- keyToBenefitLoop key
      cost <- keyToCostLoop key
      return (bene, cost)
  return
    Weights
      { weightsMap
      }
564
-- | Keys grouped by weight, in decreasing weight order ('Down').
newtype ChineseOrder = ChineseOrder (Map (Down Weight) Weights)

-- | Unwrap the underlying 'Map'.
unChineseOrder (ChineseOrder x) = x

-- | Order keys by weight = benefit / cost (highest first); keys that
-- share a weight are grouped into one 'Weights'.
-- NOTE(review): a zero 'Cost' would make @runProbability b / toRational c@
-- divide by zero — confirm 'dictToWeights' never yields cost 0.
dictOrder :: Weights -> ChineseOrder
dictOrder w =
  [ ( probability (runProbability b / toRational c) & arithError & Down
    , Weights
        { weightsMap = Map.singleton k v
        }
    )
  | (k, v@(b, c)) <- w & weightsMap & Map.toList
  ]
    -- Keys are unique within a 'Weights', so 'unionWith' never needs
    -- its combining function; 'error' documents that invariant.
    & Map.fromListWith (\x y -> Weights{weightsMap = Map.unionWith (error "dictOrder") (weightsMap x) (weightsMap y)})
    & ChineseOrder
578
579 dictOrderIO = readChineseDict <&> (dictToWeights >>> dictOrder)
580
-- order100 :: IO [(DictKey, Maybe ChineseDictEntries)]
-- | Debug helper: the keys of the 100 highest-weighted groups, each
-- paired with its pinyins and English glosses when the dictionary has
-- an entry for it.
order100 = do
  dict@(ChineseDict d) <- readChineseDict
  let ChineseOrder o = dict & dictToWeights & dictOrder
  return $
    o
      & Map.take 100
      & Map.elems
      & foldMap (weightsMap >>> Map.keys)
      <&> \k -> (k, d & Map.lookup k <&> \r -> (chinesePinyins r, chineseEnglish r))
591
-- | Debug helper: group weight buckets by how many keys share the same
-- weight (left-biased 'Map' union merges buckets of equal size).
dictOrderBySameWeight (ChineseOrder o) =
  [ ( Map.size (weightsMap v)
    , Map.singleton k v
    )
  | (k, v) <- o & Map.toList
  ]
    & Map.fromListWith (<>)
599
600 {-
601 data UnfoldedGraph node lbl edge = UnfoldedGraph
602 { unfoldedVertexToEdges :: !(Array.Array Vertex (lbl, Map edge IntSet))
603 , unfoldedNodeToVertex :: !(Map node Vertex)
604 }
605 deriving (Show, Generic)
606 unfoldGMany ::
607 forall node edge nodeLabel.
608 Ord node =>
609 (node -> (nodeLabel, Map edge (Set node))) ->
610 [node] ->
611 UnfoldedGraph node nodeLabel edge
612 unfoldGMany gen roots = ST.runST do
613 nodeToVertexST :: ST.STRef st (Map node Vertex) <- ST.newSTRef Map.empty
614 vertexToEdgesST :: ST.STRef st [(Vertex, (nodeLabel, Map edge IntSet))] <- ST.newSTRef []
615 lastVertexST <- ST.newSTRef firstVertex
616 let newVertex = do
617 v <- ST.readSTRef lastVertexST
618 ST.writeSTRef lastVertexST (v + 1)
619 return v
620 let
621 nodeToVertexDepthFirst :: node -> ST.ST st Vertex
622 nodeToVertexDepthFirst src =
623 nodeToVertexST
624 & ST.readSTRef
625 <&> Map.lookup src
626 >>= \case
627 -- DescriptionNote: `src` has already been seen,
628 -- just return its unique `Vertex`.
629 Just v -> return v
630 -- DescriptionNote: `src` has never been seen yet,
631 -- allocate a `newVertex` for it
632 Nothing -> do
633 dst <- newVertex
634 ST.modifySTRef nodeToVertexST $ Map.insert src dst
635 let (nodeLabel, edgeToDsts) = gen src
636 edgeToDstsV :: Map edge IntSet <-
637 edgeToDsts & Map.traverseWithKey \_edge dsts ->
638 forM (dsts & Set.toList) nodeToVertexDepthFirst
639 <&> IntSet.fromList
640 let res = (dst, (nodeLabel, edgeToDstsV))
641 ST.modifySTRef vertexToEdgesST (res :)
642 return dst
643 forM_ roots nodeToVertexDepthFirst
644 nodeToVertex <- ST.readSTRef nodeToVertexST
645 vertexToEdges <- ST.readSTRef vertexToEdgesST
646 lastId <- ST.readSTRef lastVertexST
647 return
648 UnfoldedGraph
649 { unfoldedVertexToEdges = vertexToEdges & Array.array (firstVertex, lastId - 1)
650 , unfoldedNodeToVertex = nodeToVertex
651 }
652 where
653 firstVertex :: Vertex = 0
654 -}
655
656 -- mostComp d =
657 -- [
658 -- | d & reverseDecomp & Map.toList
659 -- ] & Map.fromListWith (<>)
660
-- | The five Mandarin tones; 'ChineseTone5' is the neutral tone
-- (no diacritic). 'Enum' maps 'ChineseTone1' to 0.
data ChineseTone
  = ChineseTone1
  | ChineseTone2
  | ChineseTone3
  | ChineseTone4
  | ChineseTone5
  deriving (Eq, Ord, Show, Enum)
668
-- | Convert a numbered pinyin syllable (e.g. @"ma3"@, @"lu:3"@, @"lv3"@)
-- to diacritic pinyin (e.g. @"mǎ"@, @"lǚ"@).
--
-- Standard placement rules: the tone mark goes on @a@ or @e@ when
-- present, on the @o@ of @ou@, otherwise on the last vowel.
-- (NOTE(review): "Piniyn" in the name is a typo, kept for callers.)
numberedPinyinToDiacriticPiniyn :: ShortText -> ShortText
numberedPinyinToDiacriticPiniyn numPin =
  case pinTone of
    Nothing -> pinRoot
    Just tone ->
      pinRoot
        & ShortText.unpack
        & addUmlaut
        & setDiacritic
        & ShortText.pack
      where
        -- "u:" and "v" are the conventional ASCII spellings of 'ü'.
        addUmlaut ('u' : ':' : cs) = 'ü' : addUmlaut cs
        addUmlaut ('v' : cs) = 'ü' : addUmlaut cs
        addUmlaut ('U' : ':' : cs) = 'Ü' : addUmlaut cs
        addUmlaut ('V' : cs) = 'Ü' : addUmlaut cs
        addUmlaut (c : cs) = c : addUmlaut cs
        addUmlaut [] = []
        -- CorrectnessNote: a, e or the o in ou always get the marker
        setDiacritic (c@'a' : cs) = convert tone c : cs
        setDiacritic (c@'e' : cs) = convert tone c : cs
        setDiacritic (c@'o' : 'u' : cs) = convert tone c : 'u' : cs
        setDiacritic (c : cs)
          -- CorrectnessNote: if no a, e, or ou found, the tone mark goes on the last vowel
          | List.elem c vowels && all (`List.notElem` vowels) cs = convert tone c : cs
          | otherwise = c : setDiacritic cs
        setDiacritic [] = []
        vowels = lowerVowels <> (Char.toUpper <$> lowerVowels)
        lowerVowels = ['a', 'e', 'i', 'o', 'u', 'ü'] & list
  where
    (pinRoot, pinTone) ::
      (ShortText, Maybe ChineseTone) =
        numPin & ShortText.spanEnd Char.isDigit & second \case
          "" -> Nothing
          -- NOTE(review): 'read' is safe here ('spanEnd isDigit'
          -- guarantees digits), but 'toEnum' still errors on a tone
          -- number outside 1..5 — confirm inputs.
          ds -> ds & ShortText.unpack & read & (\x -> x - (1 :: Int)) & toEnum & Just
    -- Put the tone diacritic on one vowel.
    -- Fixed: the former case ladder ended in @_ -> undefined@ (partial);
    -- this table-driven version is total and returns the character
    -- unchanged when no diacritic applies.
    convert :: ChineseTone -> Char -> Char
    convert ChineseTone5 c = c -- neutral tone: no mark
    convert t c =
      case List.lookup c tonedVowels of
        Nothing -> c
        Just marks ->
          -- 'fromEnum t' indexes tones 1..4 as 0..3.
          case marks & List.drop (fromEnum t) of
            m : _ -> m
            [] -> c
      where
        -- For each vowel, its forms under tones 1, 2, 3, 4.
        tonedVowels :: [(Char, [Char])]
        tonedVowels =
          [ ('a', "āáǎà")
          , ('e', "ēéěè")
          , ('i', "īíǐì")
          , ('o', "ōóǒò")
          , ('u', "ūúǔù")
          , ('ü', "ǖǘǚǜ")
          , ('A', "ĀÁǍÀ")
          , ('E', "ĒÉĚÈ")
          , ('I', "ĪÍǏÌ")
          , ('O', "ŌÓǑÒ")
          , ('U', "ŪÚǓÙ")
          , ('Ü', "ǕǗǙǛ")
          ]
766
-- | HSK 3.0 proficiency levels 1 to 6; 'fromEnum' + 1 gives the level
-- number used in the data-file names (see 'readHSK').
data HskLevel
  = HskLevel301
  | HskLevel302
  | HskLevel303
  | HskLevel304
  | HskLevel305
  | HskLevel306
  deriving (Eq, Ord, Enum, Show)
775
776 -- instance CSV.FromRecord ChineseDictEntries
777 -- instance CSV.ToRecord ChineseDictEntries
778 -- instance CSV.FromNamedRecord ChineseDictEntries
779 -- instance CSV.ToNamedRecord ChineseDictEntries
780 -- instance CSV.DefaultOrdered ChineseDictEntries
781
-- | Feed the next chunk (at most 4096 bytes) of a file to an
-- incremental-CSV continuation; feeds the empty string at end of file
-- to signal completion.
feed :: (ByteString -> r) -> Sys.Handle -> Sys.IO r
feed k csvFile = do
  isEOF <- Sys.hIsEOF csvFile
  if isEOF
    then return (k "")
    else k <$> ByteString.hGetSome csvFile 4096
787
-- | Read one HSK level's word list from its CSV file
-- (columns: word, space-separated pinyins, English gloss).
-- Rows with the wrong column count are skipped; CSV-level failures
-- abort the program.
readHSK :: HskLevel -> IO ChineseDict
readHSK chineseHSK = do
  withDataFile ("data/langs/mandarin/HSK/hsk" <> show hskIndex Sys.<.> "csv") \fileHandle -> do
    loop fileHandle mempty $
      CSV.Incremental.decodeWithP parser decodeOpts CSV.NoHeader
  where
    -- Levels are numbered from 1 in the file names.
    hskIndex = chineseHSK & fromEnum & (+ 1)
    decodeOpts = CSV.defaultDecodeOptions
    parser :: CSV.Record -> CSV.Parser (ShortText, ChineseDictEntries)
    parser v
      | length v == 3 = do
          chinese <- v .! 0
          chinesePinyins <- v .! 1 <&> ShortText.split (== ' ')
          chineseEnglish <- v .! 2 <&> pure
          let chineseFrequency = Nothing
          let chineseComponents = Nothing
          let chineseStrokes = Nothing
          pure (chinese, ChineseDictEntries{..})
      | otherwise = empty
    -- Per-row failures are printed and skipped.
    check =
      either (\x -> Sys.print x >> return mempty) \(chinese, e) ->
        return $ ChineseDict $ Map.singleton chinese e
    -- Pump the incremental decoder, accumulating a dictionary.
    loop fileHandle !acc = \case
      CSV.Incremental.Fail _ errMsg -> do
        Sys.putStrLn errMsg
        Sys.exitFailure
      CSV.Incremental.Many rs k -> do
        ok <- rs & foldMapM check
        t <- feed k fileHandle
        loop fileHandle (acc <> ok) t
      CSV.Incremental.Done rs -> do
        ok <- rs & foldMapM check
        return (acc <> ok)
821
-- | Sun, C.C., Hendrix, P., Ma, J. et al.
-- Chinese lexical database (CLD).
-- Behav Res 50, 2606–2629 (2018).
-- https://doi.org/10.3758/s13428-018-1038-3
--
-- Reads word, length, per-character pinyins and frequency-per-million
-- from the CLD CSV (column indices are hard-coded against v2.1).
readChineseLexicalDatabase :: IO ChineseDict
readChineseLexicalDatabase = do
  withDataFile "data/langs/mandarin/chineselexicaldatabase2.1.csv" \fileHandle -> do
    loop fileHandle mempty $
      CSV.Incremental.decodeWithP parser decodeOpts CSV.HasHeader
  where
    decodeOpts = CSV.defaultDecodeOptions
    parser :: CSV.Record -> CSV.Parser (DictKey, ChineseDictEntries)
    parser v
      | length v == 269 = do
          key <- v .! 0
          keyLength :: Int <- v .! 5
          pinyin1 <- v .! 15
          pinyin2 <- v .! 16
          pinyin3 <- v .! 17
          pinyin4 <- v .! 18
          -- Only the word's first 'keyLength' pinyin columns are used.
          let chinesePinyins =
                [pinyin1, pinyin2, pinyin3, pinyin4]
                  & List.take keyLength
          freqPerMillion :: Double <- v .! 39
          -- Normalize per-million counts to a probability.
          let chineseFrequency =
                freqPerMillion
                  & toRational
                  & (/ 1000000)
                  & probability
                  & fromMaybe (error "readChineseLexicalDatabase: Frequency")
                  & Just
          pure (key, mempty{chineseFrequency, chinesePinyins})
      | otherwise = error $ "readChineseLexicalDatabase: wrong number of columns: " <> show v
    -- Per-row failures are printed and skipped.
    check =
      either (\x -> Sys.print x >> return mempty) \(key, val) ->
        return $ ChineseDict $ Map.singleton key val
    -- Pump the incremental decoder, accumulating a dictionary.
    loop fileHandle !acc = \case
      CSV.Incremental.Fail _ errMsg -> do
        Sys.putStrLn errMsg
        Sys.exitFailure
      CSV.Incremental.Many rs k -> do
        ok <- rs & foldMapM check
        t <- feed k fileHandle
        loop fileHandle (acc <> ok) t
      CSV.Incremental.Done rs -> do
        ok <- rs & foldMapM check
        return (acc <> ok)
869
-- | Read the CC-CEDICT dictionary file: skip the leading @#@ comment
-- header, then parse each entry line with a small byte-level 'MT.State'
-- parser, keeping the simplified form, the pinyins and the glosses.
readCEDICT :: IO ChineseDict
readCEDICT = do
  withDataFile "data/langs/mandarin/CEDICT/cedict_ts.u8" \cedictHandle -> do
    -- Consume consecutive lines starting with '#'.
    let skipHeader = do
          isEOF <- cedictHandle & Sys.hIsEOF
          if isEOF
            then return ()
            else do
              lineBS <- ByteString.hGetLine cedictHandle
              let lineST = lineBS & ShortText.fromByteString & fromMaybe (error "invalid UTF-8")
              let begin = lineST & ShortText.take 1
              when (begin == "#") do
                skipHeader
    skipHeader
    let loop !acc = do
          isEOF <- cedictHandle & Sys.hIsEOF
          if isEOF
            then return acc
            else do
              line <- ByteString.hGetLine cedictHandle
              let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
              -- DescriptionNote: each line is formatted as: #(.+) (.+) \[(.+)] /(.*)/#'
              -- i.e.: traditional simplified [pinyins] /gloss/gloss/…/
              let skipChar c = void $ MT.state $ ByteString.span (== fromIntegral (fromEnum c))
              let skipPrefix p =
                    MT.modify' $
                      ByteString.stripPrefix p
                        >>> fromMaybe (error $ "skipPrefix fail to match: " <> show p)
              let breakOnChar c = ByteString.break (== fromIntegral (fromEnum c))
              let breakOnSpace = breakOnChar ' '
              let skipSuffix p =
                    MT.modify' $
                      ByteString.stripSuffix p
                        >>> fromMaybe
                          ( error $
                              "skipSuffix: mismatch: "
                                <> show p
                                <> "\n on line: "
                                <> ShortText.unpack (decodeUtf8 line)
                                <> "\n escaped: "
                                <> show (ShortText.unpack (decodeUtf8 line))
                          )
              -- State is the unconsumed remainder of the line.
              let (dict, leftover) = (`MT.runState` line) do
                    _chineseTrad <- MT.state $ breakOnSpace >>> first decodeUtf8
                    skipChar ' '
                    chineseSimpl <- MT.state $ breakOnSpace >>> first decodeUtf8
                    skipChar ' '
                    skipPrefix "["
                    chinesePinyins <-
                      MT.state $
                        breakOnChar ']'
                          >>> first
                            ( \s ->
                                s
                                  & ByteString.split (fromIntegral (fromEnum ' '))
                                  -- CorrectnessWarning: convert pinyin to lowercase, and remove dashes
                                  <&> (decodeUtf8 >>> ShortText.unpack >>> (<&> Char.toLower) >>> ShortText.pack)
                                  & List.filter (/= "-")
                            )
                    skipPrefix "] /"
                    -- CorrectnessNote: some lines do not end with \r
                    -- hence make it optional.
                    MT.modify' \s -> s & ByteString.stripSuffix "\r" & fromMaybe s
                    skipSuffix "/"
                    chineseEnglish <- MT.gets \s -> s & ByteString.split (fromIntegral (fromEnum '/')) <&> decodeUtf8
                    MT.put mempty
                    -- Only the simplified form is kept as the key.
                    let chinese = chineseSimpl
                    return $
                      ChineseDict $
                        Map.singleton chinese $
                          mempty
                            { chinesePinyins
                            , chineseEnglish
                            }
              if not (ByteString.null leftover)
                then error $ "parserLine: leftover: " <> show leftover
                else loop (acc <> dict)
    loop mempty
947
-- | Look up the pinyin syllables of a word.
-- 'error's when the word is absent or has no recorded pinyins.
lookupPinyins :: ChineseDict -> ShortText -> [ShortText]
lookupPinyins (ChineseDict dict) word =
  case Map.lookup word dict of
    Nothing -> error $ "lookupPinyins: no entry for: {" <> wordString <> "}"
    Just entry ->
      case chinesePinyins entry of
        [] -> error $ "lookupPinyins: empty entry for: " <> wordString
        pinyins -> pinyins
  where
    wordString = word & ShortText.unpack
957
-- | Annotate each whitespace-separated word of the input lexemes with
-- its pronunciation: decimal digits and CJK runs get a diacritic-pinyin
-- 'Pron.Pron' (via 'lookupPinyins'), punctuation passes through as
-- 'Left' characters, and anything else aborts via 'errorShow'.
pronunciation :: ChineseDict -> [Pron.Lexeme] -> [[Either Char Pron.Pron]]
pronunciation dict inp =
  inp & Pron.lexemesChars & Text.pack & Text.words <&> wordToPron
  where
    wordToPron :: Text -> [Either Char Pron.Pron]
    wordToPron wordText =
      -- Group the word's characters first by Unicode general category,
      -- then by Unicode block, and pronounce each run.
      [ pronun chars charCateg charBlock
      | (charCateg, uniBlockToChars) <-
          wordText
            & Text.unpack
            & List.map (\c -> (Char.generalCategory c, (Char.unicodeBlock c, c)))
            & Char.consecutiveGroups
            <&> (<&> Char.consecutiveGroups)
      , (charBlock, chars) <- uniBlockToChars
      ]
        & mconcat
      where
        pronun :: [Char] -> Char.GeneralCategory -> Maybe Char.UnicodeBlock -> [Either Char Pron.Pron]
        -- Digits: one 'Pron.Pron' per digit (the dictionary's
        -- 'extraEntries' provide pinyins for "0".."9").
        pronun chars Char.DecimalNumber _ =
          [ Right
              Pron.Pron
                { pronInput = [Pron.LexemeChar char]
                , pronRule =
                    Pron.rule
                      { Pron.rulePron =
                          Pron.Pronunciations
                            { Pron.unPronunciations =
                                [ Pron.RuleLexemes [Pron.LexemeChar char] :=
                                    Pron.Pronunciation
                                      { Pron.pronunciationIPABroad = []
                                      , Pron.pronunciationText = txt char
                                      }
                                ]
                            }
                      }
                }
          | char <- chars
          ]
          where
            txt c =
              c
                & ShortText.singleton
                & lookupPinyins dict
                <&> ( \p ->
                        p
                          & numberedPinyinToDiacriticPiniyn
                          & ShortText.toText
                          & Text.toLower
                    )
                & Text.intercalate " "
        -- Punctuation passes through unpronounced.
        pronun chars Char.OpenPunctuation _ =
          [ Left c
          | c <- chars
          ]
        pronun chars Char.ClosePunctuation _ =
          [ Left c
          | c <- chars
          ]
        pronun chars Char.OtherPunctuation _ =
          [ Left c
          | c <- chars
          ]
        -- A CJK run is pronounced as one word.
        pronun chars _ (Just Char.UnicodeBlockCJK{}) =
          [ Right
              Pron.Pron
                { pronInput = Pron.LexemeChar <$> chars
                , pronRule =
                    Pron.rule
                      { Pron.rulePron =
                          Pron.Pronunciations
                            { Pron.unPronunciations =
                                [ Pron.RuleLexemes (Pron.LexemeChar <$> chars) :=
                                    Pron.Pronunciation
                                      { Pron.pronunciationIPABroad = []
                                      , Pron.pronunciationText = txt
                                      }
                                ]
                            }
                      }
                }
          ]
          where
            txt =
              chars
                & ShortText.pack
                & lookupPinyins dict
                & List.map
                  ( \pinyin ->
                      pinyin
                        & numberedPinyinToDiacriticPiniyn
                        & ShortText.toText
                        & Text.toLower
                  )
                & Text.intercalate " "
        -- if List.length wordPinyins == Text.length wordText = wordPinyins
        -- where
        -- wordPinyins = chars
        -- & ShortText.pack
        -- & lookupPinyins dict
        pronun chars categ block = errorShow ("pronunciation" :: Text, chars, categ, block)
1058
-- | Render the weight-ordered dictionary as an HTML page: one ranked
-- line per key with pinyins, components (cross-linked by rank) and
-- glosses. @limit@ truncates the list to the first @n@ entries.
orderHTML :: Maybe Int -> ChineseDict -> IO _
orderHTML limit dict@(ChineseDict d) = do
  dataPath <- Self.getDataDir <&> File.normalise
  return $ Blaze.renderMarkupBuilder do
    HTML.docTypeHtml do
      HTML.head do
        HTML.title $ ("Chinese Order" <> maybe "" (\l -> " (first " <> show l <> ")") limit) & HTML.toHtml
        forM_
          ( [ "styles/Paper.css"
            , "styles/Rosetta/Common.css"
            , "styles/Rosetta/Reading.css"
            ]
              & list
          )
          \cssFile ->
            HTML.link
              ! HA.rel "stylesheet"
              ! HA.type_ "text/css"
              ! HA.href (dataPath </> cssFile & HTML.toValue)
      -- HTML.styleCSS $ pagesDifficulties & difficultyCSS
      HTML.body do
        -- Ranked keys, best weight first.
        let order =
              dict
                & dictToWeights
                & dictOrder
                & unChineseOrder
                & Map.elems
                & foldMap (weightsMap >>> Map.keys)
                -- Explanation: several entries can share the same 'Weight'
                -- hence compute ranks after merging.
                & ol1
        -- NOTE(review): 'Map.fromListWith undefined' bottoms on a
        -- duplicate key — relies on ranks being unique per key.
        let keyToRank = [(key, rank) | (rank, key) <- order] & Map.fromListWith undefined
        let rankId rank = "entry-" <> HTML.toValue (show rank)
        -- In-page anchor link to an entry by its rank.
        let rankLink rank =
              HTML.a
                ! styles
                  [ "color" := "blue"
                  , "text-decoration" := "none"
                  ]
                ! HA.href ("#" <> rankId rank)
                $ do
                  "#"
                  rank & HTML.toHtml
        HTML.div ! styles ["display" := "flex", "flex-direction" := "column"] $ do
          forM_
            ( order
                & maybe id List.take limit
            )
            \(rank :: Int, k) ->
              forM_ (d & Map.lookup k) \v -> do
                HTML.span
                  ! styles ["text-align" := "justify", "break-inside" := "avoid"]
                  ! HA.id (rankId rank)
                  $ do
                    rankLink rank
                    ". "
                    HTML.span $ k & HTML.toHtml
                    -- Pinyins, when any: "[pin yin] "
                    unless (v & chinesePinyins & List.null) do
                      "["
                      HTML.span $ v & chinesePinyins <&> ShortText.toText & Text.unwords & HTML.toHtml
                      "] "
                    -- Components, when any, each linked to its own rank.
                    unless (v & chineseDecomp k & List.null) do
                      "("
                      let decomps =
                            [ (component, componentRankMaybe) :: (ShortText, Maybe Int)
                            | component <- chineseDecomp k v <&> ShortText.singleton
                            , let componentRankMaybe = keyToRank & Map.lookup component
                            ]
                      HTML.span do
                        decomps
                          <&> ( \(component, componentRankMaybe) -> do
                                  HTML.span do
                                    component & HTML.toHtml
                                    case componentRankMaybe of
                                      Nothing -> ""
                                      Just componentRank
                                        | Set.member component circledNumbers -> ""
                                        | otherwise -> rankLink componentRank
                              )
                          & List.intersperse " "
                          & mconcat
                      ") "
                    -- English glosses, greyed out.
                    HTML.span ! styles ["color" := "#666"] $ do
                      v & chineseEnglish <&> ShortText.toText & Text.intercalate "; " & HTML.toHtml
1143
-- | Render the dictionary as a Graphviz DOT document.
--
-- NOTE(review): currently a stub — only the @digraph g@ header line is
-- emitted (no braces, no node/edge body), so the produced output is not yet
-- a complete DOT graph.  The large block comment below is kept verbatim: it
-- is the work-in-progress body, apparently adapted from an MST/cluster
-- visualisation (it references names like @rangeToMstToGroupToClusters@ and
-- @sortedMSF@ that are not in scope here).
orderDOT :: ChineseDict -> IO BS.Builder
orderDOT dict@(ChineseDict d) = do
  DOT.runDOT do
    -- dotComments [(BS.lazyByteString $ LazyText.encodeUtf8 $ pShow $ rangeToMstToGroupToClusters & Map.map Map.keys)]
    -- pTraceShow ("num of nodes", Map.size nodeToBranch, "num of branches", Map.size msf) $
    DOT.dotLine "digraph g"

{-
    dotBlock do
      dotLine "splines=\"ortho\""
      indexFrom1M_ (rangeToMstToGroupToClusters & Map.toList) \(srcR, mstToGroupToClusters) srcRI -> do
        let srcRB = "r" <> BS.intDec srcRI
        dotLine $ "subgraph cluster_" <> srcRB
        dotBlock do
          dotComments ["Create a node for the range " <> srcRB]
          dotNode
            srcRB
            [ ("shape", "box")
            , ("label", builderQuotedString (showHuman srcR))
            , ("color", "gray")
            , ("style", "filled")
            , ("fillcolor", "gray")
            ]
          dotLine "color=gray"
          dotBlock do
            dotLine "rank=same"
            dotComments ["Create the cluster nodes within the range " <> srcRB]
            forM_ (mstToGroupToClusters & Map.toList) \(mstI, groupToClusters) -> do
              forM_ (groupToClusters & Map.toList) \(srcGroup, srcClusters) -> do
                dotNodeCluster
                  srcRI
                  mstI
                  srcGroup
                  [
                    ( "label"
                    , builderQuotedString $
                        (srcClusters & toList <&> showHuman & List.unlines)
                          <> "\nT"
                          <> printf "%03d" mstI
                          <> "\nS"
                          <> show scaleI
                          -- <> {-maybe ""-} (("\n" <>) . showSimilarity) minSimil
                    )
                  , ("style", "filled")
                  , -- , minSimil & {-maybe ("", "")-} (\s -> ("fillcolor", 1 + ((floor (runProbability s * 10)) `mod` 10) & BS.intDec))
                    ("colorscheme", "ylorrd9")
                  , ("shape", "box")
                  ]
            dotComments ["Horizontally align the cluster nodes within the same range"]
            let row =
                  [ (mstI, group)
                  | (mstI, groupToClusters) <- mstToGroupToClusters & Map.toList
                  , (group, _clusters) <- groupToClusters & Map.toList
                  ]
            case row of
              [] -> return ()
              c@(firstMst, firstGroup) : cs -> do
                dotEdges
                  [srcRB, srcRB <> "t" <> BS.intDec firstMst <> "c" <> BS.intDec firstGroup]
                  [ ("style", "invis")
                  ]
                cs & (`foldM_` c) \(srcMst, srcGroup) dst@(dstMst, dstGroup) -> do
                  dotEdgesCluster
                    [(srcRI, srcMst, srcGroup), (srcRI, dstMst, dstGroup)]
                    [ ("weight", "10")
                    , ("style", "invis")
                    ]
                  return dst
      indexFrom1M_ sortedMSF \mst mstI -> do
        dotComments ["Create the edges of the MST " <> BS.intDec mstI]
        -- pTraceShowM (mstI, List.length (Tree.flatten mst))
        let loop (Tree.Node MSTNode{mstNodeRangeCluster = src} dsts) = do
              forM_ dsts \dstNode@(Tree.Node MSTNode{mstNodeRangeCluster = dst, mstNodeSimilarity = simil} _) -> do
                -- let similB = BS.stringUtf8 $ showFFloat (Just 2) (simil & runProbability & fromRational @Double) ""
                let indexRangeCluster (r, c) =
                      let clusterToGroup :: cluster :-> ClusterGroup =
                            Map.fromList
                              [ (cluster, group)
                              | (group, clusters) <-
                                  rangeToMstToGroupToClusters
                                    & Map.lookup r
                                    & fromMaybe Map.empty
                                    & Map.lookup mstI
                                    & fromMaybe Map.empty
                                    & Map.toList
                              , cluster <- clusters & Set.toList
                              ]
                       in ( 1 + Map.findIndex r rangeToMstToGroupToClusters
                          , mstI
                          , Map.lookup c clusterToGroup
                              & fromMaybe (error (LazyText.unpack (pShow ("r", r, "c", c {-, "clusterToGroup", clusterToGroup, "rangeToMstToGroupToClusters", rangeToMstToGroupToClusters-}))))
                          )
                dotEdgesCluster
                  [ indexRangeCluster src
                  , indexRangeCluster dst
                  ]
                  [ ("constraint", "false")
                  , ("color", (floor (runProbability simil * 10)) `mod` 10 & BS.intDec)
                  , ("colorscheme", "ylorrd9")
                  , -- , ("label", similB)
                    ("fontcolor", "blue")
                  , ("dir", "both")
                  , ("arrowhead", "dot")
                  , ("arrowtail", "dot")
                  ]
                loop dstNode
        loop mst
    dotRanges rangeToMstToGroupToClusters
-}