src/Language/Chinese.hs

   1 {-# LANGUAGE OverloadedLists #-}
   2 {-# LANGUAGE OverloadedStrings #-}
   3
   4 module Language.Chinese where
   5
   6 import Control.Applicative (Alternative (..))
   7 import Control.Monad.ST qualified as ST
   8 import Control.Monad.Trans.State qualified as MT
   9 import Data.Aeson qualified as JSON
  10 import Data.Aeson.Key qualified as JSON.DictKey
  11 import Data.Aeson.KeyMap qualified as JSON
  12 import Data.ByteString (ByteString)
  13 import Data.ByteString qualified as ByteString
  14 import Data.ByteString.Builder qualified as BS
  15 import Data.Char qualified as Char
  16 import Data.Csv ((.!))
  17 import Data.Csv qualified as CSV
  18 import Data.Csv.Incremental qualified as CSV.Incremental
  19 import Data.List qualified as List
  20 import Data.Map.Strict qualified as Map
  21 import Data.STRef qualified as ST
  22 import Data.Set qualified as Set
  23 import Data.Text qualified as Text
  24 import Data.Text.Short qualified as ShortText
  25 import Graph.DOT qualified as DOT
  26 import Numeric.Decimal.BoundedArithmetic (arithError, arithM)
  27 import Paths_worksheets qualified as Self
  28 import System.Exit qualified as Sys
  29 import System.FilePath qualified as Sys
  30 import System.FilePath.Posix ((</>))
  31 import System.FilePath.Posix qualified as File
  32 import System.IO qualified as Sys
  33 import Text.Read (read, readMaybe)
  34 import Worksheets.Utils.HTML (className, classes, styles, (!))
  35 import Worksheets.Utils.HTML qualified as HTML
  36 import Prelude (error, fromIntegral, undefined, (/))
  37
  38 -- import Text.Blaze
  39 import Text.Blaze.Html5 qualified as HTML
  40 import Text.Blaze.Html5.Attributes qualified as HA
  41 import Text.Blaze.Renderer.Utf8 qualified as Blaze
  42
  43 import Language.Pronunciation qualified as Pron
  44 import Worksheets.Utils.Char qualified as Char
  45 import Worksheets.Utils.JSON
  46 import Worksheets.Utils.Prelude
  47 import Worksheets.Utils.Probability
  48
  49 newtype ChineseDict = ChineseDict (Map ShortText ChineseDictEntries)
  50   deriving (Show)
  51 unChineseDict (ChineseDict x) = x
  52 instance Semigroup ChineseDict where
  53   ChineseDict x <> ChineseDict y =
  54     ChineseDict (Map.unionWithKey merge x y)
  55     where
  56       merge _k !xV !yV = xV <> yV
  57
  58 keyLengthToSumOfEntries :: ChineseDict -> Map Natural Natural
  59 keyLengthToSumOfEntries (ChineseDict d) =
  60   [ ( k & ShortText.length & fromIntegral
  61     , 1
  62     )
  63   | k <- d & Map.keys
  64   ]
  65     & Map.fromListWith (+)
  66
  67 chineseFrequencySum :: ChineseDict -> Double
  68 chineseFrequencySum (ChineseDict d) =
  69   [ pure freq
  70   | e <- d & Map.elems
  71   , freq <- e & chineseFrequency & maybeToList
  72   ]
  73     & sum
  74     & arithError
  75     & runProbability
  76     & fromRational @Double
  77
  78 instance Monoid ChineseDict where
  79   mempty = ChineseDict mempty
  80
  81 -- FIXME: use a sum type Char/Word
  82 data ChineseDictEntries = ChineseDictEntries
  83   { chinesePinyins :: ![ShortText]
  84   , -- TODO: , chineseIPA :: !IPA.IPA
  85     chineseEnglish :: ![ShortText]
  86   , chineseFrequency :: !(Maybe Probability)
  87   , chineseComponents :: !(Maybe [Char])
  88   , chineseStrokes :: !(Maybe Natural)
  89   -- ^ FIXME: use Natural
  90   }
  91   deriving (Generic, Show)
  92 chineseDecomp :: ShortText -> ChineseDictEntries -> [Char]
  93 chineseDecomp key ChineseDictEntries{chineseComponents} =
  94   chineseComponents
  95     & fromMaybe (if 1 < ShortText.length key then ShortText.unpack key else [])
  96
  97 instance Semigroup ChineseDictEntries where
  98   x <> y =
  99     ChineseDictEntries
 100       { chinesePinyins =
 101           if null (chinesePinyins x)
 102             then chinesePinyins y
 103             else
 104               if null (chinesePinyins y)
 105                 || chinesePinyins x
 106                 == chinesePinyins y
 107                 then chinesePinyins x
 108                 else -- traceString
 109                 --  ( mconcat $
 110                 --      [ "Semigroup: chinesePinyins: mismatch: "
 111                 --      , show (chinesePinyins x)
 112                 --      , " <> "
 113                 --      , show (chinesePinyins y)
 114                 --      ]
 115                 --  )
 116                   (chinesePinyins x)
 117       , chineseEnglish = chineseEnglish x <> chineseEnglish y
 118       , chineseFrequency = chineseFrequency x <|> chineseFrequency y
 119       , chineseComponents =
 120           case chineseComponents x of
 121             Nothing -> chineseComponents y
 122             Just xS ->
 123               case chineseComponents y of
 124                 Nothing -> chineseComponents x
 125                 Just yS
 126                   | xS == yS -> chineseComponents x
 127                   | otherwise -> error $ "Semigroup: chineseComponents: mismatch: " <> show xS <> " <> " <> show yS
 128       , chineseStrokes =
 129           case chineseStrokes x of
 130             Nothing -> chineseStrokes y
 131             Just xS ->
 132               case chineseStrokes y of
 133                 Nothing -> chineseStrokes x
 134                 Just yS
 135                   | xS == yS -> chineseStrokes x
 136                   | otherwise -> error $ "Semigroup: chineseStrokes: mismatch: " <> show xS <> " <> " <> show yS
 137       }
 138 instance Monoid ChineseDictEntries where
 139   mempty =
 140     ChineseDictEntries
 141       { chinesePinyins = mempty
 142       , chineseEnglish = mempty
 143       , chineseFrequency = Nothing
 144       , chineseComponents = Nothing
 145       , chineseStrokes = Nothing
 146       }
 147
 148 readLoachWangFreq :: IO ChineseDict
 149 readLoachWangFreq = do
 150   readJSON "data/langs/mandarin/LoachWang/word_freq.json" $
 151     JSON.withObject "WordFreq" \kv ->
 152       ChineseDict
 153         . Map.fromAscList
 154         <$> forM (kv & JSON.toMap & Map.toList) \(k, v) ->
 155           v & JSON.withScientific "Frequency" \freqSci -> do
 156             let freq = freqSci & toRational & probability & arithError
 157             return
 158               ( k & JSON.DictKey.toShortText
 159               , mempty{chineseFrequency = Just freq}
 160               )
 161
 162 readLoachWangStrokes :: IO ChineseDict
 163 readLoachWangStrokes =
 164   readJSON ("data/langs/mandarin/LoachWang/char_strokes.json") $
 165     JSON.withObject "WordFreq" \kv ->
 166       ChineseDict
 167         . Map.fromAscList
 168         <$> forM (kv & JSON.toMap & Map.toList) \(k, v) -> do
 169           strokes <- JSON.parseJSON v
 170           return
 171             ( k & JSON.DictKey.toShortText
 172             , mempty{chineseStrokes = Just strokes}
 173             )
 174
 175 readChiseStrokes :: IO ChineseDict
 176 readChiseStrokes =
 177   withDataFile "data/langs/mandarin/cjkvi-ids/ucs-strokes.txt" \dataHandle -> do
 178     let loop accDict@(ChineseDict !acc) = do
 179           isEOF <- dataHandle & Sys.hIsEOF
 180           if isEOF
 181             then return accDict
 182             else do
 183               line <- ByteString.hGetLine dataHandle
 184               let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
 185               let splitOnChar c = ByteString.split (fromIntegral (fromEnum c))
 186               case line & splitOnChar '\t' of
 187                 [unicodeLit, unicodeChar, chineseStrokesLit]
 188                   | unicodeLit & ByteString.isPrefixOf "U+" ->
 189                       loop $
 190                         ChineseDict $
 191                           acc
 192                             & Map.insert
 193                               (unicodeChar & decodeUtf8)
 194                               mempty{chineseStrokes}
 195                   where
 196                     chineseStrokes :: Maybe Natural =
 197                       chineseStrokesLit
 198                         & splitOnChar ','
 199                         <&> ( decodeUtf8
 200                                 >>> ShortText.unpack
 201                                 >>> readMaybe @Natural
 202                             )
 203                         -- CorrectnessWarning: conservative bias toward the greatest number of strokes.
 204                         & List.maximum
 205                         <&> fromIntegral
 206                 _ -> loop accDict
 207     loop mempty
 208
 209 readChiseDecomp :: IO ChineseDict
 210 readChiseDecomp =
 211   withDataFile "data/langs/mandarin/cjkvi-ids/ids.txt" $ (`loop` mempty)
 212   where
 213     loop dataHandle accDict@(ChineseDict !acc) = do
 214       isEOF <- dataHandle & Sys.hIsEOF
 215       if isEOF
 216         then return accDict
 217         else do
 218           line <- ByteString.hGetLine dataHandle
 219           let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
 220           let splitOnChar c = ByteString.split (fromIntegral (fromEnum c))
 221           case line & splitOnChar '\t' of
 222             unicodeLit : (decodeUtf8 -> unicodeChar) : decomps
 223               | unicodeLit & ByteString.isPrefixOf "U+" ->
 224                   let
 225                     decompWithCodes :: [([Char], Set Char)] =
 226                       decomps <&> \decomp ->
 227                         case decomp & ByteString.stripSuffix "]" of
 228                           Nothing ->
 229                             ( decomp & decodeUtf8 & ShortText.unpack
 230                             , mempty
 231                             )
 232                           Just decompWithCode ->
 233                             ( decompChars
 234                                 & ShortText.unpack
 235                                 -- ExplanationNote: remove the final '['
 236                                 & List.init
 237                             , decompCodes & ShortText.unpack & fromList
 238                             )
 239                             where
 240                               (decompChars, decompCodes) =
 241                                 decompWithCode
 242                                   & decodeUtf8
 243                                   & ShortText.breakEnd (== '[')
 244                     chineseComponents =
 245                       decompWithCodes
 246                         -- FIXME: CorrectnessWarning: maybe select using the Codes
 247                         -- instead of just taking the first.
 248                         & List.head
 249                         & fst
 250                         -- ExplanationNote: remove IDCs (Ideographic Description Characters)
 251                         -- namely "⿰" (U+2FF0) to "⿻" (U+2FFB).
 252                         & List.filter (\c -> not ('\x2FF0' <= c && c <= '\x2FFB'))
 253                         & List.filter (\c -> ShortText.singleton c /= unicodeChar)
 254                         & Just
 255                   in
 256                     loop dataHandle $
 257                       ChineseDict $
 258                         acc & Map.insert unicodeChar mempty{chineseComponents}
 259             _ -> loop dataHandle accDict
 260
 261 readChineseDict :: IO ChineseDict
 262 readChineseDict =
 263   fold $
 264     list
 265       [ return extraEntries
 266       , readLoachWangFreq
 267       , readChiseStrokes
 268       , readChiseDecomp
 269       , -- , readChineseLexicalDatabase
 270         readCEDICT
 271         -- , readLoachWangStrokes
 272         -- , readOutlierDecomp
 273       ]
 274 extraEntries :: ChineseDict
 275 extraEntries =
 276   ChineseDict $
 277     [
 278       [
 279         ( "·"
 280         , mempty
 281             { chineseStrokes = Just 1
 282             , chineseComponents = Just ['·']
 283             }
 284         )
 285       ,
 286         ( "库蠓属"
 287         , mempty
 288             { chinesePinyins = ["ku4", "meng3", "shi4"]
 289             , chineseEnglish = ["culicoides"]
 290             }
 291         )
 292       ,
 293         ( "瘦果"
 294         , mempty
 295             { chinesePinyins = ["shou4", "guo3"]
 296             , chineseEnglish = ["achene"]
 297             }
 298         )
 299       ,
 300         ( "美洲野马"
 301         , mempty
 302             { chinesePinyins = ["mei3", "zhou1", "ye3", "ma3"]
 303             , chineseEnglish = ["mustang horse"]
 304             }
 305         )
 306       ,
 307         ( "钩粉蝶"
 308         , mempty
 309             { chinesePinyins = ["gou1", "fen3", "die2"]
 310             , chineseEnglish = ["brimstone butterfly"]
 311             }
 312         )
 313       ,
 314         ( "核果"
 315         , mempty
 316             { chinesePinyins = ["he2", "guo3"]
 317             , chineseEnglish = ["drupe"]
 318             }
 319         )
 320       ,
 321         ( "0"
 322         , mempty
 323             { chineseStrokes = Just 14
 324             , chinesePinyins = ["ling2"] -- 零
 325             }
 326         )
 327       ,
 328         ( "1"
 329         , mempty
 330             { chineseStrokes = Just 2
 331             , chinesePinyins = ["yi1"]
 332             }
 333         )
 334       ,
 335         ( "2"
 336         , mempty
 337             { chineseStrokes = Just 2
 338             , chinesePinyins = ["er4"]
 339             }
 340         )
 341       ,
 342         ( "3"
 343         , mempty
 344             { chineseStrokes = Just 2
 345             , chinesePinyins = ["san1"]
 346             }
 347         )
 348       ,
 349         ( "4"
 350         , mempty
 351             { chineseStrokes = Just 2
 352             , chinesePinyins = ["si4"]
 353             }
 354         )
 355       ,
 356         ( "5"
 357         , mempty
 358             { chineseStrokes = Just 2
 359             , chinesePinyins = ["wu3"]
 360             }
 361         )
 362       ,
 363         ( "6"
 364         , mempty
 365             { chineseStrokes = Just 1
 366             , chinesePinyins = ["liu4"]
 367             }
 368         )
 369       ,
 370         ( "7"
 371         , mempty
 372             { chineseStrokes = Just 1
 373             , chinesePinyins = ["qi1"]
 374             }
 375         )
 376       ,
 377         ( "8"
 378         , mempty
 379             { chineseStrokes = Just 2
 380             , chinesePinyins = ["ba1"]
 381             }
 382         )
 383       ,
 384         ( "9"
 385         , mempty
 386             { chineseStrokes = Just 1
 387             , chinesePinyins = ["jiu3"]
 388             }
 389         )
 390       ]
 391     , [ ( letter & ShortText.singleton
 392         , mempty
 393             { chineseStrokes = Just 1
 394             , chinesePinyins = []
 395             }
 396         )
 397       | letter <-
 398           [ ['A' .. 'Z']
 399           , ['a' .. 'z']
 400           , ['π', '□', '○']
 401           ]
 402             & mconcat
 403       ]
 404     ]
 405       & mconcat
 406       & Map.fromList
 407
 408 readLoachWangWordOrder :: IO JSON.Array
 409 readLoachWangWordOrder =
 410   readJSON "data/langs/mandarin/LoachWang/loach_word_order.json" $
 411     JSON.withArray "LoachWordOrder" \arr -> return arr
 412
 413 readOutlierDecomp :: IO ChineseDict
 414 readOutlierDecomp =
 415   readJSON "data/langs/mandarin/LoachWang/outlier_decomp.json" $
 416     JSON.withObject "OutlierDecomp" \kv ->
 417       ChineseDict
 418         . Map.fromAscList
 419         <$> forM (kv & JSON.toMap & Map.toList) \(k, v) ->
 420           v & JSON.withArray "Decomp" \decomp ->
 421             forM
 422               (decomp & toList)
 423               ( \sub ->
 424                   sub & JSON.withText "Atom" \atom ->
 425                     return $ atom & Text.unpack
 426               )
 427               <&> \atoms ->
 428                 ( k & JSON.DictKey.toShortText
 429                 , mempty{chineseComponents = Just $ atoms & mconcat}
 430                 )
 431
 432 reverseDecomp :: ChineseDict -> Map {-def-} ShortText {-refs-} [ShortText]
 433 reverseDecomp (ChineseDict dict) =
 434   [ sub := [key]
 435   | (key, entry) <- dict & Map.toList
 436   , sub <- entry & chineseDecomp key <&> ShortText.singleton
 437   ]
 438     & Map.fromListWith (<>)
 439
 440 type Benefit = Probability
 441 type Cost = Natural
 442 type Weight = Probability
 443 type DictKey = ShortText
 444 data Weights = Weights
 445   { weightsMap :: Map DictKey (Benefit, Cost)
 446   }
 447   deriving (Show)
 448
 449 -- |
 450 -- Zhao, K., zhao, D., Yang, J., Ma, W., & Yu, B. (2019).
 451 -- Efficient Learning Strategy of Chinese Characters Based on
 452 -- Usage Frequency and Structural Relationship.
 453 -- 2019 IEEE 4th International Conference on Big Data Analytics (ICBDA).
 454 -- doi:10.1109/icbda.2019.8713245
 455 -- https://sci-hub.se/10.1109/icbda.2019.8713245
 456 dictToWeights :: ChineseDict -> Weights
 457 dictToWeights cn@(ChineseDict dict) = ST.runST do
 458   -- ExplanationNote: memo tables, to avoid recomputing benefits and costs
 459   keyToBenefitST :: ST.STRef st (Map DictKey Benefit) <- ST.newSTRef Map.empty
 460   keyToCostST :: ST.STRef st (Map DictKey Cost) <- ST.newSTRef Map.empty
 461   let refsMap :: Map ShortText [ShortText] = reverseDecomp cn
 462   let
 463     keyToBenefitLoop :: DictKey -> ST.ST st Benefit
 464     keyToBenefitLoop key = do
 465       -- traceShowM ("keyToBenefitLoop"::String, key)
 466       keyToBenefitMap <- keyToBenefitST & ST.readSTRef
 467       keyToBenefitMap
 468         & Map.lookup key
 469         & \case
 470           Just bene -> return bene
 471           Nothing -> do
 472             let refs :: [ShortText] =
 473                   refsMap
 474                     & Map.lookup key
 475                     & fromMaybe []
 476                     -- & fromMaybe []
 477                     & List.delete key
 478             -- traceShowM ("keyToBenefitLoop"::String, key, refs)
 479             let freq =
 480                   dict
 481                     & Map.lookup key
 482                     & maybe proba0 (chineseFrequency >>> fromMaybe proba0)
 483             refsBC <- forM refs \ref -> do
 484               bene <- keyToBenefitLoop ref
 485               cost <- keyToCostLoop ref
 486               return (bene, cost)
 487             cost <- keyToCostLoop key
 488             bene <-
 489               arithM $
 490                 probability $
 491                   runProbability freq
 492                     + toRational cost
 493                     * sum
 494                       [ runProbability b / toRational c
 495                       | (b, c) <- refsBC
 496                       , c /= 0
 497                       ]
 498             ST.modifySTRef keyToBenefitST $ Map.insert key bene
 499             return bene
 500     keyToCostLoop :: DictKey -> ST.ST st Cost
 501     keyToCostLoop key = do
 502       -- traceShowM ("keyToCostLoop"::String, key)
 503       keyToCostMap <- keyToCostST & ST.readSTRef
 504       keyToCostMap
 505         & Map.lookup key
 506         & \case
 507           Just cost -> return cost
 508           Nothing -> do
 509             let entry = dict & Map.lookup key & fromMaybe mempty
 510             cost <-
 511               -- single char
 512               if ShortText.length key == 1
 513                 then do
 514                   let deps =
 515                         entry
 516                           & chineseDecomp key
 517                           & List.delete (ShortText.unpack key & List.head)
 518                   -- & (`Set.difference` (ShortText.unpack key & Set.fromList))
 519                   if null deps
 520                     then -- atom char
 521                       return $
 522                         entry
 523                           & chineseStrokes
 524                           & fromMaybe 1
 525                     else -- & fromMaybe (error $ "dictToWeights: missing chineseStrokes for: " <> ShortText.unpack key)
 526                       do
 527                         -- composite char
 528                         -- traceShowM ("composite"::String, show key, show <$> Set.toList deps
 529                         --            , deps & (`Set.difference` (ShortText.unpack key & Set.fromList))
 530                         --            )
 531                         deps
 532                           & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton))
 533                           <&> getSum
 534                 else -- word
 535                   do
 536                     -- traceShowM ("word"::String, key)
 537                     key
 538                       & ShortText.unpack
 539                       & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton))
 540                       <&> getSum
 541             ST.modifySTRef keyToCostST $ Map.insert key cost
 542             return cost
 543   weightsMap <-
 544     dict & Map.traverseWithKey \key _entry -> do
 545       -- traceShowM ("dictToWeights"::String, key)
 546       bene <- keyToBenefitLoop key
 547       cost <- keyToCostLoop key
 548       return (bene, cost)
 549   return
 550     Weights
 551       { weightsMap
 552       }
 553
 554 newtype ChineseOrder = ChineseOrder (Map (Down Weight) Weights)
 555 unChineseOrder (ChineseOrder x) = x
 556 dictOrder :: Weights -> ChineseOrder
 557 dictOrder w =
 558   [ ( probability (runProbability b / toRational c) & arithError & Down
 559     , Weights
 560         { weightsMap = Map.singleton k v
 561         }
 562     )
 563   | (k, v@(b, c)) <- w & weightsMap & Map.toList
 564   ]
 565     & Map.fromListWith (\x y -> Weights{weightsMap = Map.unionWith (error "dictOrder") (weightsMap x) (weightsMap y)})
 566     & ChineseOrder
 567
 568 dictOrderIO = readChineseDict <&> (dictToWeights >>> dictOrder)
 569
 570 -- order100 :: IO [(DictKey, Maybe ChineseDictEntries)]
 571 order100 = do
 572   dict@(ChineseDict d) <- readChineseDict
 573   let ChineseOrder o = dict & dictToWeights & dictOrder
 574   return $
 575     o
 576       & Map.take 100
 577       & Map.elems
 578       & foldMap (weightsMap >>> Map.keys)
 579       <&> \k -> (k, d & Map.lookup k <&> \r -> (chinesePinyins r, chineseEnglish r))
 580
 581 dictOrderBySameWeight (ChineseOrder o) =
 582   [ ( Map.size (weightsMap v)
 583     , Map.singleton k v
 584     )
 585   | (k, v) <- o & Map.toList
 586   ]
 587     & Map.fromListWith (<>)
 588
 589 {-
 590 data UnfoldedGraph node lbl edge = UnfoldedGraph
 591   { unfoldedVertexToEdges :: !(Array.Array Vertex (lbl, Map edge IntSet))
 592   , unfoldedNodeToVertex :: !(Map node Vertex)
 593   }
 594   deriving (Show, Generic)
 595 unfoldGMany ::
 596   forall node edge nodeLabel.
 597   Ord node =>
 598   (node -> (nodeLabel, Map edge (Set node))) ->
 599   [node] ->
 600   UnfoldedGraph node nodeLabel edge
 601 unfoldGMany gen roots = ST.runST do
 602   nodeToVertexST :: ST.STRef st (Map node Vertex) <- ST.newSTRef Map.empty
 603   vertexToEdgesST :: ST.STRef st [(Vertex, (nodeLabel, Map edge IntSet))] <- ST.newSTRef []
 604   lastVertexST <- ST.newSTRef firstVertex
 605   let newVertex = do
 606         v <- ST.readSTRef lastVertexST
 607         ST.writeSTRef lastVertexST (v + 1)
 608         return v
 609   let
 610     nodeToVertexDepthFirst :: node -> ST.ST st Vertex
 611     nodeToVertexDepthFirst src =
 612       nodeToVertexST
 613         & ST.readSTRef
 614           <&> Map.lookup src
 615           >>= \case
 616             -- DescriptionNote: `src` has already been seen,
 617             -- just return its unique `Vertex`.
 618             Just v -> return v
 619             -- DescriptionNote: `src` has never been seen yet,
 620             -- allocate a `newVertex` for it
 621             Nothing -> do
 622               dst <- newVertex
 623               ST.modifySTRef nodeToVertexST $ Map.insert src dst
 624               let (nodeLabel, edgeToDsts) = gen src
 625               edgeToDstsV :: Map edge IntSet <-
 626                 edgeToDsts & Map.traverseWithKey \_edge dsts ->
 627                   forM (dsts & Set.toList) nodeToVertexDepthFirst
 628                     <&> IntSet.fromList
 629               let res = (dst, (nodeLabel, edgeToDstsV))
 630               ST.modifySTRef vertexToEdgesST (res :)
 631               return dst
 632   forM_ roots nodeToVertexDepthFirst
 633   nodeToVertex <- ST.readSTRef nodeToVertexST
 634   vertexToEdges <- ST.readSTRef vertexToEdgesST
 635   lastId <- ST.readSTRef lastVertexST
 636   return
 637     UnfoldedGraph
 638       { unfoldedVertexToEdges = vertexToEdges & Array.array (firstVertex, lastId - 1)
 639       , unfoldedNodeToVertex = nodeToVertex
 640       }
 641   where
 642     firstVertex :: Vertex = 0
 643 -}
 644
 645 -- mostComp d =
 646 --   [
 647 --   | d & reverseDecomp & Map.toList
 648 --   ] & Map.fromListWith (<>)
 649
 650 data ChineseTone
 651   = ChineseTone1
 652   | ChineseTone2
 653   | ChineseTone3
 654   | ChineseTone4
 655   | ChineseTone5
 656   deriving (Eq, Ord, Show, Enum)
 657
 658 numberedPinyinToDiacriticPiniyn :: ShortText -> ShortText
 659 numberedPinyinToDiacriticPiniyn numPin =
 660   case pinTone of
 661     Nothing -> pinRoot
 662     Just tone ->
 663       pinRoot
 664         & ShortText.unpack
 665         & addUmlaut
 666         & setDiacritic
 667         & ShortText.pack
 668       where
 669         addUmlaut ('u' : ':' : cs) = 'ü' : addUmlaut cs
 670         addUmlaut ('v' : cs) = 'ü' : addUmlaut cs
 671         addUmlaut ('U' : ':' : cs) = 'Ü' : addUmlaut cs
 672         addUmlaut ('V' : cs) = 'Ü' : addUmlaut cs
 673         addUmlaut (c : cs) = c : addUmlaut cs
 674         addUmlaut [] = []
 675         -- CorrectnessNote: a, e or the o in ou always get the marker
 676         setDiacritic (c@'a' : cs) = convert tone c : cs
 677         setDiacritic (c@'e' : cs) = convert tone c : cs
 678         setDiacritic (c@'o' : 'u' : cs) = convert tone c : 'u' : cs
 679         setDiacritic (c : cs)
 680           -- CorrectnessNote: if no a, e, or ou found, the tone mark goes on the last vowel
 681           | List.elem c vowels && all (`List.notElem` vowels) cs = convert tone c : cs
 682           | otherwise = c : setDiacritic cs
 683         setDiacritic [] = []
 684         vowels = lowerVowels <> (Char.toUpper <$> lowerVowels)
 685         lowerVowels = ['a', 'e', 'i', 'o', 'u', 'ü'] & list
 686   where
 687     (pinRoot, pinTone) ::
 688       (ShortText, Maybe ChineseTone) =
 689         numPin & ShortText.spanEnd Char.isDigit & second \case
 690           "" -> Nothing
 691           ds -> ds & ShortText.unpack & read & (\x -> x - (1 :: Int)) & toEnum & Just
 692     convert :: ChineseTone -> Char -> Char
 693     convert t c = case t of
 694       ChineseTone1 ->
 695         case c of
 696           'a' -> 'ā'
 697           'e' -> 'ē'
 698           'i' -> 'ī'
 699           'o' -> 'ō'
 700           'u' -> 'ū'
 701           'ü' -> 'ǖ'
 702           'A' -> 'Ā'
 703           'E' -> 'Ē'
 704           'I' -> 'Ī'
 705           'O' -> 'Ō'
 706           'U' -> 'Ū'
 707           'Ü' -> 'Ǖ'
 708           _ -> undefined
 709       ChineseTone2 ->
 710         case c of
 711           'a' -> 'á'
 712           'e' -> 'é'
 713           'i' -> 'í'
 714           'o' -> 'ó'
 715           'u' -> 'ú'
 716           'ü' -> 'ǘ'
 717           'A' -> 'Á'
 718           'E' -> 'É'
 719           'I' -> 'Í'
 720           'O' -> 'Ó'
 721           'U' -> 'Ú'
 722           'Ü' -> 'Ǘ'
 723           _ -> undefined
 724       ChineseTone3 ->
 725         case c of
 726           'a' -> 'ǎ'
 727           'e' -> 'ě'
 728           'i' -> 'ǐ'
 729           'o' -> 'ǒ'
 730           'u' -> 'ǔ'
 731           'ü' -> 'ǚ'
 732           'A' -> 'Ǎ'
 733           'E' -> 'Ě'
 734           'I' -> 'Ǐ'
 735           'O' -> 'Ǒ'
 736           'U' -> 'Ǔ'
 737           'Ü' -> 'Ǚ'
 738           _ -> undefined
 739       ChineseTone4 ->
 740         case c of
 741           'a' -> 'à'
 742           'e' -> 'è'
 743           'i' -> 'ì'
 744           'o' -> 'ò'
 745           'u' -> 'ù'
 746           'ü' -> 'ǜ'
 747           'A' -> 'À'
 748           'E' -> 'È'
 749           'I' -> 'Ì'
 750           'O' -> 'Ò'
 751           'U' -> 'Ù'
 752           'Ü' -> 'Ǜ'
 753           _ -> undefined
 754       ChineseTone5 -> c
 755
 756 data HskLevel
 757   = HskLevel301
 758   | HskLevel302
 759   | HskLevel303
 760   | HskLevel304
 761   | HskLevel305
 762   | HskLevel306
 763   deriving (Eq, Ord, Enum, Show)
 764
 765 -- instance CSV.FromRecord ChineseDictEntries
 766 -- instance CSV.ToRecord ChineseDictEntries
 767 -- instance CSV.FromNamedRecord ChineseDictEntries
 768 -- instance CSV.ToNamedRecord ChineseDictEntries
 769 -- instance CSV.DefaultOrdered ChineseDictEntries
 770
 771 feed :: (ByteString -> r) -> Sys.Handle -> Sys.IO r
 772 feed k csvFile = do
 773   Sys.hIsEOF csvFile >>= \case
 774     True -> return $ k ""
 775     False -> k <$> ByteString.hGetSome csvFile 4096
 776
 777 readHSK :: HskLevel -> IO ChineseDict
 778 readHSK chineseHSK = do
 779   withDataFile ("data/langs/mandarin/HSK/hsk" <> show hskIndex Sys.<.> "csv") \fileHandle -> do
 780     loop fileHandle mempty $
 781       CSV.Incremental.decodeWithP parser decodeOpts CSV.NoHeader
 782   where
 783     hskIndex = chineseHSK & fromEnum & (+ 1)
 784     decodeOpts = CSV.defaultDecodeOptions
 785     parser :: CSV.Record -> CSV.Parser (ShortText, ChineseDictEntries)
 786     parser v
 787       | length v == 3 = do
 788           chinese <- v .! 0
 789           chinesePinyins <- v .! 1 <&> ShortText.split (== ' ')
 790           chineseEnglish <- v .! 2 <&> pure
 791           let chineseFrequency = Nothing
 792           let chineseComponents = Nothing
 793           let chineseStrokes = Nothing
 794           pure (chinese, ChineseDictEntries{..})
 795       | otherwise = empty
 796     check =
 797       either (\x -> Sys.print x >> return mempty) \(chinese, e) ->
 798         return $ ChineseDict $ Map.singleton chinese e
 799     loop fileHandle !acc = \case
 800       CSV.Incremental.Fail _ errMsg -> do
 801         Sys.putStrLn errMsg
 802         Sys.exitFailure
 803       CSV.Incremental.Many rs k -> do
 804         ok <- rs & foldMapM check
 805         t <- feed k fileHandle
 806         loop fileHandle (acc <> ok) t
 807       CSV.Incremental.Done rs -> do
 808         ok <- rs & foldMapM check
 809         return (acc <> ok)
 810
 811 -- | Sun, C.C., Hendrix, P., Ma, J. et al.
 812 -- Chinese lexical database (CLD).
 813 -- Behav Res 50, 2606–2629 (2018).
 814 -- https://doi.org/10.3758/s13428-018-1038-3
 815 readChineseLexicalDatabase :: IO ChineseDict
 816 readChineseLexicalDatabase = do
 817   withDataFile "data/langs/mandarin/chineselexicaldatabase2.1.csv" \fileHandle -> do
 818     loop fileHandle mempty $
 819       CSV.Incremental.decodeWithP parser decodeOpts CSV.HasHeader
 820   where
 821     decodeOpts = CSV.defaultDecodeOptions
 822     parser :: CSV.Record -> CSV.Parser (DictKey, ChineseDictEntries)
 823     parser v
 824       | length v == 269 = do
 825           key <- v .! 0
 826           keyLength :: Int <- v .! 5
 827           pinyin1 <- v .! 15
 828           pinyin2 <- v .! 16
 829           pinyin3 <- v .! 17
 830           pinyin4 <- v .! 18
 831           let chinesePinyins =
 832                 [pinyin1, pinyin2, pinyin3, pinyin4]
 833                   & List.take keyLength
 834           freqPerMillion :: Double <- v .! 39
 835           let chineseFrequency =
 836                 freqPerMillion
 837                   & toRational
 838                   & (/ 1000000)
 839                   & probability
 840                   & fromMaybe (error "readChineseLexicalDatabase: Frequency")
 841                   & Just
 842           pure (key, mempty{chineseFrequency, chinesePinyins})
 843       | otherwise = error $ "readChineseLexicalDatabase: wrong number of columns: " <> show v
 844     check =
 845       either (\x -> Sys.print x >> return mempty) \(key, val) ->
 846         return $ ChineseDict $ Map.singleton key val
 847     loop fileHandle !acc = \case
 848       CSV.Incremental.Fail _ errMsg -> do
 849         Sys.putStrLn errMsg
 850         Sys.exitFailure
 851       CSV.Incremental.Many rs k -> do
 852         ok <- rs & foldMapM check
 853         t <- feed k fileHandle
 854         loop fileHandle (acc <> ok) t
 855       CSV.Incremental.Done rs -> do
 856         ok <- rs & foldMapM check
 857         return (acc <> ok)
 858
 859 readCEDICT :: IO ChineseDict
 860 readCEDICT = do
 861   withDataFile "data/langs/mandarin/CEDICT/cedict_ts.u8" \cedictHandle -> do
 862     let skipHeader = do
 863           isEOF <- cedictHandle & Sys.hIsEOF
 864           if isEOF
 865             then return ()
 866             else do
 867               lineBS <- ByteString.hGetLine cedictHandle
 868               let lineST = lineBS & ShortText.fromByteString & fromMaybe (error "invalid UTF-8")
 869               let begin = lineST & ShortText.take 1
 870               when (begin == "#") do
 871                 skipHeader
 872     skipHeader
 873     let loop !acc = do
 874           isEOF <- cedictHandle & Sys.hIsEOF
 875           if isEOF
 876             then return acc
 877             else do
 878               line <- ByteString.hGetLine cedictHandle
 879               let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
 880               -- DescriptionNote: each line is formatted as: #(.+) (.+) \[(.+)] /(.*)/#'
 881               let skipChar c = void $ MT.state $ ByteString.span (== fromIntegral (fromEnum c))
 882               let skipPrefix p =
 883                     MT.modify' $
 884                       ByteString.stripPrefix p
 885                         >>> fromMaybe (error $ "skipPrefix fail to match: " <> show p)
 886               let breakOnChar c = ByteString.break (== fromIntegral (fromEnum c))
 887               let breakOnSpace = breakOnChar ' '
 888               let skipSuffix p =
 889                     MT.modify' $
 890                       ByteString.stripSuffix p
 891                         >>> fromMaybe
 892                           ( error $
 893                               "skipSuffix: mismatch: "
 894                                 <> show p
 895                                 <> "\n  on line: "
 896                                 <> ShortText.unpack (decodeUtf8 line)
 897                                 <> "\n  escaped: "
 898                                 <> show (ShortText.unpack (decodeUtf8 line))
 899                           )
 900               let (dict, leftover) = (`MT.runState` line) do
 901                     _chineseTrad <- MT.state $ breakOnSpace >>> first decodeUtf8
 902                     skipChar ' '
 903                     chineseSimpl <- MT.state $ breakOnSpace >>> first decodeUtf8
 904                     skipChar ' '
 905                     skipPrefix "["
 906                     chinesePinyins <-
 907                       MT.state $
 908                         breakOnChar ']'
 909                           >>> first
 910                             ( \s ->
 911                                 s
 912                                   & ByteString.split (fromIntegral (fromEnum ' '))
 913                                   -- CorrectnessWarning: convert pinyin to lowercase, and remove dashes
 914                                   <&> (decodeUtf8 >>> ShortText.unpack >>> (<&> Char.toLower) >>> ShortText.pack)
 915                                   & List.filter (/= "-")
 916                             )
 917                     skipPrefix "] /"
 918                     -- CorrectnessNote: some lines do not end with \r
 919                     -- hence make it optional.
 920                     MT.modify' \s -> s & ByteString.stripSuffix "\r" & fromMaybe s
 921                     skipSuffix "/"
 922                     chineseEnglish <- MT.gets \s -> s & ByteString.split (fromIntegral (fromEnum '/')) <&> decodeUtf8
 923                     MT.put mempty
 924                     let chinese = chineseSimpl
 925                     return $
 926                       ChineseDict $
 927                         Map.singleton chinese $
 928                           mempty
 929                             { chinesePinyins
 930                             , chineseEnglish
 931                             }
 932               if not (ByteString.null leftover)
 933                 then error $ "parserLine: leftover: " <> show leftover
 934                 else loop (acc <> dict)
 935     loop mempty
 936
 937 lookupPinyins :: ChineseDict -> ShortText -> [ShortText]
 938 lookupPinyins (ChineseDict dict) word =
 939   word
 940     & (`Map.lookup` dict)
 941     & fromMaybe (error $ "lookupPinyins: no entry for: {" <> wordString <> "}")
 942     & chinesePinyins
 943     & (\ps -> if null ps then error $ "lookupPinyins: empty entry for: " <> wordString else ps)
 944   where
 945     wordString = word & ShortText.unpack
 946
 947 pronunciation :: ChineseDict -> [Pron.Lexeme] -> [[Either Char Pron.Pron]]
 948 pronunciation dict inp =
 949   inp & Pron.lexemesChars & Text.pack & Text.words <&> wordToPron
 950   where
 951     wordToPron :: Text -> [Either Char Pron.Pron]
 952     wordToPron wordText =
 953       [ pronun chars charCateg charBlock
 954       | (charCateg, uniBlockToChars) <-
 955           wordText
 956             & Text.unpack
 957             & List.map (\c -> (Char.generalCategory c, (Char.unicodeBlock c, c)))
 958             & Char.consecutiveGroups
 959             <&> (<&> Char.consecutiveGroups)
 960       , (charBlock, chars) <- uniBlockToChars
 961       ]
 962         & mconcat
 963       where
 964         pronun :: [Char] -> Char.GeneralCategory -> Maybe Char.UnicodeBlock -> [Either Char Pron.Pron]
 965         pronun chars Char.DecimalNumber _ =
 966           [ Right
 967             Pron.Pron
 968               { pronInput = [Pron.LexemeChar char]
 969               , pronRule =
 970                   Pron.rule
 971                     { Pron.rulePron =
 972                         Pron.Pronunciations
 973                           { Pron.unPronunciations =
 974                               [ Pron.RuleLexemes [Pron.LexemeChar char] :=
 975                                   Pron.Pronunciation
 976                                     { Pron.pronunciationIPABroad = []
 977                                     , Pron.pronunciationText = txt char
 978                                     }
 979                               ]
 980                           }
 981                     }
 982               }
 983           | char <- chars
 984           ]
 985           where
 986             txt c =
 987               c
 988                 & ShortText.singleton
 989                 & lookupPinyins dict
 990                 <&> ( \p ->
 991                         p
 992                           & numberedPinyinToDiacriticPiniyn
 993                           & ShortText.toText
 994                           & Text.toLower
 995                     )
 996                 & Text.intercalate " "
 997         pronun chars Char.OpenPunctuation _ =
 998           [ Left c
 999           | c <- chars
1000           ]
1001         pronun chars Char.ClosePunctuation _ =
1002           [ Left c
1003           | c <- chars
1004           ]
1005         pronun chars Char.OtherPunctuation _ =
1006           [ Left c
1007           | c <- chars
1008           ]
1009         pronun chars _ (Just Char.UnicodeBlockCJK{}) =
1010           [ Right
1011               Pron.Pron
1012                 { pronInput = Pron.LexemeChar <$> chars
1013                 , pronRule =
1014                     Pron.rule
1015                       { Pron.rulePron =
1016                           Pron.Pronunciations
1017                             { Pron.unPronunciations =
1018                                 [ Pron.RuleLexemes (Pron.LexemeChar <$> chars) :=
1019                                     Pron.Pronunciation
1020                                       { Pron.pronunciationIPABroad = []
1021                                       , Pron.pronunciationText = txt
1022                                       }
1023                                 ]
1024                             }
1025                       }
1026                 }
1027           ]
1028           where
1029             txt =
1030               chars
1031                 & ShortText.pack
1032                 & lookupPinyins dict
1033                 & List.map
1034                   ( \pinyin ->
1035                       pinyin
1036                         & numberedPinyinToDiacriticPiniyn
1037                         & ShortText.toText
1038                         & Text.toLower
1039                   )
1040                 & Text.intercalate " "
1041         -- if List.length wordPinyins == Text.length wordText = wordPinyins
1042         -- where
1043         -- wordPinyins = chars
1044         --  & ShortText.pack
1045         --  & lookupPinyins dict
1046         pronun chars categ block = errorShow ("pronunciation" :: Text, chars, categ, block)
1047
1048 orderHTML dict@(ChineseDict d) = do
1049   dataPath <- Self.getDataDir <&> File.normalise
1050   return $ Blaze.renderMarkupBuilder do
1051     HTML.docTypeHtml do
1052       HTML.head do
1053         HTML.title $ ("Chinese Order" :: Text) & HTML.toHtml
1054         forM_
1055           ( [ "styles/Paper.css"
1056             , "styles/Rosetta/Common.css"
1057             , "styles/Rosetta/Reading.css"
1058             ]
1059               & list
1060           )
1061           \cssFile ->
1062             HTML.link
1063               ! HA.rel "stylesheet"
1064               ! HA.type_ "text/css"
1065               ! HA.href (dataPath </> cssFile & HTML.toValue)
1066       -- HTML.styleCSS $ pagesDifficulties & difficultyCSS
1067       HTML.body do
1068         let ChineseOrder o = dict & dictToWeights & dictOrder
1069         HTML.ol do
1070           forM_
1071             ( o
1072                 & Map.take 5000
1073                 & Map.elems
1074                 & foldMap (weightsMap >>> Map.keys)
1075             )
1076             \k ->
1077               HTML.li
1078                 ! HTML.styles
1079                   [ "break-inside" := "avoid"
1080                   , "list-style-position" := "inside"
1081                   ]
1082                 $ do
1083                   HTML.span $ k & HTML.toHtml
1084                   forM_ (d & Map.lookup k) \v -> do
1085                     HTML.ul ! HTML.styles [] $ do
1086                       HTML.li do
1087                         HTML.span $ v & chinesePinyins <&> ShortText.toText & Text.unwords & HTML.toHtml
1088                       HTML.li do
1089                         HTML.span $ v & chineseDecomp k & Text.pack & HTML.toHtml
1090                       HTML.li do
1091                         HTML.span $ v & chineseEnglish <&> ShortText.toText & Text.intercalate "; " & HTML.toHtml
1092
1093 orderDOT :: ChineseDict -> IO BS.Builder
1094 orderDOT dict@(ChineseDict d) = do
1095   DOT.runDOT do
1096     -- dotComments [(BS.lazyByteString $ LazyText.encodeUtf8 $ pShow $ rangeToMstToGroupToClusters & Map.map Map.keys)]
1097     -- pTraceShow ("num of nodes", Map.size nodeToBranch, "num of branches", Map.size msf) $
1098     DOT.dotLine "digraph g"
1099
1100 {-
1101 dotBlock do
1102   dotLine "splines=\"ortho\""
1103   indexFrom1M_ (rangeToMstToGroupToClusters & Map.toList) \(srcR, mstToGroupToClusters) srcRI -> do
1104     let srcRB = "r" <> BS.intDec srcRI
1105     dotLine $ "subgraph cluster_" <> srcRB
1106     dotBlock do
1107       dotComments ["Create a node for the range " <> srcRB]
1108       dotNode
1109         srcRB
1110         [ ("shape", "box")
1111         , ("label", builderQuotedString (showHuman srcR))
1112         , ("color", "gray")
1113         , ("style", "filled")
1114         , ("fillcolor", "gray")
1115         ]
1116       dotLine "color=gray"
1117       dotBlock do
1118         dotLine "rank=same"
1119         dotComments ["Create the cluster nodes within the range " <> srcRB]
1120         forM_ (mstToGroupToClusters & Map.toList) \(mstI, groupToClusters) -> do
1121           forM_ (groupToClusters & Map.toList) \(srcGroup, srcClusters) -> do
1122             dotNodeCluster
1123               srcRI
1124               mstI
1125               srcGroup
1126               [
1127                 ( "label"
1128                 , builderQuotedString $
1129                     (srcClusters & toList <&> showHuman & List.unlines)
1130                       <> "\nT"
1131                       <> printf "%03d" mstI
1132                       <> "\nS"
1133                       <> show scaleI
1134                       -- <> {-maybe ""-} (("\n" <>) . showSimilarity) minSimil
1135                 )
1136               , ("style", "filled")
1137               , -- , minSimil & {-maybe ("", "")-} (\s -> ("fillcolor", 1 + ((floor (runProbability s * 10)) `mod` 10) & BS.intDec))
1138                 ("colorscheme", "ylorrd9")
1139               , ("shape", "box")
1140               ]
1141         dotComments ["Horizontally align the cluster nodes within the same range"]
1142         let row =
1143               [ (mstI, group)
1144               | (mstI, groupToClusters) <- mstToGroupToClusters & Map.toList
1145               , (group, _clusters) <- groupToClusters & Map.toList
1146               ]
1147         case row of
1148           [] -> return ()
1149           c@(firstMst, firstGroup) : cs -> do
1150             dotEdges
1151               [srcRB, srcRB <> "t" <> BS.intDec firstMst <> "c" <> BS.intDec firstGroup]
1152               [ ("style", "invis")
1153               ]
1154             cs & (`foldM_` c) \(srcMst, srcGroup) dst@(dstMst, dstGroup) -> do
1155               dotEdgesCluster
1156                 [(srcRI, srcMst, srcGroup), (srcRI, dstMst, dstGroup)]
1157                 [ ("weight", "10")
1158                 , ("style", "invis")
1159                 ]
1160               return dst
1161   indexFrom1M_ sortedMSF \mst mstI -> do
1162     dotComments ["Create the edges of the MST " <> BS.intDec mstI]
1163     -- pTraceShowM (mstI, List.length (Tree.flatten mst))
1164     let loop (Tree.Node MSTNode{mstNodeRangeCluster = src} dsts) = do
1165           forM_ dsts \dstNode@(Tree.Node MSTNode{mstNodeRangeCluster = dst, mstNodeSimilarity = simil} _) -> do
1166             -- let similB = BS.stringUtf8 $ showFFloat (Just 2) (simil & runProbability & fromRational @Double) ""
1167             let indexRangeCluster (r, c) =
1168                   let clusterToGroup :: cluster :-> ClusterGroup =
1169                         Map.fromList
1170                           [ (cluster, group)
1171                           | (group, clusters) <-
1172                               rangeToMstToGroupToClusters
1173                                 & Map.lookup r
1174                                 & fromMaybe Map.empty
1175                                 & Map.lookup mstI
1176                                 & fromMaybe Map.empty
1177                                 & Map.toList
1178                           , cluster <- clusters & Set.toList
1179                           ]
1180                   in ( 1 + Map.findIndex r rangeToMstToGroupToClusters
1181                      , mstI
1182                      , Map.lookup c clusterToGroup
1183                         & fromMaybe (error (LazyText.unpack (pShow ("r", r, "c", c {-, "clusterToGroup", clusterToGroup, "rangeToMstToGroupToClusters", rangeToMstToGroupToClusters-}))))
1184                      )
1185             dotEdgesCluster
1186               [ indexRangeCluster src
1187               , indexRangeCluster dst
1188               ]
1189               [ ("constraint", "false")
1190               , ("color", (floor (runProbability simil * 10)) `mod` 10 & BS.intDec)
1191               , ("colorscheme", "ylorrd9")
1192               , -- , ("label", similB)
1193                 ("fontcolor", "blue")
1194               , ("dir", "both")
1195               , ("arrowhead", "dot")
1196               , ("arrowtail", "dot")
1197               ]
1198             loop dstNode
1199     loop mst
1200   dotRanges rangeToMstToGroupToClusters
1201 -}