1 {-# LANGUAGE OverloadedLists #-}
2 {-# LANGUAGE OverloadedStrings #-}
4 module Language.Chinese where
6 import Control.Applicative (Alternative (..))
7 import Control.Monad.ST qualified as ST
8 import Control.Monad.Trans.State qualified as MT
9 import Data.Aeson qualified as JSON
10 import Data.Aeson.Key qualified as JSON.DictKey
11 import Data.Aeson.KeyMap qualified as JSON
12 import Data.ByteString (ByteString)
13 import Data.ByteString qualified as ByteString
14 import Data.ByteString.Builder qualified as BS
15 import Data.Char qualified as Char
16 import Data.Csv ((.!))
17 import Data.Csv qualified as CSV
18 import Data.Csv.Incremental qualified as CSV.Incremental
19 import Data.List qualified as List
20 import Data.Map.Strict qualified as Map
21 import Data.STRef qualified as ST
22 import Data.Set qualified as Set
23 import Data.Text qualified as Text
24 import Data.Text.Short qualified as ShortText
25 import Graph.DOT qualified as DOT
26 import Numeric.Decimal.BoundedArithmetic (arithError, arithM)
27 import Paths_worksheets qualified as Self
28 import System.Exit qualified as Sys
29 import System.FilePath qualified as Sys
30 import System.FilePath.Posix ((</>))
31 import System.FilePath.Posix qualified as File
32 import System.IO qualified as Sys
33 import Text.Read (read, readMaybe)
34 import Worksheets.Utils.HTML (className, classes, styles, (!))
35 import Worksheets.Utils.HTML qualified as HTML
36 import Prelude (error, fromIntegral, undefined, (/))
39 import Text.Blaze.Html5 qualified as HTML
40 import Text.Blaze.Html5.Attributes qualified as HA
41 import Text.Blaze.Renderer.Utf8 qualified as Blaze
43 import Language.Pronunciation qualified as Pron
44 import Worksheets.Utils.Char qualified as Char
45 import Worksheets.Utils.JSON
46 import Worksheets.Utils.Prelude
47 import Worksheets.Utils.Probability
49 newtype ChineseDict = ChineseDict (Map ShortText ChineseDictEntries)
51 unChineseDict (ChineseDict x) = x
52 instance Semigroup ChineseDict where
53 ChineseDict x <> ChineseDict y =
54 ChineseDict (Map.unionWithKey merge x y)
56 merge _k !xV !yV = xV <> yV
58 keyLengthToSumOfEntries :: ChineseDict -> Map Natural Natural
59 keyLengthToSumOfEntries (ChineseDict d) =
60 [ ( k & ShortText.length & fromIntegral
65 & Map.fromListWith (+)
67 chineseFrequencySum :: ChineseDict -> Double
68 chineseFrequencySum (ChineseDict d) =
71 , freq <- e & chineseFrequency & maybeToList
76 & fromRational @Double
78 instance Monoid ChineseDict where
79 mempty = ChineseDict mempty
81 -- FIXME: use a sum type Char/Word
82 data ChineseDictEntries = ChineseDictEntries
83 { chinesePinyins :: ![ShortText]
84 , -- TODO: , chineseIPA :: !IPA.IPA
85 chineseEnglish :: ![ShortText]
86 , chineseFrequency :: !(Maybe Probability)
87 , chineseComponents :: !(Maybe [Char])
88 , chineseStrokes :: !(Maybe Natural)
89 -- ^ FIXME: use Natural
91 deriving (Generic, Show)
92 chineseDecomp :: ShortText -> ChineseDictEntries -> [Char]
93 chineseDecomp key ChineseDictEntries{chineseComponents} =
95 & fromMaybe (if 1 < ShortText.length key then ShortText.unpack key else [])
97 instance Semigroup ChineseDictEntries where
101 if null (chinesePinyins x)
102 then chinesePinyins y
104 if null (chinesePinyins y)
107 then chinesePinyins x
110 -- [ "Semigroup: chinesePinyins: mismatch: "
111 -- , show (chinesePinyins x)
113 -- , show (chinesePinyins y)
117 , chineseEnglish = chineseEnglish x <> chineseEnglish y
118 , chineseFrequency = chineseFrequency x <|> chineseFrequency y
119 , chineseComponents =
120 case chineseComponents x of
121 Nothing -> chineseComponents y
123 case chineseComponents y of
124 Nothing -> chineseComponents x
126 | xS == yS -> chineseComponents x
127 | otherwise -> error $ "Semigroup: chineseComponents: mismatch: " <> show xS <> " <> " <> show yS
129 case chineseStrokes x of
130 Nothing -> chineseStrokes y
132 case chineseStrokes y of
133 Nothing -> chineseStrokes x
135 | xS == yS -> chineseStrokes x
136 | otherwise -> error $ "Semigroup: chineseStrokes: mismatch: " <> show xS <> " <> " <> show yS
138 instance Monoid ChineseDictEntries where
141 { chinesePinyins = mempty
142 , chineseEnglish = mempty
143 , chineseFrequency = Nothing
144 , chineseComponents = Nothing
145 , chineseStrokes = Nothing
148 readLoachWangFreq :: IO ChineseDict
149 readLoachWangFreq = do
150 readJSON "data/langs/mandarin/LoachWang/word_freq.json" $
151 JSON.withObject "WordFreq" \kv ->
154 <$> forM (kv & JSON.toMap & Map.toList) \(k, v) ->
155 v & JSON.withScientific "Frequency" \freqSci -> do
156 let freq = freqSci & toRational & probability & arithError
158 ( k & JSON.DictKey.toShortText
159 , mempty{chineseFrequency = Just freq}
162 readLoachWangStrokes :: IO ChineseDict
163 readLoachWangStrokes =
164 readJSON ("data/langs/mandarin/LoachWang/char_strokes.json") $
165 JSON.withObject "WordFreq" \kv ->
168 <$> forM (kv & JSON.toMap & Map.toList) \(k, v) -> do
169 strokes <- JSON.parseJSON v
171 ( k & JSON.DictKey.toShortText
172 , mempty{chineseStrokes = Just strokes}
175 readChiseStrokes :: IO ChineseDict
177 withDataFile "data/langs/mandarin/cjkvi-ids/ucs-strokes.txt" \dataHandle -> do
178 let loop accDict@(ChineseDict !acc) = do
179 isEOF <- dataHandle & Sys.hIsEOF
183 line <- ByteString.hGetLine dataHandle
184 let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
185 let splitOnChar c = ByteString.split (fromIntegral (fromEnum c))
186 case line & splitOnChar '\t' of
187 [unicodeLit, unicodeChar, chineseStrokesLit]
188 | unicodeLit & ByteString.isPrefixOf "U+" ->
193 (unicodeChar & decodeUtf8)
194 mempty{chineseStrokes}
196 chineseStrokes :: Maybe Natural =
201 >>> readMaybe @Natural
203 -- CorrectnessWarning: conservative bias toward the greatest number of strokes.
209 readChiseDecomp :: IO ChineseDict
211 withDataFile "data/langs/mandarin/cjkvi-ids/ids.txt" $ (`loop` mempty)
213 loop dataHandle accDict@(ChineseDict !acc) = do
214 isEOF <- dataHandle & Sys.hIsEOF
218 line <- ByteString.hGetLine dataHandle
219 let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
220 let splitOnChar c = ByteString.split (fromIntegral (fromEnum c))
221 case line & splitOnChar '\t' of
222 unicodeLit : (decodeUtf8 -> unicodeChar) : decomps
223 | unicodeLit & ByteString.isPrefixOf "U+" ->
225 decompWithCodes :: [([Char], Set Char)] =
226 decomps <&> \decomp ->
227 case decomp & ByteString.stripSuffix "]" of
229 ( decomp & decodeUtf8 & ShortText.unpack
232 Just decompWithCode ->
235 -- ExplanationNote: remove the final '['
237 , decompCodes & ShortText.unpack & fromList
240 (decompChars, decompCodes) =
243 & ShortText.breakEnd (== '[')
246 -- FIXME: CorrectnessWarning: maybe select using the Codes
247 -- instead of just taking the first.
250 -- ExplanationNote: remove IDCs (Ideographic Description Characters)
251 -- namely "⿰" (U+2FF0) to "⿻" (U+2FFB).
252 & List.filter (\c -> not ('\x2FF0' <= c && c <= '\x2FFB'))
253 & List.filter (\c -> ShortText.singleton c /= unicodeChar)
258 acc & Map.insert unicodeChar mempty{chineseComponents}
259 _ -> loop dataHandle accDict
261 readChineseDict :: IO ChineseDict
265 [ return extraEntries
269 , -- , readChineseLexicalDatabase
271 -- , readLoachWangStrokes
272 -- , readOutlierDecomp
274 extraEntries :: ChineseDict
281 { chineseStrokes = Just 1
282 , chineseComponents = Just ['·']
288 { chinesePinyins = ["ku4", "meng3", "shi4"]
289 , chineseEnglish = ["culicoides"]
295 { chinesePinyins = ["shou4", "guo3"]
296 , chineseEnglish = ["achene"]
302 { chinesePinyins = ["mei3", "zhou1", "ye3", "ma3"]
303 , chineseEnglish = ["mustang horse"]
309 { chinesePinyins = ["gou1", "fen3", "die2"]
310 , chineseEnglish = ["brimstone butterfly"]
316 { chinesePinyins = ["he2", "guo3"]
317 , chineseEnglish = ["drupe"]
323 { chineseStrokes = Just 14
324 , chinesePinyins = ["ling2"] -- 零
330 { chineseStrokes = Just 2
331 , chinesePinyins = ["yi1"]
337 { chineseStrokes = Just 2
338 , chinesePinyins = ["er4"]
344 { chineseStrokes = Just 2
345 , chinesePinyins = ["san1"]
351 { chineseStrokes = Just 2
352 , chinesePinyins = ["si4"]
358 { chineseStrokes = Just 2
359 , chinesePinyins = ["wu3"]
365 { chineseStrokes = Just 1
366 , chinesePinyins = ["liu4"]
372 { chineseStrokes = Just 1
373 , chinesePinyins = ["qi1"]
379 { chineseStrokes = Just 2
380 , chinesePinyins = ["ba1"]
386 { chineseStrokes = Just 1
387 , chinesePinyins = ["jiu3"]
391 , [ ( letter & ShortText.singleton
393 { chineseStrokes = Just 1
394 , chinesePinyins = []
408 readLoachWangWordOrder :: IO JSON.Array
409 readLoachWangWordOrder =
410 readJSON "data/langs/mandarin/LoachWang/loach_word_order.json" $
411 JSON.withArray "LoachWordOrder" \arr -> return arr
413 readOutlierDecomp :: IO ChineseDict
415 readJSON "data/langs/mandarin/LoachWang/outlier_decomp.json" $
416 JSON.withObject "OutlierDecomp" \kv ->
419 <$> forM (kv & JSON.toMap & Map.toList) \(k, v) ->
420 v & JSON.withArray "Decomp" \decomp ->
424 sub & JSON.withText "Atom" \atom ->
425 return $ atom & Text.unpack
428 ( k & JSON.DictKey.toShortText
429 , mempty{chineseComponents = Just $ atoms & mconcat}
432 reverseDecomp :: ChineseDict -> Map {-def-} ShortText {-refs-} [ShortText]
433 reverseDecomp (ChineseDict dict) =
435 | (key, entry) <- dict & Map.toList
436 , sub <- entry & chineseDecomp key <&> ShortText.singleton
438 & Map.fromListWith (<>)
440 type Benefit = Probability
442 type Weight = Probability
443 type DictKey = ShortText
444 data Weights = Weights
445 { weightsMap :: Map DictKey (Benefit, Cost)
450 -- Zhao, K., zhao, D., Yang, J., Ma, W., & Yu, B. (2019).
451 -- Efficient Learning Strategy of Chinese Characters Based on
452 -- Usage Frequency and Structural Relationship.
453 -- 2019 IEEE 4th International Conference on Big Data Analytics (ICBDA).
454 -- doi:10.1109/icbda.2019.8713245
455 -- https://sci-hub.se/10.1109/icbda.2019.8713245
456 dictToWeights :: ChineseDict -> Weights
457 dictToWeights cn@(ChineseDict dict) = ST.runST do
458 -- ExplanationNote: memo tables, to avoid recomputing benefits and costs
459 keyToBenefitST :: ST.STRef st (Map DictKey Benefit) <- ST.newSTRef Map.empty
460 keyToCostST :: ST.STRef st (Map DictKey Cost) <- ST.newSTRef Map.empty
461 let refsMap :: Map ShortText [ShortText] = reverseDecomp cn
463 keyToBenefitLoop :: DictKey -> ST.ST st Benefit
464 keyToBenefitLoop key = do
465 -- traceShowM ("keyToBenefitLoop"::String, key)
466 keyToBenefitMap <- keyToBenefitST & ST.readSTRef
470 Just bene -> return bene
472 let refs :: [ShortText] =
478 -- traceShowM ("keyToBenefitLoop"::String, key, refs)
482 & maybe proba0 (chineseFrequency >>> fromMaybe proba0)
483 refsBC <- forM refs \ref -> do
484 bene <- keyToBenefitLoop ref
485 cost <- keyToCostLoop ref
487 cost <- keyToCostLoop key
494 [ runProbability b / toRational c
498 ST.modifySTRef keyToBenefitST $ Map.insert key bene
500 keyToCostLoop :: DictKey -> ST.ST st Cost
501 keyToCostLoop key = do
502 -- traceShowM ("keyToCostLoop"::String, key)
503 keyToCostMap <- keyToCostST & ST.readSTRef
507 Just cost -> return cost
509 let entry = dict & Map.lookup key & fromMaybe mempty
512 if ShortText.length key == 1
517 & List.delete (ShortText.unpack key & List.head)
518 -- & (`Set.difference` (ShortText.unpack key & Set.fromList))
525 else -- & fromMaybe (error $ "dictToWeights: missing chineseStrokes for: " <> ShortText.unpack key)
528 -- traceShowM ("composite"::String, show key, show <$> Set.toList deps
529 -- , deps & (`Set.difference` (ShortText.unpack key & Set.fromList))
532 & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton))
536 -- traceShowM ("word"::String, key)
539 & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton))
541 ST.modifySTRef keyToCostST $ Map.insert key cost
544 dict & Map.traverseWithKey \key _entry -> do
545 -- traceShowM ("dictToWeights"::String, key)
546 bene <- keyToBenefitLoop key
547 cost <- keyToCostLoop key
554 newtype ChineseOrder = ChineseOrder (Map (Down Weight) Weights)
555 unChineseOrder (ChineseOrder x) = x
556 dictOrder :: Weights -> ChineseOrder
558 [ ( probability (runProbability b / toRational c) & arithError & Down
560 { weightsMap = Map.singleton k v
563 | (k, v@(b, c)) <- w & weightsMap & Map.toList
565 & Map.fromListWith (\x y -> Weights{weightsMap = Map.unionWith (error "dictOrder") (weightsMap x) (weightsMap y)})
568 dictOrderIO = readChineseDict <&> (dictToWeights >>> dictOrder)
570 -- order100 :: IO [(DictKey, Maybe ChineseDictEntries)]
572 dict@(ChineseDict d) <- readChineseDict
573 let ChineseOrder o = dict & dictToWeights & dictOrder
578 & foldMap (weightsMap >>> Map.keys)
579 <&> \k -> (k, d & Map.lookup k <&> \r -> (chinesePinyins r, chineseEnglish r))
581 dictOrderBySameWeight (ChineseOrder o) =
582 [ ( Map.size (weightsMap v)
585 | (k, v) <- o & Map.toList
587 & Map.fromListWith (<>)
590 data UnfoldedGraph node lbl edge = UnfoldedGraph
591 { unfoldedVertexToEdges :: !(Array.Array Vertex (lbl, Map edge IntSet))
592 , unfoldedNodeToVertex :: !(Map node Vertex)
594 deriving (Show, Generic)
596 forall node edge nodeLabel.
598 (node -> (nodeLabel, Map edge (Set node))) ->
600 UnfoldedGraph node nodeLabel edge
601 unfoldGMany gen roots = ST.runST do
602 nodeToVertexST :: ST.STRef st (Map node Vertex) <- ST.newSTRef Map.empty
603 vertexToEdgesST :: ST.STRef st [(Vertex, (nodeLabel, Map edge IntSet))] <- ST.newSTRef []
604 lastVertexST <- ST.newSTRef firstVertex
606 v <- ST.readSTRef lastVertexST
607 ST.writeSTRef lastVertexST (v + 1)
610 nodeToVertexDepthFirst :: node -> ST.ST st Vertex
611 nodeToVertexDepthFirst src =
616 -- DescriptionNote: `src` has already been seen,
617 -- just return its unique `Vertex`.
619 -- DescriptionNote: `src` has never been seen yet,
620 -- allocate a `newVertex` for it
623 ST.modifySTRef nodeToVertexST $ Map.insert src dst
624 let (nodeLabel, edgeToDsts) = gen src
625 edgeToDstsV :: Map edge IntSet <-
626 edgeToDsts & Map.traverseWithKey \_edge dsts ->
627 forM (dsts & Set.toList) nodeToVertexDepthFirst
629 let res = (dst, (nodeLabel, edgeToDstsV))
630 ST.modifySTRef vertexToEdgesST (res :)
632 forM_ roots nodeToVertexDepthFirst
633 nodeToVertex <- ST.readSTRef nodeToVertexST
634 vertexToEdges <- ST.readSTRef vertexToEdgesST
635 lastId <- ST.readSTRef lastVertexST
638 { unfoldedVertexToEdges = vertexToEdges & Array.array (firstVertex, lastId - 1)
639 , unfoldedNodeToVertex = nodeToVertex
642 firstVertex :: Vertex = 0
647 -- | d & reverseDecomp & Map.toList
648 -- ] & Map.fromListWith (<>)
656 deriving (Eq, Ord, Show, Enum)
658 numberedPinyinToDiacriticPiniyn :: ShortText -> ShortText
659 numberedPinyinToDiacriticPiniyn numPin =
669 addUmlaut ('u' : ':' : cs) = 'ü' : addUmlaut cs
670 addUmlaut ('v' : cs) = 'ü' : addUmlaut cs
671 addUmlaut ('U' : ':' : cs) = 'Ü' : addUmlaut cs
672 addUmlaut ('V' : cs) = 'Ü' : addUmlaut cs
673 addUmlaut (c : cs) = c : addUmlaut cs
675 -- CorrectnessNote: a, e or the o in ou always get the marker
676 setDiacritic (c@'a' : cs) = convert tone c : cs
677 setDiacritic (c@'e' : cs) = convert tone c : cs
678 setDiacritic (c@'o' : 'u' : cs) = convert tone c : 'u' : cs
679 setDiacritic (c : cs)
680 -- CorrectnessNote: if no a, e, or ou found, the tone mark goes on the last vowel
681 | List.elem c vowels && all (`List.notElem` vowels) cs = convert tone c : cs
682 | otherwise = c : setDiacritic cs
684 vowels = lowerVowels <> (Char.toUpper <$> lowerVowels)
685 lowerVowels = ['a', 'e', 'i', 'o', 'u', 'ü'] & list
687 (pinRoot, pinTone) ::
688 (ShortText, Maybe ChineseTone) =
689 numPin & ShortText.spanEnd Char.isDigit & second \case
691 ds -> ds & ShortText.unpack & read & (\x -> x - (1 :: Int)) & toEnum & Just
692 convert :: ChineseTone -> Char -> Char
693 convert t c = case t of
763 deriving (Eq, Ord, Enum, Show)
765 -- instance CSV.FromRecord ChineseDictEntries
766 -- instance CSV.ToRecord ChineseDictEntries
767 -- instance CSV.FromNamedRecord ChineseDictEntries
768 -- instance CSV.ToNamedRecord ChineseDictEntries
769 -- instance CSV.DefaultOrdered ChineseDictEntries
771 feed :: (ByteString -> r) -> Sys.Handle -> Sys.IO r
773 Sys.hIsEOF csvFile >>= \case
774 True -> return $ k ""
775 False -> k <$> ByteString.hGetSome csvFile 4096
777 readHSK :: HskLevel -> IO ChineseDict
778 readHSK chineseHSK = do
779 withDataFile ("data/langs/mandarin/HSK/hsk" <> show hskIndex Sys.<.> "csv") \fileHandle -> do
780 loop fileHandle mempty $
781 CSV.Incremental.decodeWithP parser decodeOpts CSV.NoHeader
783 hskIndex = chineseHSK & fromEnum & (+ 1)
784 decodeOpts = CSV.defaultDecodeOptions
785 parser :: CSV.Record -> CSV.Parser (ShortText, ChineseDictEntries)
789 chinesePinyins <- v .! 1 <&> ShortText.split (== ' ')
790 chineseEnglish <- v .! 2 <&> pure
791 let chineseFrequency = Nothing
792 let chineseComponents = Nothing
793 let chineseStrokes = Nothing
794 pure (chinese, ChineseDictEntries{..})
797 either (\x -> Sys.print x >> return mempty) \(chinese, e) ->
798 return $ ChineseDict $ Map.singleton chinese e
799 loop fileHandle !acc = \case
800 CSV.Incremental.Fail _ errMsg -> do
803 CSV.Incremental.Many rs k -> do
804 ok <- rs & foldMapM check
805 t <- feed k fileHandle
806 loop fileHandle (acc <> ok) t
807 CSV.Incremental.Done rs -> do
808 ok <- rs & foldMapM check
811 -- | Sun, C.C., Hendrix, P., Ma, J. et al.
812 -- Chinese lexical database (CLD).
813 -- Behav Res 50, 2606–2629 (2018).
814 -- https://doi.org/10.3758/s13428-018-1038-3
815 readChineseLexicalDatabase :: IO ChineseDict
816 readChineseLexicalDatabase = do
817 withDataFile "data/langs/mandarin/chineselexicaldatabase2.1.csv" \fileHandle -> do
818 loop fileHandle mempty $
819 CSV.Incremental.decodeWithP parser decodeOpts CSV.HasHeader
821 decodeOpts = CSV.defaultDecodeOptions
822 parser :: CSV.Record -> CSV.Parser (DictKey, ChineseDictEntries)
824 | length v == 269 = do
826 keyLength :: Int <- v .! 5
832 [pinyin1, pinyin2, pinyin3, pinyin4]
833 & List.take keyLength
834 freqPerMillion :: Double <- v .! 39
835 let chineseFrequency =
840 & fromMaybe (error "readChineseLexicalDatabase: Frequency")
842 pure (key, mempty{chineseFrequency, chinesePinyins})
843 | otherwise = error $ "readChineseLexicalDatabase: wrong number of columns: " <> show v
845 either (\x -> Sys.print x >> return mempty) \(key, val) ->
846 return $ ChineseDict $ Map.singleton key val
847 loop fileHandle !acc = \case
848 CSV.Incremental.Fail _ errMsg -> do
851 CSV.Incremental.Many rs k -> do
852 ok <- rs & foldMapM check
853 t <- feed k fileHandle
854 loop fileHandle (acc <> ok) t
855 CSV.Incremental.Done rs -> do
856 ok <- rs & foldMapM check
859 readCEDICT :: IO ChineseDict
861 withDataFile "data/langs/mandarin/CEDICT/cedict_ts.u8" \cedictHandle -> do
863 isEOF <- cedictHandle & Sys.hIsEOF
867 lineBS <- ByteString.hGetLine cedictHandle
868 let lineST = lineBS & ShortText.fromByteString & fromMaybe (error "invalid UTF-8")
869 let begin = lineST & ShortText.take 1
870 when (begin == "#") do
874 isEOF <- cedictHandle & Sys.hIsEOF
878 line <- ByteString.hGetLine cedictHandle
879 let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
880 -- DescriptionNote: each line is formatted as: #(.+) (.+) \[(.+)] /(.*)/#'
881 let skipChar c = void $ MT.state $ ByteString.span (== fromIntegral (fromEnum c))
884 ByteString.stripPrefix p
885 >>> fromMaybe (error $ "skipPrefix fail to match: " <> show p)
886 let breakOnChar c = ByteString.break (== fromIntegral (fromEnum c))
887 let breakOnSpace = breakOnChar ' '
890 ByteString.stripSuffix p
893 "skipSuffix: mismatch: "
896 <> ShortText.unpack (decodeUtf8 line)
898 <> show (ShortText.unpack (decodeUtf8 line))
900 let (dict, leftover) = (`MT.runState` line) do
901 _chineseTrad <- MT.state $ breakOnSpace >>> first decodeUtf8
903 chineseSimpl <- MT.state $ breakOnSpace >>> first decodeUtf8
912 & ByteString.split (fromIntegral (fromEnum ' '))
913 -- CorrectnessWarning: convert pinyin to lowercase, and remove dashes
914 <&> (decodeUtf8 >>> ShortText.unpack >>> (<&> Char.toLower) >>> ShortText.pack)
915 & List.filter (/= "-")
918 -- CorrectnessNote: some lines do not end with \r
919 -- hence make it optional.
920 MT.modify' \s -> s & ByteString.stripSuffix "\r" & fromMaybe s
922 chineseEnglish <- MT.gets \s -> s & ByteString.split (fromIntegral (fromEnum '/')) <&> decodeUtf8
924 let chinese = chineseSimpl
927 Map.singleton chinese $
932 if not (ByteString.null leftover)
933 then error $ "parserLine: leftover: " <> show leftover
934 else loop (acc <> dict)
937 lookupPinyins :: ChineseDict -> ShortText -> [ShortText]
938 lookupPinyins (ChineseDict dict) word =
940 & (`Map.lookup` dict)
941 & fromMaybe (error $ "lookupPinyins: no entry for: {" <> wordString <> "}")
943 & (\ps -> if null ps then error $ "lookupPinyins: empty entry for: " <> wordString else ps)
945 wordString = word & ShortText.unpack
947 pronunciation :: ChineseDict -> [Pron.Lexeme] -> [[Either Char Pron.Pron]]
948 pronunciation dict inp =
949 inp & Pron.lexemesChars & Text.pack & Text.words <&> wordToPron
951 wordToPron :: Text -> [Either Char Pron.Pron]
952 wordToPron wordText =
953 [ pronun chars charCateg charBlock
954 | (charCateg, uniBlockToChars) <-
957 & List.map (\c -> (Char.generalCategory c, (Char.unicodeBlock c, c)))
958 & Char.consecutiveGroups
959 <&> (<&> Char.consecutiveGroups)
960 , (charBlock, chars) <- uniBlockToChars
964 pronun :: [Char] -> Char.GeneralCategory -> Maybe Char.UnicodeBlock -> [Either Char Pron.Pron]
965 pronun chars Char.DecimalNumber _ =
968 { pronInput = [Pron.LexemeChar char]
973 { Pron.unPronunciations =
974 [ Pron.RuleLexemes [Pron.LexemeChar char] :=
976 { Pron.pronunciationIPABroad = []
977 , Pron.pronunciationText = txt char
988 & ShortText.singleton
992 & numberedPinyinToDiacriticPiniyn
996 & Text.intercalate " "
997 pronun chars Char.OpenPunctuation _ =
1001 pronun chars Char.ClosePunctuation _ =
1005 pronun chars Char.OtherPunctuation _ =
1009 pronun chars _ (Just Char.UnicodeBlockCJK{}) =
1012 { pronInput = Pron.LexemeChar <$> chars
1017 { Pron.unPronunciations =
1018 [ Pron.RuleLexemes (Pron.LexemeChar <$> chars) :=
1020 { Pron.pronunciationIPABroad = []
1021 , Pron.pronunciationText = txt
1032 & lookupPinyins dict
1036 & numberedPinyinToDiacriticPiniyn
1040 & Text.intercalate " "
1041 -- if List.length wordPinyins == Text.length wordText = wordPinyins
1043 -- wordPinyins = chars
1045 -- & lookupPinyins dict
1046 pronun chars categ block = errorShow ("pronunciation" :: Text, chars, categ, block)
1048 orderHTML dict@(ChineseDict d) = do
1049 dataPath <- Self.getDataDir <&> File.normalise
1050 return $ Blaze.renderMarkupBuilder do
1053 HTML.title $ ("Chinese Order" :: Text) & HTML.toHtml
1055 ( [ "styles/Paper.css"
1056 , "styles/Rosetta/Common.css"
1057 , "styles/Rosetta/Reading.css"
1063 ! HA.rel "stylesheet"
1064 ! HA.type_ "text/css"
1065 ! HA.href (dataPath </> cssFile & HTML.toValue)
1066 -- HTML.styleCSS $ pagesDifficulties & difficultyCSS
1068 let ChineseOrder o = dict & dictToWeights & dictOrder
1074 & foldMap (weightsMap >>> Map.keys)
1079 [ "break-inside" := "avoid"
1080 , "list-style-position" := "inside"
1083 HTML.span $ k & HTML.toHtml
1084 forM_ (d & Map.lookup k) \v -> do
1085 HTML.ul ! HTML.styles [] $ do
1087 HTML.span $ v & chinesePinyins <&> ShortText.toText & Text.unwords & HTML.toHtml
1089 HTML.span $ v & chineseDecomp k & Text.pack & HTML.toHtml
1091 HTML.span $ v & chineseEnglish <&> ShortText.toText & Text.intercalate "; " & HTML.toHtml
1093 orderDOT :: ChineseDict -> IO BS.Builder
1094 orderDOT dict@(ChineseDict d) = do
1096 -- dotComments [(BS.lazyByteString $ LazyText.encodeUtf8 $ pShow $ rangeToMstToGroupToClusters & Map.map Map.keys)]
1097 -- pTraceShow ("num of nodes", Map.size nodeToBranch, "num of branches", Map.size msf) $
1098 DOT.dotLine "digraph g"
1102 dotLine "splines=\"ortho\""
1103 indexFrom1M_ (rangeToMstToGroupToClusters & Map.toList) \(srcR, mstToGroupToClusters) srcRI -> do
1104 let srcRB = "r" <> BS.intDec srcRI
1105 dotLine $ "subgraph cluster_" <> srcRB
1107 dotComments ["Create a node for the range " <> srcRB]
1111 , ("label", builderQuotedString (showHuman srcR))
1113 , ("style", "filled")
1114 , ("fillcolor", "gray")
1116 dotLine "color=gray"
1119 dotComments ["Create the cluster nodes within the range " <> srcRB]
1120 forM_ (mstToGroupToClusters & Map.toList) \(mstI, groupToClusters) -> do
1121 forM_ (groupToClusters & Map.toList) \(srcGroup, srcClusters) -> do
1128 , builderQuotedString $
1129 (srcClusters & toList <&> showHuman & List.unlines)
1131 <> printf "%03d" mstI
1134 -- <> {-maybe ""-} (("\n" <>) . showSimilarity) minSimil
1136 , ("style", "filled")
1137 , -- , minSimil & {-maybe ("", "")-} (\s -> ("fillcolor", 1 + ((floor (runProbability s * 10)) `mod` 10) & BS.intDec))
1138 ("colorscheme", "ylorrd9")
1141 dotComments ["Horizontally align the cluster nodes within the same range"]
1144 | (mstI, groupToClusters) <- mstToGroupToClusters & Map.toList
1145 , (group, _clusters) <- groupToClusters & Map.toList
1149 c@(firstMst, firstGroup) : cs -> do
1151 [srcRB, srcRB <> "t" <> BS.intDec firstMst <> "c" <> BS.intDec firstGroup]
1152 [ ("style", "invis")
1154 cs & (`foldM_` c) \(srcMst, srcGroup) dst@(dstMst, dstGroup) -> do
1156 [(srcRI, srcMst, srcGroup), (srcRI, dstMst, dstGroup)]
1158 , ("style", "invis")
1161 indexFrom1M_ sortedMSF \mst mstI -> do
1162 dotComments ["Create the edges of the MST " <> BS.intDec mstI]
1163 -- pTraceShowM (mstI, List.length (Tree.flatten mst))
1164 let loop (Tree.Node MSTNode{mstNodeRangeCluster = src} dsts) = do
1165 forM_ dsts \dstNode@(Tree.Node MSTNode{mstNodeRangeCluster = dst, mstNodeSimilarity = simil} _) -> do
1166 -- let similB = BS.stringUtf8 $ showFFloat (Just 2) (simil & runProbability & fromRational @Double) ""
1167 let indexRangeCluster (r, c) =
1168 let clusterToGroup :: cluster :-> ClusterGroup =
1171 | (group, clusters) <-
1172 rangeToMstToGroupToClusters
1174 & fromMaybe Map.empty
1176 & fromMaybe Map.empty
1178 , cluster <- clusters & Set.toList
1180 in ( 1 + Map.findIndex r rangeToMstToGroupToClusters
1182 , Map.lookup c clusterToGroup
1183 & fromMaybe (error (LazyText.unpack (pShow ("r", r, "c", c {-, "clusterToGroup", clusterToGroup, "rangeToMstToGroupToClusters", rangeToMstToGroupToClusters-}))))
1186 [ indexRangeCluster src
1187 , indexRangeCluster dst
1189 [ ("constraint", "false")
1190 , ("color", (floor (runProbability simil * 10)) `mod` 10 & BS.intDec)
1191 , ("colorscheme", "ylorrd9")
1192 , -- , ("label", similB)
1193 ("fontcolor", "blue")
1195 , ("arrowhead", "dot")
1196 , ("arrowtail", "dot")
1200 dotRanges rangeToMstToGroupToClusters