1 {-# LANGUAGE OverloadedLists #-}
2 {-# LANGUAGE OverloadedStrings #-}
4 module Language.Chinese where
6 import Control.Applicative (Alternative (..))
7 import Control.Monad.ST qualified as ST
8 import Control.Monad.Trans.State qualified as MT
9 import Data.Aeson qualified as JSON
10 import Data.Aeson.Key qualified as JSON.DictKey
11 import Data.Aeson.KeyMap qualified as JSON
12 import Data.ByteString (ByteString)
13 import Data.ByteString qualified as ByteString
14 import Data.ByteString.Builder qualified as BS
15 import Data.ByteString.Char8 qualified as ByteString.Char8
16 import Data.Char qualified as Char
17 import Data.Csv ((.!))
18 import Data.Csv qualified as CSV
19 import Data.Csv.Incremental qualified as CSV.Incremental
20 import Data.List qualified as List
21 import Data.Map.Strict qualified as Map
22 import Data.STRef qualified as ST
23 import Data.Set qualified as Set
24 import Data.Text qualified as Text
25 import Data.Text.Short qualified as ShortText
26 import Graph.DOT qualified as DOT
27 import Numeric.Decimal.BoundedArithmetic (arithError, arithM)
28 import Paths_worksheets qualified as Self
29 import System.Exit qualified as Sys
30 import System.FilePath qualified as Sys
31 import System.FilePath.Posix ((</>))
32 import System.FilePath.Posix qualified as File
33 import System.IO qualified as Sys
34 import Text.Read (read, readMaybe)
35 import Worksheets.Utils.HTML (className, classes, styles, (!))
36 import Worksheets.Utils.HTML qualified as HTML
37 import Prelude (error, fromIntegral, undefined, (/))
40 import Text.Blaze.Html5 qualified as HTML
41 import Text.Blaze.Html5.Attributes qualified as HA
42 import Text.Blaze.Renderer.Utf8 qualified as Blaze
44 import Language.Pronunciation qualified as Pron
45 import Worksheets.Utils.Char qualified as Char
46 import Worksheets.Utils.JSON
47 import Worksheets.Utils.Prelude
48 import Worksheets.Utils.Probability
-- | A Chinese dictionary: maps each key (a character or word) to its
-- aggregated 'ChineseDictEntries'.
newtype ChineseDict = ChineseDict (Map ShortText ChineseDictEntries)

-- | Unwrap the underlying map of a 'ChineseDict'.
unChineseDict (ChineseDict x) = x
-- | Merge two dictionaries key by key; entries present under the same key
-- in both operands are combined via the 'Semigroup' of 'ChineseDictEntries'.
instance Semigroup ChineseDict where
  ChineseDict l <> ChineseDict r =
    ChineseDict (Map.unionWithKey combine l r)
    where
      combine _key !lEntry !rEntry = lEntry <> rEntry
-- | Aggregate the dictionary by key length: for each key length in
-- characters ('ShortText.length'), accumulate with @Map.fromListWith (+)@.
-- NOTE(review): the comprehension body is partially elided in this view —
-- presumably each key contributes a per-entry count; confirm in full source.
keyLengthToSumOfEntries :: ChineseDict -> Map Natural Natural
keyLengthToSumOfEntries (ChineseDict d) =
  [ ( k & ShortText.length & fromIntegral
    & Map.fromListWith (+)
-- | Sum of all recorded 'chineseFrequency' values across the dictionary,
-- converted to 'Double' at the end via 'fromRational'.
-- NOTE(review): the summation body is partially elided in this view;
-- only entries with a recorded frequency contribute ('maybeToList').
chineseFrequencySum :: ChineseDict -> Double
chineseFrequencySum (ChineseDict d) =
  , freq <- e & chineseFrequency & maybeToList
  & fromRational @Double
-- | 'mempty' is the empty dictionary; together with the 'Semigroup'
-- instance this merges dictionaries key by key.
instance Monoid ChineseDict where
  mempty = ChineseDict mempty
-- FIXME: use a sum type Char/Word
-- | Aggregated dictionary information for one key (a character or a word).
-- Fields are merged pairwise by the 'Semigroup' instance.
data ChineseDictEntries = ChineseDictEntries
  { chinesePinyins :: ![ShortText]
    -- ^ Numbered pinyin syllables (e.g. @"ma3"@).
  , -- TODO: , chineseIPA :: !IPA.IPA
    chineseEnglish :: ![ShortText]
    -- ^ English glosses.
  , chineseFrequency :: !(Maybe Probability)
    -- ^ Usage frequency, when known.
  , chineseComponents :: !(Maybe [Char])
    -- ^ Component characters of the decomposition, when known.
  , chineseStrokes :: !(Maybe Natural)
    -- ^ Stroke count, when known.
  deriving (Generic, Show)
-- | Component characters of a dictionary key: the recorded
-- 'chineseComponents' when present; otherwise a multi-character key
-- decomposes into its own characters, and a single character into @[]@.
chineseDecomp :: ShortText -> ChineseDictEntries -> [Char]
chineseDecomp key ChineseDictEntries{chineseComponents} =
  & fromMaybe (if 1 < ShortText.length key then ShortText.unpack key else [])
98 instance Semigroup ChineseDictEntries where
102 if null (chinesePinyins x)
103 then chinesePinyins y
105 if null (chinesePinyins y)
108 then chinesePinyins x
111 -- [ "Semigroup: chinesePinyins: mismatch: "
112 -- , show (chinesePinyins x)
114 -- , show (chinesePinyins y)
118 , chineseEnglish = chineseEnglish x <> chineseEnglish y
119 , chineseFrequency = chineseFrequency x <|> chineseFrequency y
120 , chineseComponents =
121 case chineseComponents x of
122 Nothing -> chineseComponents y
124 case chineseComponents y of
125 Nothing -> chineseComponents x
127 | xS == yS -> chineseComponents x
128 | otherwise -> error $ "Semigroup: chineseComponents: mismatch: " <> show xS <> " <> " <> show yS
130 case chineseStrokes x of
131 Nothing -> chineseStrokes y
133 case chineseStrokes y of
134 Nothing -> chineseStrokes x
136 | xS == yS -> chineseStrokes x
137 | otherwise -> error $ "Semigroup: chineseStrokes: mismatch: " <> show xS <> " <> " <> show yS
139 instance Monoid ChineseDictEntries where
142 { chinesePinyins = mempty
143 , chineseEnglish = mempty
144 , chineseFrequency = Nothing
145 , chineseComponents = Nothing
146 , chineseStrokes = Nothing
149 readLoachWangFreq :: IO ChineseDict
150 readLoachWangFreq = do
151 readJSON "data/langs/mandarin/LoachWang/word_freq.json" $
152 JSON.withObject "WordFreq" \kv ->
155 <$> forM (kv & JSON.toMap & Map.toList) \(k, v) ->
156 v & JSON.withScientific "Frequency" \freqSci -> do
157 let freq = freqSci & toRational & probability & arithError
159 ( k & JSON.DictKey.toShortText
160 , mempty{chineseFrequency = Just freq}
163 readLoachWangStrokes :: IO ChineseDict
164 readLoachWangStrokes =
165 readJSON ("data/langs/mandarin/LoachWang/char_strokes.json") $
166 JSON.withObject "WordFreq" \kv ->
169 <$> forM (kv & JSON.toMap & Map.toList) \(k, v) -> do
170 strokes <- JSON.parseJSON v
172 ( k & JSON.DictKey.toShortText
173 , mempty{chineseStrokes = Just strokes}
-- | The circled-number characters ① through ⑳ (U+2460..U+2473).
-- They occur as pseudo-components in decomposition data and are treated
-- specially when computing learning benefits and costs.
circledNumbers :: Set ShortText
circledNumbers =
  Set.fromList
    [ "①", "②", "③", "④", "⑤", "⑥", "⑦", "⑧", "⑨", "⑩"
    , "⑪", "⑫", "⑬", "⑭", "⑮", "⑯", "⑰", "⑱", "⑲", "⑳"
    ]
179 readChiseStrokes :: IO ChineseDict
181 withDataFile "data/langs/mandarin/cjkvi-ids/ucs-strokes.txt" \dataHandle -> do
182 let loop accDict@(ChineseDict !acc) = do
183 isEOF <- dataHandle & Sys.hIsEOF
187 line <- ByteString.Char8.hGetLine dataHandle
188 let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
189 let splitOnChar c = ByteString.split (fromIntegral (fromEnum c))
190 case line & splitOnChar '\t' of
191 [unicodeLit, decodeUtf8 -> unicodeChar, chineseStrokesLit]
192 | unicodeLit & ByteString.isPrefixOf "U+" ->
195 acc & Map.insert unicodeChar mempty{chineseStrokes}
197 chineseStrokes :: Maybe Natural =
202 >>> readMaybe @Natural
204 -- CorrectnessWarning: conservative bias toward the greatest number of strokes.
210 readChiseDecomp :: IO ChineseDict
212 withDataFile "data/langs/mandarin/cjkvi-ids/ids.txt" $ (`loop` mempty)
214 loop dataHandle accDict@(ChineseDict !acc) = do
215 isEOF <- dataHandle & Sys.hIsEOF
219 line <- ByteString.Char8.hGetLine dataHandle
220 let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
221 let splitOnChar c = ByteString.split (fromIntegral (fromEnum c))
222 case line & splitOnChar '\t' of
223 unicodeLit : (decodeUtf8 -> unicodeChar) : decomps
224 | unicodeLit & ByteString.isPrefixOf "U+" ->
226 decompWithCodes :: [([Char], Set Char)] =
227 decomps <&> \decomp ->
228 case decomp & ByteString.stripSuffix "]" of
230 ( decomp & decodeUtf8 & ShortText.unpack
233 Just decompWithCode ->
236 -- ExplanationNote: remove the final '['
238 , decompCodes & ShortText.unpack & fromList
241 (decompChars, decompCodes) =
244 & ShortText.breakEnd (== '[')
247 -- FIXME: CorrectnessWarning: maybe select using the Codes
248 -- instead of just taking the first.
251 -- ExplanationNote: remove IDCs (Ideographic Description Characters)
252 -- namely "⿰" (U+2FF0) to "⿻" (U+2FFB).
253 & List.filter (\c -> not ('\x2FF0' <= c && c <= '\x2FFB'))
254 & List.filter (\c -> ShortText.singleton c /= unicodeChar)
259 acc & Map.insert unicodeChar mempty{chineseComponents}
260 _ -> loop dataHandle accDict
262 readChineseDict :: IO ChineseDict
266 [ return extraEntries
270 , -- , readChineseLexicalDatabase
272 -- , readLoachWangStrokes
273 -- , readOutlierDecomp
275 extraEntries :: ChineseDict
282 { chineseStrokes = Just 1
283 , chineseComponents = Just ['·']
289 { chinesePinyins = ["ku4", "meng3", "shi4"]
290 , chineseEnglish = ["culicoides"]
296 { chinesePinyins = ["shou4", "guo3"]
297 , chineseEnglish = ["achene"]
303 { chinesePinyins = ["mei3", "zhou1", "ye3", "ma3"]
304 , chineseEnglish = ["mustang horse"]
310 { chinesePinyins = ["gou1", "fen3", "die2"]
311 , chineseEnglish = ["brimstone butterfly"]
317 { chinesePinyins = ["he2", "guo3"]
318 , chineseEnglish = ["drupe"]
324 { chineseStrokes = Just 14
325 , chinesePinyins = ["ling2"] -- 零
331 { chineseStrokes = Just 2
332 , chinesePinyins = ["yi1"]
338 { chineseStrokes = Just 2
339 , chinesePinyins = ["er4"]
345 { chineseStrokes = Just 2
346 , chinesePinyins = ["san1"]
352 { chineseStrokes = Just 2
353 , chinesePinyins = ["si4"]
359 { chineseStrokes = Just 2
360 , chinesePinyins = ["wu3"]
366 { chineseStrokes = Just 1
367 , chinesePinyins = ["liu4"]
373 { chineseStrokes = Just 1
374 , chinesePinyins = ["qi1"]
380 { chineseStrokes = Just 2
381 , chinesePinyins = ["ba1"]
387 { chineseStrokes = Just 1
388 , chinesePinyins = ["jiu3"]
392 , [ ( letter & ShortText.singleton
394 { chineseStrokes = Just 1
395 , chinesePinyins = []
-- | Load the Loach word-order array from the bundled JSON data file.
readLoachWangWordOrder :: IO JSON.Array
readLoachWangWordOrder =
  readJSON "data/langs/mandarin/LoachWang/loach_word_order.json" $
    JSON.withArray "LoachWordOrder" pure
414 readOutlierDecomp :: IO ChineseDict
416 readJSON "data/langs/mandarin/LoachWang/outlier_decomp.json" $
417 JSON.withObject "OutlierDecomp" \kv ->
420 <$> forM (kv & JSON.toMap & Map.toList) \(k, v) ->
421 v & JSON.withArray "Decomp" \decomp ->
425 sub & JSON.withText "Atom" \atom ->
426 return $ atom & Text.unpack
429 ( k & JSON.DictKey.toShortText
430 , mempty{chineseComponents = Just $ atoms & mconcat}
-- | Invert the decomposition relation: map each component character
-- (@def@, as a singleton 'ShortText') to the keys (@refs@) whose
-- decomposition mentions it.
-- NOTE(review): the pair-building line of the comprehension is elided in
-- this view — presumably @(sub, [key])@; confirm in full source.
reverseDecomp :: ChineseDict -> Map {-def-} ShortText {-refs-} [ShortText]
reverseDecomp (ChineseDict dict) =
  | (key, entry) <- dict & Map.toList
  , sub <- entry & chineseDecomp key <&> ShortText.singleton
  & Map.fromListWith (<>)
-- | Learning benefit of a key (a probability mass).
type Benefit = Probability

-- | Ordering weight of a key — benefit per unit cost; see 'dictOrder'.
type Weight = Probability

-- | A dictionary key: a single character or a word.
type DictKey = ShortText

-- | Per-key benefit and cost, used to compute the learning order.
data Weights = Weights
  { weightsMap :: Map DictKey (Benefit, Cost)
451 -- Zhao, K., zhao, D., Yang, J., Ma, W., & Yu, B. (2019).
452 -- Efficient Learning Strategy of Chinese Characters Based on
453 -- Usage Frequency and Structural Relationship.
454 -- 2019 IEEE 4th International Conference on Big Data Analytics (ICBDA).
455 -- doi:10.1109/icbda.2019.8713245
456 -- https://sci-hub.se/10.1109/icbda.2019.8713245
457 dictToWeights :: ChineseDict -> Weights
458 dictToWeights cn@(ChineseDict dict) = ST.runST do
459 -- Explanation: memo tables, to avoid recomputing benefits and costs
460 keyToBenefitST :: ST.STRef st (Map DictKey Benefit) <- ST.newSTRef Map.empty
461 keyToCostST :: ST.STRef st (Map DictKey Cost) <- ST.newSTRef Map.empty
462 let refsMap :: Map ShortText [ShortText] = reverseDecomp cn
464 keyToBenefitLoop :: DictKey -> ST.ST st Benefit
465 keyToBenefitLoop key = do
466 -- traceShowM ("keyToBenefitLoop"::String, key)
467 keyToBenefitMap <- keyToBenefitST & ST.readSTRef
468 if Set.member key circledNumbers
470 -- Explanation: circledNumbers induce a cost but zero benefit
476 Just bene -> return bene
478 let refs :: [ShortText] =
484 -- traceShowM ("keyToBenefitLoop"::String, key, refs)
488 & maybe proba0 (chineseFrequency >>> fromMaybe proba0)
489 refsBC <- forM refs \ref -> do
490 bene <- keyToBenefitLoop ref
491 cost <- keyToCostLoop ref
493 cost <- keyToCostLoop key
500 [ runProbability b / toRational c
504 ST.modifySTRef keyToBenefitST $ Map.insert key bene
506 keyToCostLoop :: DictKey -> ST.ST st Cost
507 keyToCostLoop key = do
508 -- traceShowM ("keyToCostLoop"::String, key)
509 keyToCostMap <- keyToCostST & ST.readSTRef
510 if Set.member key circledNumbers
512 -- Explanation: circledNumbers induce a cost given by their number
513 return $ dict & Map.lookup key & maybe 1 (chineseStrokes >>> fromMaybe 1)
518 Just cost -> return cost
520 let entry = dict & Map.lookup key & fromMaybe mempty
523 if ShortText.length key == 1
528 & List.delete (ShortText.unpack key & List.head)
529 -- & (`Set.difference` (ShortText.unpack key & Set.fromList))
536 else -- & fromMaybe (error $ "dictToWeights: missing chineseStrokes for: " <> ShortText.unpack key)
539 -- traceShowM ("composite"::String, show key, show <$> Set.toList deps
540 -- , deps & (`Set.difference` (ShortText.unpack key & Set.fromList))
543 & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton))
547 -- traceShowM ("word"::String, key)
550 & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton))
552 ST.modifySTRef keyToCostST $ Map.insert key cost
555 dict & Map.traverseWithKey \key _entry -> do
556 -- traceShowM ("dictToWeights"::String, key)
557 bene <- keyToBenefitLoop key
558 cost <- keyToCostLoop key
-- | Dictionary keys grouped by their learning 'Weight', most valuable
-- first (the map is keyed by 'Down', i.e. decreasing weight).
newtype ChineseOrder = ChineseOrder (Map (Down Weight) Weights)

-- | Unwrap the underlying weight-ordered map of a 'ChineseOrder'.
-- (Fix: add the missing top-level type signature; every other top-level
-- binding in this module carries one, and -Wall warns otherwise.)
unChineseOrder :: ChineseOrder -> Map (Down Weight) Weights
unChineseOrder (ChineseOrder x) = x
567 dictOrder :: Weights -> ChineseOrder
569 [ ( probability (runProbability b / toRational c) & arithError & Down
571 { weightsMap = Map.singleton k v
574 | (k, v@(b, c)) <- w & weightsMap & Map.toList
576 & Map.fromListWith (\x y -> Weights{weightsMap = Map.unionWith (error "dictOrder") (weightsMap x) (weightsMap y)})
579 dictOrderIO = readChineseDict <&> (dictToWeights >>> dictOrder)
581 -- order100 :: IO [(DictKey, Maybe ChineseDictEntries)]
583 dict@(ChineseDict d) <- readChineseDict
584 let ChineseOrder o = dict & dictToWeights & dictOrder
589 & foldMap (weightsMap >>> Map.keys)
590 <&> \k -> (k, d & Map.lookup k <&> \r -> (chinesePinyins r, chineseEnglish r))
592 dictOrderBySameWeight (ChineseOrder o) =
593 [ ( Map.size (weightsMap v)
596 | (k, v) <- o & Map.toList
598 & Map.fromListWith (<>)
601 data UnfoldedGraph node lbl edge = UnfoldedGraph
602 { unfoldedVertexToEdges :: !(Array.Array Vertex (lbl, Map edge IntSet))
603 , unfoldedNodeToVertex :: !(Map node Vertex)
605 deriving (Show, Generic)
607 forall node edge nodeLabel.
609 (node -> (nodeLabel, Map edge (Set node))) ->
611 UnfoldedGraph node nodeLabel edge
612 unfoldGMany gen roots = ST.runST do
613 nodeToVertexST :: ST.STRef st (Map node Vertex) <- ST.newSTRef Map.empty
614 vertexToEdgesST :: ST.STRef st [(Vertex, (nodeLabel, Map edge IntSet))] <- ST.newSTRef []
615 lastVertexST <- ST.newSTRef firstVertex
617 v <- ST.readSTRef lastVertexST
618 ST.writeSTRef lastVertexST (v + 1)
621 nodeToVertexDepthFirst :: node -> ST.ST st Vertex
622 nodeToVertexDepthFirst src =
627 -- DescriptionNote: `src` has already been seen,
628 -- just return its unique `Vertex`.
630 -- DescriptionNote: `src` has never been seen yet,
631 -- allocate a `newVertex` for it
634 ST.modifySTRef nodeToVertexST $ Map.insert src dst
635 let (nodeLabel, edgeToDsts) = gen src
636 edgeToDstsV :: Map edge IntSet <-
637 edgeToDsts & Map.traverseWithKey \_edge dsts ->
638 forM (dsts & Set.toList) nodeToVertexDepthFirst
640 let res = (dst, (nodeLabel, edgeToDstsV))
641 ST.modifySTRef vertexToEdgesST (res :)
643 forM_ roots nodeToVertexDepthFirst
644 nodeToVertex <- ST.readSTRef nodeToVertexST
645 vertexToEdges <- ST.readSTRef vertexToEdgesST
646 lastId <- ST.readSTRef lastVertexST
649 { unfoldedVertexToEdges = vertexToEdges & Array.array (firstVertex, lastId - 1)
650 , unfoldedNodeToVertex = nodeToVertex
653 firstVertex :: Vertex = 0
658 -- | d & reverseDecomp & Map.toList
659 -- ] & Map.fromListWith (<>)
667 deriving (Eq, Ord, Show, Enum)
669 numberedPinyinToDiacriticPiniyn :: ShortText -> ShortText
670 numberedPinyinToDiacriticPiniyn numPin =
680 addUmlaut ('u' : ':' : cs) = 'ü' : addUmlaut cs
681 addUmlaut ('v' : cs) = 'ü' : addUmlaut cs
682 addUmlaut ('U' : ':' : cs) = 'Ü' : addUmlaut cs
683 addUmlaut ('V' : cs) = 'Ü' : addUmlaut cs
684 addUmlaut (c : cs) = c : addUmlaut cs
686 -- CorrectnessNote: a, e or the o in ou always get the marker
687 setDiacritic (c@'a' : cs) = convert tone c : cs
688 setDiacritic (c@'e' : cs) = convert tone c : cs
689 setDiacritic (c@'o' : 'u' : cs) = convert tone c : 'u' : cs
690 setDiacritic (c : cs)
691 -- CorrectnessNote: if no a, e, or ou found, the tone mark goes on the last vowel
692 | List.elem c vowels && all (`List.notElem` vowels) cs = convert tone c : cs
693 | otherwise = c : setDiacritic cs
695 vowels = lowerVowels <> (Char.toUpper <$> lowerVowels)
696 lowerVowels = ['a', 'e', 'i', 'o', 'u', 'ü'] & list
698 (pinRoot, pinTone) ::
699 (ShortText, Maybe ChineseTone) =
700 numPin & ShortText.spanEnd Char.isDigit & second \case
702 ds -> ds & ShortText.unpack & read & (\x -> x - (1 :: Int)) & toEnum & Just
703 convert :: ChineseTone -> Char -> Char
704 convert t c = case t of
774 deriving (Eq, Ord, Enum, Show)
776 -- instance CSV.FromRecord ChineseDictEntries
777 -- instance CSV.ToRecord ChineseDictEntries
778 -- instance CSV.FromNamedRecord ChineseDictEntries
779 -- instance CSV.ToNamedRecord ChineseDictEntries
780 -- instance CSV.DefaultOrdered ChineseDictEntries
-- | Feed the next chunk of input to an incremental-CSV continuation @k@:
-- at EOF the empty string (which, per cassava's incremental API, signals
-- end of input), otherwise up to 4096 bytes read from the handle.
feed :: (ByteString -> r) -> Sys.Handle -> Sys.IO r
  Sys.hIsEOF csvFile >>= \case
    True -> return $ k ""
    False -> k <$> ByteString.hGetSome csvFile 4096
788 readHSK :: HskLevel -> IO ChineseDict
789 readHSK chineseHSK = do
790 withDataFile ("data/langs/mandarin/HSK/hsk" <> show hskIndex Sys.<.> "csv") \fileHandle -> do
791 loop fileHandle mempty $
792 CSV.Incremental.decodeWithP parser decodeOpts CSV.NoHeader
794 hskIndex = chineseHSK & fromEnum & (+ 1)
795 decodeOpts = CSV.defaultDecodeOptions
796 parser :: CSV.Record -> CSV.Parser (ShortText, ChineseDictEntries)
800 chinesePinyins <- v .! 1 <&> ShortText.split (== ' ')
801 chineseEnglish <- v .! 2 <&> pure
802 let chineseFrequency = Nothing
803 let chineseComponents = Nothing
804 let chineseStrokes = Nothing
805 pure (chinese, ChineseDictEntries{..})
808 either (\x -> Sys.print x >> return mempty) \(chinese, e) ->
809 return $ ChineseDict $ Map.singleton chinese e
810 loop fileHandle !acc = \case
811 CSV.Incremental.Fail _ errMsg -> do
814 CSV.Incremental.Many rs k -> do
815 ok <- rs & foldMapM check
816 t <- feed k fileHandle
817 loop fileHandle (acc <> ok) t
818 CSV.Incremental.Done rs -> do
819 ok <- rs & foldMapM check
822 -- | Sun, C.C., Hendrix, P., Ma, J. et al.
823 -- Chinese lexical database (CLD).
824 -- Behav Res 50, 2606–2629 (2018).
825 -- https://doi.org/10.3758/s13428-018-1038-3
826 readChineseLexicalDatabase :: IO ChineseDict
827 readChineseLexicalDatabase = do
828 withDataFile "data/langs/mandarin/chineselexicaldatabase2.1.csv" \fileHandle -> do
829 loop fileHandle mempty $
830 CSV.Incremental.decodeWithP parser decodeOpts CSV.HasHeader
832 decodeOpts = CSV.defaultDecodeOptions
833 parser :: CSV.Record -> CSV.Parser (DictKey, ChineseDictEntries)
835 | length v == 269 = do
837 keyLength :: Int <- v .! 5
843 [pinyin1, pinyin2, pinyin3, pinyin4]
844 & List.take keyLength
845 freqPerMillion :: Double <- v .! 39
846 let chineseFrequency =
851 & fromMaybe (error "readChineseLexicalDatabase: Frequency")
853 pure (key, mempty{chineseFrequency, chinesePinyins})
854 | otherwise = error $ "readChineseLexicalDatabase: wrong number of columns: " <> show v
856 either (\x -> Sys.print x >> return mempty) \(key, val) ->
857 return $ ChineseDict $ Map.singleton key val
858 loop fileHandle !acc = \case
859 CSV.Incremental.Fail _ errMsg -> do
862 CSV.Incremental.Many rs k -> do
863 ok <- rs & foldMapM check
864 t <- feed k fileHandle
865 loop fileHandle (acc <> ok) t
866 CSV.Incremental.Done rs -> do
867 ok <- rs & foldMapM check
870 readCEDICT :: IO ChineseDict
872 withDataFile "data/langs/mandarin/CEDICT/cedict_ts.u8" \cedictHandle -> do
874 isEOF <- cedictHandle & Sys.hIsEOF
878 lineBS <- ByteString.hGetLine cedictHandle
879 let lineST = lineBS & ShortText.fromByteString & fromMaybe (error "invalid UTF-8")
880 let begin = lineST & ShortText.take 1
881 when (begin == "#") do
885 isEOF <- cedictHandle & Sys.hIsEOF
889 line <- ByteString.hGetLine cedictHandle
890 let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
891 -- DescriptionNote: each line is formatted as: #(.+) (.+) \[(.+)] /(.*)/#'
892 let skipChar c = void $ MT.state $ ByteString.span (== fromIntegral (fromEnum c))
895 ByteString.stripPrefix p
896 >>> fromMaybe (error $ "skipPrefix fail to match: " <> show p)
897 let breakOnChar c = ByteString.break (== fromIntegral (fromEnum c))
898 let breakOnSpace = breakOnChar ' '
901 ByteString.stripSuffix p
904 "skipSuffix: mismatch: "
907 <> ShortText.unpack (decodeUtf8 line)
909 <> show (ShortText.unpack (decodeUtf8 line))
911 let (dict, leftover) = (`MT.runState` line) do
912 _chineseTrad <- MT.state $ breakOnSpace >>> first decodeUtf8
914 chineseSimpl <- MT.state $ breakOnSpace >>> first decodeUtf8
923 & ByteString.split (fromIntegral (fromEnum ' '))
924 -- CorrectnessWarning: convert pinyin to lowercase, and remove dashes
925 <&> (decodeUtf8 >>> ShortText.unpack >>> (<&> Char.toLower) >>> ShortText.pack)
926 & List.filter (/= "-")
929 -- CorrectnessNote: some lines do not end with \r
930 -- hence make it optional.
931 MT.modify' \s -> s & ByteString.stripSuffix "\r" & fromMaybe s
933 chineseEnglish <- MT.gets \s -> s & ByteString.split (fromIntegral (fromEnum '/')) <&> decodeUtf8
935 let chinese = chineseSimpl
938 Map.singleton chinese $
943 if not (ByteString.null leftover)
944 then error $ "parserLine: leftover: " <> show leftover
945 else loop (acc <> dict)
-- | Look up the pinyins recorded for a word.
-- Partial: calls 'error' when the word is absent from the dictionary or
-- its recorded pinyin list is empty.
lookupPinyins :: ChineseDict -> ShortText -> [ShortText]
lookupPinyins (ChineseDict dict) word =
  & (`Map.lookup` dict)
  & fromMaybe (error $ "lookupPinyins: no entry for: {" <> wordString <> "}")
  & (\ps -> if null ps then error $ "lookupPinyins: empty entry for: " <> wordString else ps)
  wordString = word & ShortText.unpack
958 pronunciation :: ChineseDict -> [Pron.Lexeme] -> [[Either Char Pron.Pron]]
959 pronunciation dict inp =
960 inp & Pron.lexemesChars & Text.pack & Text.words <&> wordToPron
962 wordToPron :: Text -> [Either Char Pron.Pron]
963 wordToPron wordText =
964 [ pronun chars charCateg charBlock
965 | (charCateg, uniBlockToChars) <-
968 & List.map (\c -> (Char.generalCategory c, (Char.unicodeBlock c, c)))
969 & Char.consecutiveGroups
970 <&> (<&> Char.consecutiveGroups)
971 , (charBlock, chars) <- uniBlockToChars
975 pronun :: [Char] -> Char.GeneralCategory -> Maybe Char.UnicodeBlock -> [Either Char Pron.Pron]
976 pronun chars Char.DecimalNumber _ =
979 { pronInput = [Pron.LexemeChar char]
984 { Pron.unPronunciations =
985 [ Pron.RuleLexemes [Pron.LexemeChar char] :=
987 { Pron.pronunciationIPABroad = []
988 , Pron.pronunciationText = txt char
999 & ShortText.singleton
1000 & lookupPinyins dict
1003 & numberedPinyinToDiacriticPiniyn
1007 & Text.intercalate " "
1008 pronun chars Char.OpenPunctuation _ =
1012 pronun chars Char.ClosePunctuation _ =
1016 pronun chars Char.OtherPunctuation _ =
1020 pronun chars _ (Just Char.UnicodeBlockCJK{}) =
1023 { pronInput = Pron.LexemeChar <$> chars
1028 { Pron.unPronunciations =
1029 [ Pron.RuleLexemes (Pron.LexemeChar <$> chars) :=
1031 { Pron.pronunciationIPABroad = []
1032 , Pron.pronunciationText = txt
1043 & lookupPinyins dict
1047 & numberedPinyinToDiacriticPiniyn
1051 & Text.intercalate " "
1052 -- if List.length wordPinyins == Text.length wordText = wordPinyins
1054 -- wordPinyins = chars
1056 -- & lookupPinyins dict
1057 pronun chars categ block = errorShow ("pronunciation" :: Text, chars, categ, block)
1059 orderHTML :: Maybe Int -> ChineseDict -> IO _
1060 orderHTML limit dict@(ChineseDict d) = do
1061 dataPath <- Self.getDataDir <&> File.normalise
1062 return $ Blaze.renderMarkupBuilder do
1065 HTML.title $ ("Chinese Order" <> maybe "" (\l -> " (first " <> show l <> ")") limit) & HTML.toHtml
1067 ( [ "styles/Paper.css"
1068 , "styles/Rosetta/Common.css"
1069 , "styles/Rosetta/Reading.css"
1075 ! HA.rel "stylesheet"
1076 ! HA.type_ "text/css"
1077 ! HA.href (dataPath </> cssFile & HTML.toValue)
1078 -- HTML.styleCSS $ pagesDifficulties & difficultyCSS
1086 & foldMap (weightsMap >>> Map.keys)
1087 -- Explanation: several entries can share the same 'Weight"
1088 -- hence compute ranks after merging.
1090 let keyToRank = [(key, rank) | (rank, key) <- order] & Map.fromListWith undefined
1091 let rankId rank = "entry-" <> HTML.toValue (show rank)
1096 , "text-decoration" := "none"
1098 ! HA.href ("#" <> rankId rank)
1102 HTML.div ! styles ["display" := "flex", "flex-direction" := "column"] $ do
1105 & maybe id List.take limit
1107 \(rank :: Int, k) ->
1108 forM_ (d & Map.lookup k) \v -> do
1110 ! styles ["text-align" := "justify", "break-inside" := "avoid"]
1111 ! HA.id (rankId rank)
1115 HTML.span $ k & HTML.toHtml
1116 unless (v & chinesePinyins & List.null) do
1118 HTML.span $ v & chinesePinyins <&> ShortText.toText & Text.unwords & HTML.toHtml
1120 unless (v & chineseDecomp k & List.null) do
1123 [ (component, componentRankMaybe) :: (ShortText, Maybe Int)
1124 | component <- chineseDecomp k v <&> ShortText.singleton
1125 , let componentRankMaybe = keyToRank & Map.lookup component
1129 <&> ( \(component, componentRankMaybe) -> do
1131 component & HTML.toHtml
1132 case componentRankMaybe of
1135 | Set.member component circledNumbers -> ""
1136 | otherwise -> rankLink componentRank
1138 & List.intersperse " "
1141 HTML.span ! styles ["color" := "#666"] $ do
1142 v & chineseEnglish <&> ShortText.toText & Text.intercalate "; " & HTML.toHtml
1144 orderDOT :: ChineseDict -> IO BS.Builder
1145 orderDOT dict@(ChineseDict d) = do
1147 -- dotComments [(BS.lazyByteString $ LazyText.encodeUtf8 $ pShow $ rangeToMstToGroupToClusters & Map.map Map.keys)]
1148 -- pTraceShow ("num of nodes", Map.size nodeToBranch, "num of branches", Map.size msf) $
1149 DOT.dotLine "digraph g"
1153 dotLine "splines=\"ortho\""
1154 indexFrom1M_ (rangeToMstToGroupToClusters & Map.toList) \(srcR, mstToGroupToClusters) srcRI -> do
1155 let srcRB = "r" <> BS.intDec srcRI
1156 dotLine $ "subgraph cluster_" <> srcRB
1158 dotComments ["Create a node for the range " <> srcRB]
1162 , ("label", builderQuotedString (showHuman srcR))
1164 , ("style", "filled")
1165 , ("fillcolor", "gray")
1167 dotLine "color=gray"
1170 dotComments ["Create the cluster nodes within the range " <> srcRB]
1171 forM_ (mstToGroupToClusters & Map.toList) \(mstI, groupToClusters) -> do
1172 forM_ (groupToClusters & Map.toList) \(srcGroup, srcClusters) -> do
1179 , builderQuotedString $
1180 (srcClusters & toList <&> showHuman & List.unlines)
1182 <> printf "%03d" mstI
1185 -- <> {-maybe ""-} (("\n" <>) . showSimilarity) minSimil
1187 , ("style", "filled")
1188 , -- , minSimil & {-maybe ("", "")-} (\s -> ("fillcolor", 1 + ((floor (runProbability s * 10)) `mod` 10) & BS.intDec))
1189 ("colorscheme", "ylorrd9")
1192 dotComments ["Horizontally align the cluster nodes within the same range"]
1195 | (mstI, groupToClusters) <- mstToGroupToClusters & Map.toList
1196 , (group, _clusters) <- groupToClusters & Map.toList
1200 c@(firstMst, firstGroup) : cs -> do
1202 [srcRB, srcRB <> "t" <> BS.intDec firstMst <> "c" <> BS.intDec firstGroup]
1203 [ ("style", "invis")
1205 cs & (`foldM_` c) \(srcMst, srcGroup) dst@(dstMst, dstGroup) -> do
1207 [(srcRI, srcMst, srcGroup), (srcRI, dstMst, dstGroup)]
1209 , ("style", "invis")
1212 indexFrom1M_ sortedMSF \mst mstI -> do
1213 dotComments ["Create the edges of the MST " <> BS.intDec mstI]
1214 -- pTraceShowM (mstI, List.length (Tree.flatten mst))
1215 let loop (Tree.Node MSTNode{mstNodeRangeCluster = src} dsts) = do
1216 forM_ dsts \dstNode@(Tree.Node MSTNode{mstNodeRangeCluster = dst, mstNodeSimilarity = simil} _) -> do
1217 -- let similB = BS.stringUtf8 $ showFFloat (Just 2) (simil & runProbability & fromRational @Double) ""
1218 let indexRangeCluster (r, c) =
1219 let clusterToGroup :: cluster :-> ClusterGroup =
1222 | (group, clusters) <-
1223 rangeToMstToGroupToClusters
1225 & fromMaybe Map.empty
1227 & fromMaybe Map.empty
1229 , cluster <- clusters & Set.toList
1231 in ( 1 + Map.findIndex r rangeToMstToGroupToClusters
1233 , Map.lookup c clusterToGroup
1234 & fromMaybe (error (LazyText.unpack (pShow ("r", r, "c", c {-, "clusterToGroup", clusterToGroup, "rangeToMstToGroupToClusters", rangeToMstToGroupToClusters-}))))
1237 [ indexRangeCluster src
1238 , indexRangeCluster dst
1240 [ ("constraint", "false")
1241 , ("color", (floor (runProbability simil * 10)) `mod` 10 & BS.intDec)
1242 , ("colorscheme", "ylorrd9")
1243 , -- , ("label", similB)
1244 ("fontcolor", "blue")
1246 , ("arrowhead", "dot")
1247 , ("arrowtail", "dot")
1251 dotRanges rangeToMstToGroupToClusters