]> Git — Sourcephile - julm/worksheets.git/blob - src/Language/Chinese.hs
update
[julm/worksheets.git] / src / Language / Chinese.hs
1 {-# LANGUAGE OverloadedLists #-}
2 {-# LANGUAGE OverloadedStrings #-}
3
4 module Language.Chinese where
5
6 import Control.Applicative (Alternative (..))
7 import Control.Monad.ST qualified as ST
8 import Control.Monad.Trans.State qualified as MT
9 import Data.Aeson qualified as JSON
10 import Data.Aeson.Key qualified as JSON.DictKey
11 import Data.Aeson.KeyMap qualified as JSON
12 import Data.ByteString (ByteString)
13 import Data.ByteString qualified as ByteString
14 import Data.ByteString.Builder qualified as BS
15 import Data.Char qualified as Char
16 import Data.Csv ((.!))
17 import Data.Csv qualified as CSV
18 import Data.Csv.Incremental qualified as CSV.Incremental
19 import Data.List qualified as List
20 import Data.Map.Strict qualified as Map
21 import Data.STRef qualified as ST
22 import Data.Set qualified as Set
23 import Data.Text qualified as Text
24 import Data.Text.Short qualified as ShortText
25 import Graph.DOT qualified as DOT
26 import Numeric.Decimal.BoundedArithmetic (arithError, arithM)
27 import Paths_worksheets qualified as Self
28 import System.Exit qualified as Sys
29 import System.FilePath qualified as Sys
30 import System.FilePath.Posix ((</>))
31 import System.FilePath.Posix qualified as File
32 import System.IO qualified as Sys
33 import Text.Read (read, readMaybe)
34 import Worksheets.Utils.HTML (className, classes, styles, (!))
35 import Worksheets.Utils.HTML qualified as HTML
36 import Prelude (error, fromIntegral, undefined, (/))
37
38 -- import Text.Blaze
39 import Text.Blaze.Html5 qualified as HTML
40 import Text.Blaze.Html5.Attributes qualified as HA
41 import Text.Blaze.Renderer.Utf8 qualified as Blaze
42
43 import Language.Pronunciation qualified as Pron
44 import Worksheets.Utils.Char qualified as Char
45 import Worksheets.Utils.JSON
46 import Worksheets.Utils.Prelude
47 import Worksheets.Utils.Probability
48
49 newtype ChineseDict = ChineseDict (Map ShortText ChineseDictEntries)
50 deriving (Show)
51 unChineseDict (ChineseDict x) = x
52 instance Semigroup ChineseDict where
53 ChineseDict x <> ChineseDict y =
54 ChineseDict (Map.unionWithKey merge x y)
55 where
56 merge _k !xV !yV = xV <> yV
57
58 keyLengthToSumOfEntries :: ChineseDict -> Map Natural Natural
59 keyLengthToSumOfEntries (ChineseDict d) =
60 [ ( k & ShortText.length & fromIntegral
61 , 1
62 )
63 | k <- d & Map.keys
64 ]
65 & Map.fromListWith (+)
66
67 chineseFrequencySum :: ChineseDict -> Double
68 chineseFrequencySum (ChineseDict d) =
69 [ pure freq
70 | e <- d & Map.elems
71 , freq <- e & chineseFrequency & maybeToList
72 ]
73 & sum
74 & arithError
75 & runProbability
76 & fromRational @Double
77
78 instance Monoid ChineseDict where
79 mempty = ChineseDict mempty
80
81 -- FIXME: use a sum type Char/Word
82 data ChineseDictEntries = ChineseDictEntries
83 { chinesePinyins :: ![ShortText]
84 , -- TODO: , chineseIPA :: !IPA.IPA
85 chineseEnglish :: ![ShortText]
86 , chineseFrequency :: !(Maybe Probability)
87 , chineseComponents :: !(Maybe [Char])
88 , chineseStrokes :: !(Maybe Natural)
89 -- ^ FIXME: use Natural
90 }
91 deriving (Generic, Show)
92 chineseDecomp :: ShortText -> ChineseDictEntries -> [Char]
93 chineseDecomp key ChineseDictEntries{chineseComponents} =
94 chineseComponents
95 & fromMaybe (if 1 < ShortText.length key then ShortText.unpack key else [])
96
97 instance Semigroup ChineseDictEntries where
98 x <> y =
99 ChineseDictEntries
100 { chinesePinyins =
101 if null (chinesePinyins x)
102 then chinesePinyins y
103 else
104 if null (chinesePinyins y)
105 || chinesePinyins x
106 == chinesePinyins y
107 then chinesePinyins x
108 else -- traceString
109 -- ( mconcat $
110 -- [ "Semigroup: chinesePinyins: mismatch: "
111 -- , show (chinesePinyins x)
112 -- , " <> "
113 -- , show (chinesePinyins y)
114 -- ]
115 -- )
116 (chinesePinyins x)
117 , chineseEnglish = chineseEnglish x <> chineseEnglish y
118 , chineseFrequency = chineseFrequency x <|> chineseFrequency y
119 , chineseComponents =
120 case chineseComponents x of
121 Nothing -> chineseComponents y
122 Just xS ->
123 case chineseComponents y of
124 Nothing -> chineseComponents x
125 Just yS
126 | xS == yS -> chineseComponents x
127 | otherwise -> error $ "Semigroup: chineseComponents: mismatch: " <> show xS <> " <> " <> show yS
128 , chineseStrokes =
129 case chineseStrokes x of
130 Nothing -> chineseStrokes y
131 Just xS ->
132 case chineseStrokes y of
133 Nothing -> chineseStrokes x
134 Just yS
135 | xS == yS -> chineseStrokes x
136 | otherwise -> error $ "Semigroup: chineseStrokes: mismatch: " <> show xS <> " <> " <> show yS
137 }
138 instance Monoid ChineseDictEntries where
139 mempty =
140 ChineseDictEntries
141 { chinesePinyins = mempty
142 , chineseEnglish = mempty
143 , chineseFrequency = Nothing
144 , chineseComponents = Nothing
145 , chineseStrokes = Nothing
146 }
147
148 readLoachWangFreq :: IO ChineseDict
149 readLoachWangFreq = do
150 readJSON "data/langs/mandarin/LoachWang/word_freq.json" $
151 JSON.withObject "WordFreq" \kv ->
152 ChineseDict
153 . Map.fromAscList
154 <$> forM (kv & JSON.toMap & Map.toList) \(k, v) ->
155 v & JSON.withScientific "Frequency" \freqSci -> do
156 let freq = freqSci & toRational & probability & arithError
157 return
158 ( k & JSON.DictKey.toShortText
159 , mempty{chineseFrequency = Just freq}
160 )
161
162 readLoachWangStrokes :: IO ChineseDict
163 readLoachWangStrokes =
164 readJSON ("data/langs/mandarin/LoachWang/char_strokes.json") $
165 JSON.withObject "WordFreq" \kv ->
166 ChineseDict
167 . Map.fromAscList
168 <$> forM (kv & JSON.toMap & Map.toList) \(k, v) -> do
169 strokes <- JSON.parseJSON v
170 return
171 ( k & JSON.DictKey.toShortText
172 , mempty{chineseStrokes = Just strokes}
173 )
174
175 readChiseStrokes :: IO ChineseDict
176 readChiseStrokes =
177 withDataFile "data/langs/mandarin/cjkvi-ids/ucs-strokes.txt" \dataHandle -> do
178 let loop accDict@(ChineseDict !acc) = do
179 isEOF <- dataHandle & Sys.hIsEOF
180 if isEOF
181 then return accDict
182 else do
183 line <- ByteString.hGetLine dataHandle
184 let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
185 let splitOnChar c = ByteString.split (fromIntegral (fromEnum c))
186 case line & splitOnChar '\t' of
187 [unicodeLit, unicodeChar, chineseStrokesLit]
188 | unicodeLit & ByteString.isPrefixOf "U+" ->
189 loop $
190 ChineseDict $
191 acc
192 & Map.insert
193 (unicodeChar & decodeUtf8)
194 mempty{chineseStrokes}
195 where
196 chineseStrokes :: Maybe Natural =
197 chineseStrokesLit
198 & splitOnChar ','
199 <&> ( decodeUtf8
200 >>> ShortText.unpack
201 >>> readMaybe @Natural
202 )
203 -- CorrectnessWarning: conservative bias toward the greatest number of strokes.
204 & List.maximum
205 <&> fromIntegral
206 _ -> loop accDict
207 loop mempty
208
209 readChiseDecomp :: IO ChineseDict
210 readChiseDecomp =
211 withDataFile "data/langs/mandarin/cjkvi-ids/ids.txt" $ (`loop` mempty)
212 where
213 loop dataHandle accDict@(ChineseDict !acc) = do
214 isEOF <- dataHandle & Sys.hIsEOF
215 if isEOF
216 then return accDict
217 else do
218 line <- ByteString.hGetLine dataHandle
219 let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
220 let splitOnChar c = ByteString.split (fromIntegral (fromEnum c))
221 case line & splitOnChar '\t' of
222 unicodeLit : (decodeUtf8 -> unicodeChar) : decomps
223 | unicodeLit & ByteString.isPrefixOf "U+" ->
224 let
225 decompWithCodes :: [([Char], Set Char)] =
226 decomps <&> \decomp ->
227 case decomp & ByteString.stripSuffix "]" of
228 Nothing ->
229 ( decomp & decodeUtf8 & ShortText.unpack
230 , mempty
231 )
232 Just decompWithCode ->
233 ( decompChars
234 & ShortText.unpack
235 -- ExplanationNote: remove the final '['
236 & List.init
237 , decompCodes & ShortText.unpack & fromList
238 )
239 where
240 (decompChars, decompCodes) =
241 decompWithCode
242 & decodeUtf8
243 & ShortText.breakEnd (== '[')
244 chineseComponents =
245 decompWithCodes
246 -- FIXME: CorrectnessWarning: maybe select using the Codes
247 -- instead of just taking the first.
248 & List.head
249 & fst
250 -- ExplanationNote: remove IDCs (Ideographic Description Characters)
251 -- namely "⿰" (U+2FF0) to "⿻" (U+2FFB).
252 & List.filter (\c -> not ('\x2FF0' <= c && c <= '\x2FFB'))
253 & List.filter (\c -> ShortText.singleton c /= unicodeChar)
254 & Just
255 in
256 loop dataHandle $
257 ChineseDict $
258 acc & Map.insert unicodeChar mempty{chineseComponents}
259 _ -> loop dataHandle accDict
260
261 readChineseDict :: IO ChineseDict
262 readChineseDict =
263 fold $
264 list
265 [ return extraEntries
266 , readLoachWangFreq
267 , readChiseStrokes
268 , readChiseDecomp
269 , -- , readChineseLexicalDatabase
270 readCEDICT
271 -- , readLoachWangStrokes
272 -- , readOutlierDecomp
273 ]
274 extraEntries :: ChineseDict
275 extraEntries =
276 ChineseDict $
277 [
278 [
279 ( "·"
280 , mempty
281 { chineseStrokes = Just 1
282 , chineseComponents = Just ['·']
283 }
284 )
285 ,
286 ( "库蠓属"
287 , mempty
288 { chinesePinyins = ["ku4", "meng3", "shi4"]
289 , chineseEnglish = ["culicoides"]
290 }
291 )
292 ,
293 ( "瘦果"
294 , mempty
295 { chinesePinyins = ["shou4", "guo3"]
296 , chineseEnglish = ["achene"]
297 }
298 )
299 ,
300 ( "美洲野马"
301 , mempty
302 { chinesePinyins = ["mei3", "zhou1", "ye3", "ma3"]
303 , chineseEnglish = ["mustang horse"]
304 }
305 )
306 ,
307 ( "钩粉蝶"
308 , mempty
309 { chinesePinyins = ["gou1", "fen3", "die2"]
310 , chineseEnglish = ["brimstone butterfly"]
311 }
312 )
313 ,
314 ( "核果"
315 , mempty
316 { chinesePinyins = ["he2", "guo3"]
317 , chineseEnglish = ["drupe"]
318 }
319 )
320 ,
321 ( "0"
322 , mempty
323 { chineseStrokes = Just 14
324 , chinesePinyins = ["ling2"] -- 零
325 }
326 )
327 ,
328 ( "1"
329 , mempty
330 { chineseStrokes = Just 2
331 , chinesePinyins = ["yi1"]
332 }
333 )
334 ,
335 ( "2"
336 , mempty
337 { chineseStrokes = Just 2
338 , chinesePinyins = ["er4"]
339 }
340 )
341 ,
342 ( "3"
343 , mempty
344 { chineseStrokes = Just 2
345 , chinesePinyins = ["san1"]
346 }
347 )
348 ,
349 ( "4"
350 , mempty
351 { chineseStrokes = Just 2
352 , chinesePinyins = ["si4"]
353 }
354 )
355 ,
356 ( "5"
357 , mempty
358 { chineseStrokes = Just 2
359 , chinesePinyins = ["wu3"]
360 }
361 )
362 ,
363 ( "6"
364 , mempty
365 { chineseStrokes = Just 1
366 , chinesePinyins = ["liu4"]
367 }
368 )
369 ,
370 ( "7"
371 , mempty
372 { chineseStrokes = Just 1
373 , chinesePinyins = ["qi1"]
374 }
375 )
376 ,
377 ( "8"
378 , mempty
379 { chineseStrokes = Just 2
380 , chinesePinyins = ["ba1"]
381 }
382 )
383 ,
384 ( "9"
385 , mempty
386 { chineseStrokes = Just 1
387 , chinesePinyins = ["jiu3"]
388 }
389 )
390 ]
391 , [ ( letter & ShortText.singleton
392 , mempty
393 { chineseStrokes = Just 1
394 , chinesePinyins = []
395 }
396 )
397 | letter <-
398 [ ['A' .. 'Z']
399 , ['a' .. 'z']
400 , ['π', '□', '○']
401 ]
402 & mconcat
403 ]
404 ]
405 & mconcat
406 & Map.fromList
407
408 readLoachWangWordOrder :: IO JSON.Array
409 readLoachWangWordOrder =
410 readJSON "data/langs/mandarin/LoachWang/loach_word_order.json" $
411 JSON.withArray "LoachWordOrder" \arr -> return arr
412
413 readOutlierDecomp :: IO ChineseDict
414 readOutlierDecomp =
415 readJSON "data/langs/mandarin/LoachWang/outlier_decomp.json" $
416 JSON.withObject "OutlierDecomp" \kv ->
417 ChineseDict
418 . Map.fromAscList
419 <$> forM (kv & JSON.toMap & Map.toList) \(k, v) ->
420 v & JSON.withArray "Decomp" \decomp ->
421 forM
422 (decomp & toList)
423 ( \sub ->
424 sub & JSON.withText "Atom" \atom ->
425 return $ atom & Text.unpack
426 )
427 <&> \atoms ->
428 ( k & JSON.DictKey.toShortText
429 , mempty{chineseComponents = Just $ atoms & mconcat}
430 )
431
432 reverseDecomp :: ChineseDict -> Map {-def-} ShortText {-refs-} [ShortText]
433 reverseDecomp (ChineseDict dict) =
434 [ sub := [key]
435 | (key, entry) <- dict & Map.toList
436 , sub <- entry & chineseDecomp key <&> ShortText.singleton
437 ]
438 & Map.fromListWith (<>)
439
440 type Benefit = Probability
441 type Cost = Natural
442 type Weight = Probability
443 type DictKey = ShortText
444 data Weights = Weights
445 { weightsMap :: Map DictKey (Benefit, Cost)
446 }
447 deriving (Show)
448
449 -- |
450 -- Zhao, K., zhao, D., Yang, J., Ma, W., & Yu, B. (2019).
451 -- Efficient Learning Strategy of Chinese Characters Based on
452 -- Usage Frequency and Structural Relationship.
453 -- 2019 IEEE 4th International Conference on Big Data Analytics (ICBDA).
454 -- doi:10.1109/icbda.2019.8713245
455 -- https://sci-hub.se/10.1109/icbda.2019.8713245
456 dictToWeights :: ChineseDict -> Weights
457 dictToWeights cn@(ChineseDict dict) = ST.runST do
458 -- ExplanationNote: memo tables, to avoid recomputing benefits and costs
459 keyToBenefitST :: ST.STRef st (Map DictKey Benefit) <- ST.newSTRef Map.empty
460 keyToCostST :: ST.STRef st (Map DictKey Cost) <- ST.newSTRef Map.empty
461 let refsMap :: Map ShortText [ShortText] = reverseDecomp cn
462 let
463 keyToBenefitLoop :: DictKey -> ST.ST st Benefit
464 keyToBenefitLoop key = do
465 -- traceShowM ("keyToBenefitLoop"::String, key)
466 keyToBenefitMap <- keyToBenefitST & ST.readSTRef
467 keyToBenefitMap
468 & Map.lookup key
469 & \case
470 Just bene -> return bene
471 Nothing -> do
472 let refs :: [ShortText] =
473 refsMap
474 & Map.lookup key
475 & fromMaybe []
476 -- & fromMaybe []
477 & List.delete key
478 -- traceShowM ("keyToBenefitLoop"::String, key, refs)
479 let freq =
480 dict
481 & Map.lookup key
482 & maybe proba0 (chineseFrequency >>> fromMaybe proba0)
483 refsBC <- forM refs \ref -> do
484 bene <- keyToBenefitLoop ref
485 cost <- keyToCostLoop ref
486 return (bene, cost)
487 cost <- keyToCostLoop key
488 bene <-
489 arithM $
490 probability $
491 runProbability freq
492 + toRational cost
493 * sum
494 [ runProbability b / toRational c
495 | (b, c) <- refsBC
496 , c /= 0
497 ]
498 ST.modifySTRef keyToBenefitST $ Map.insert key bene
499 return bene
500 keyToCostLoop :: DictKey -> ST.ST st Cost
501 keyToCostLoop key = do
502 -- traceShowM ("keyToCostLoop"::String, key)
503 keyToCostMap <- keyToCostST & ST.readSTRef
504 keyToCostMap
505 & Map.lookup key
506 & \case
507 Just cost -> return cost
508 Nothing -> do
509 let entry = dict & Map.lookup key & fromMaybe mempty
510 cost <-
511 -- single char
512 if ShortText.length key == 1
513 then do
514 let deps =
515 entry
516 & chineseDecomp key
517 & List.delete (ShortText.unpack key & List.head)
518 -- & (`Set.difference` (ShortText.unpack key & Set.fromList))
519 if null deps
520 then -- atom char
521 return $
522 entry
523 & chineseStrokes
524 & fromMaybe 1
525 else -- & fromMaybe (error $ "dictToWeights: missing chineseStrokes for: " <> ShortText.unpack key)
526 do
527 -- composite char
528 -- traceShowM ("composite"::String, show key, show <$> Set.toList deps
529 -- , deps & (`Set.difference` (ShortText.unpack key & Set.fromList))
530 -- )
531 deps
532 & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton))
533 <&> getSum
534 else -- word
535 do
536 -- traceShowM ("word"::String, key)
537 key
538 & ShortText.unpack
539 & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton))
540 <&> getSum
541 ST.modifySTRef keyToCostST $ Map.insert key cost
542 return cost
543 weightsMap <-
544 dict & Map.traverseWithKey \key _entry -> do
545 -- traceShowM ("dictToWeights"::String, key)
546 bene <- keyToBenefitLoop key
547 cost <- keyToCostLoop key
548 return (bene, cost)
549 return
550 Weights
551 { weightsMap
552 }
553
554 newtype ChineseOrder = ChineseOrder (Map (Down Weight) Weights)
555 unChineseOrder (ChineseOrder x) = x
556 dictOrder :: Weights -> ChineseOrder
557 dictOrder w =
558 [ ( probability (runProbability b / toRational c) & arithError & Down
559 , Weights
560 { weightsMap = Map.singleton k v
561 }
562 )
563 | (k, v@(b, c)) <- w & weightsMap & Map.toList
564 ]
565 & Map.fromListWith (\x y -> Weights{weightsMap = Map.unionWith (error "dictOrder") (weightsMap x) (weightsMap y)})
566 & ChineseOrder
567
568 dictOrderIO = readChineseDict <&> (dictToWeights >>> dictOrder)
569
570 -- order100 :: IO [(DictKey, Maybe ChineseDictEntries)]
571 order100 = do
572 dict@(ChineseDict d) <- readChineseDict
573 let ChineseOrder o = dict & dictToWeights & dictOrder
574 return $
575 o
576 & Map.take 100
577 & Map.elems
578 & foldMap (weightsMap >>> Map.keys)
579 <&> \k -> (k, d & Map.lookup k <&> \r -> (chinesePinyins r, chineseEnglish r))
580
581 dictOrderBySameWeight (ChineseOrder o) =
582 [ ( Map.size (weightsMap v)
583 , Map.singleton k v
584 )
585 | (k, v) <- o & Map.toList
586 ]
587 & Map.fromListWith (<>)
588
589 {-
590 data UnfoldedGraph node lbl edge = UnfoldedGraph
591 { unfoldedVertexToEdges :: !(Array.Array Vertex (lbl, Map edge IntSet))
592 , unfoldedNodeToVertex :: !(Map node Vertex)
593 }
594 deriving (Show, Generic)
595 unfoldGMany ::
596 forall node edge nodeLabel.
597 Ord node =>
598 (node -> (nodeLabel, Map edge (Set node))) ->
599 [node] ->
600 UnfoldedGraph node nodeLabel edge
601 unfoldGMany gen roots = ST.runST do
602 nodeToVertexST :: ST.STRef st (Map node Vertex) <- ST.newSTRef Map.empty
603 vertexToEdgesST :: ST.STRef st [(Vertex, (nodeLabel, Map edge IntSet))] <- ST.newSTRef []
604 lastVertexST <- ST.newSTRef firstVertex
605 let newVertex = do
606 v <- ST.readSTRef lastVertexST
607 ST.writeSTRef lastVertexST (v + 1)
608 return v
609 let
610 nodeToVertexDepthFirst :: node -> ST.ST st Vertex
611 nodeToVertexDepthFirst src =
612 nodeToVertexST
613 & ST.readSTRef
614 <&> Map.lookup src
615 >>= \case
616 -- DescriptionNote: `src` has already been seen,
617 -- just return its unique `Vertex`.
618 Just v -> return v
619 -- DescriptionNote: `src` has never been seen yet,
620 -- allocate a `newVertex` for it
621 Nothing -> do
622 dst <- newVertex
623 ST.modifySTRef nodeToVertexST $ Map.insert src dst
624 let (nodeLabel, edgeToDsts) = gen src
625 edgeToDstsV :: Map edge IntSet <-
626 edgeToDsts & Map.traverseWithKey \_edge dsts ->
627 forM (dsts & Set.toList) nodeToVertexDepthFirst
628 <&> IntSet.fromList
629 let res = (dst, (nodeLabel, edgeToDstsV))
630 ST.modifySTRef vertexToEdgesST (res :)
631 return dst
632 forM_ roots nodeToVertexDepthFirst
633 nodeToVertex <- ST.readSTRef nodeToVertexST
634 vertexToEdges <- ST.readSTRef vertexToEdgesST
635 lastId <- ST.readSTRef lastVertexST
636 return
637 UnfoldedGraph
638 { unfoldedVertexToEdges = vertexToEdges & Array.array (firstVertex, lastId - 1)
639 , unfoldedNodeToVertex = nodeToVertex
640 }
641 where
642 firstVertex :: Vertex = 0
643 -}
644
645 -- mostComp d =
646 -- [
647 -- | d & reverseDecomp & Map.toList
648 -- ] & Map.fromListWith (<>)
649
650 data ChineseTone
651 = ChineseTone1
652 | ChineseTone2
653 | ChineseTone3
654 | ChineseTone4
655 | ChineseTone5
656 deriving (Eq, Ord, Show, Enum)
657
658 numberedPinyinToDiacriticPiniyn :: ShortText -> ShortText
659 numberedPinyinToDiacriticPiniyn numPin =
660 case pinTone of
661 Nothing -> pinRoot
662 Just tone ->
663 pinRoot
664 & ShortText.unpack
665 & addUmlaut
666 & setDiacritic
667 & ShortText.pack
668 where
669 addUmlaut ('u' : ':' : cs) = 'ü' : addUmlaut cs
670 addUmlaut ('v' : cs) = 'ü' : addUmlaut cs
671 addUmlaut ('U' : ':' : cs) = 'Ü' : addUmlaut cs
672 addUmlaut ('V' : cs) = 'Ü' : addUmlaut cs
673 addUmlaut (c : cs) = c : addUmlaut cs
674 addUmlaut [] = []
675 -- CorrectnessNote: a, e or the o in ou always get the marker
676 setDiacritic (c@'a' : cs) = convert tone c : cs
677 setDiacritic (c@'e' : cs) = convert tone c : cs
678 setDiacritic (c@'o' : 'u' : cs) = convert tone c : 'u' : cs
679 setDiacritic (c : cs)
680 -- CorrectnessNote: if no a, e, or ou found, the tone mark goes on the last vowel
681 | List.elem c vowels && all (`List.notElem` vowels) cs = convert tone c : cs
682 | otherwise = c : setDiacritic cs
683 setDiacritic [] = []
684 vowels = lowerVowels <> (Char.toUpper <$> lowerVowels)
685 lowerVowels = ['a', 'e', 'i', 'o', 'u', 'ü'] & list
686 where
687 (pinRoot, pinTone) ::
688 (ShortText, Maybe ChineseTone) =
689 numPin & ShortText.spanEnd Char.isDigit & second \case
690 "" -> Nothing
691 ds -> ds & ShortText.unpack & read & (\x -> x - (1 :: Int)) & toEnum & Just
692 convert :: ChineseTone -> Char -> Char
693 convert t c = case t of
694 ChineseTone1 ->
695 case c of
696 'a' -> 'ā'
697 'e' -> 'ē'
698 'i' -> 'ī'
699 'o' -> 'ō'
700 'u' -> 'ū'
701 'ü' -> 'ǖ'
702 'A' -> 'Ā'
703 'E' -> 'Ē'
704 'I' -> 'Ī'
705 'O' -> 'Ō'
706 'U' -> 'Ū'
707 'Ü' -> 'Ǖ'
708 _ -> undefined
709 ChineseTone2 ->
710 case c of
711 'a' -> 'á'
712 'e' -> 'é'
713 'i' -> 'í'
714 'o' -> 'ó'
715 'u' -> 'ú'
716 'ü' -> 'ǘ'
717 'A' -> 'Á'
718 'E' -> 'É'
719 'I' -> 'Í'
720 'O' -> 'Ó'
721 'U' -> 'Ú'
722 'Ü' -> 'Ǘ'
723 _ -> undefined
724 ChineseTone3 ->
725 case c of
726 'a' -> 'ǎ'
727 'e' -> 'ě'
728 'i' -> 'ǐ'
729 'o' -> 'ǒ'
730 'u' -> 'ǔ'
731 'ü' -> 'ǚ'
732 'A' -> 'Ǎ'
733 'E' -> 'Ě'
734 'I' -> 'Ǐ'
735 'O' -> 'Ǒ'
736 'U' -> 'Ǔ'
737 'Ü' -> 'Ǚ'
738 _ -> undefined
739 ChineseTone4 ->
740 case c of
741 'a' -> 'à'
742 'e' -> 'è'
743 'i' -> 'ì'
744 'o' -> 'ò'
745 'u' -> 'ù'
746 'ü' -> 'ǜ'
747 'A' -> 'À'
748 'E' -> 'È'
749 'I' -> 'Ì'
750 'O' -> 'Ò'
751 'U' -> 'Ù'
752 'Ü' -> 'Ǜ'
753 _ -> undefined
754 ChineseTone5 -> c
755
756 data HskLevel
757 = HskLevel301
758 | HskLevel302
759 | HskLevel303
760 | HskLevel304
761 | HskLevel305
762 | HskLevel306
763 deriving (Eq, Ord, Enum, Show)
764
765 -- instance CSV.FromRecord ChineseDictEntries
766 -- instance CSV.ToRecord ChineseDictEntries
767 -- instance CSV.FromNamedRecord ChineseDictEntries
768 -- instance CSV.ToNamedRecord ChineseDictEntries
769 -- instance CSV.DefaultOrdered ChineseDictEntries
770
771 feed :: (ByteString -> r) -> Sys.Handle -> Sys.IO r
772 feed k csvFile = do
773 Sys.hIsEOF csvFile >>= \case
774 True -> return $ k ""
775 False -> k <$> ByteString.hGetSome csvFile 4096
776
777 readHSK :: HskLevel -> IO ChineseDict
778 readHSK chineseHSK = do
779 withDataFile ("data/langs/mandarin/HSK/hsk" <> show hskIndex Sys.<.> "csv") \fileHandle -> do
780 loop fileHandle mempty $
781 CSV.Incremental.decodeWithP parser decodeOpts CSV.NoHeader
782 where
783 hskIndex = chineseHSK & fromEnum & (+ 1)
784 decodeOpts = CSV.defaultDecodeOptions
785 parser :: CSV.Record -> CSV.Parser (ShortText, ChineseDictEntries)
786 parser v
787 | length v == 3 = do
788 chinese <- v .! 0
789 chinesePinyins <- v .! 1 <&> ShortText.split (== ' ')
790 chineseEnglish <- v .! 2 <&> pure
791 let chineseFrequency = Nothing
792 let chineseComponents = Nothing
793 let chineseStrokes = Nothing
794 pure (chinese, ChineseDictEntries{..})
795 | otherwise = empty
796 check =
797 either (\x -> Sys.print x >> return mempty) \(chinese, e) ->
798 return $ ChineseDict $ Map.singleton chinese e
799 loop fileHandle !acc = \case
800 CSV.Incremental.Fail _ errMsg -> do
801 Sys.putStrLn errMsg
802 Sys.exitFailure
803 CSV.Incremental.Many rs k -> do
804 ok <- rs & foldMapM check
805 t <- feed k fileHandle
806 loop fileHandle (acc <> ok) t
807 CSV.Incremental.Done rs -> do
808 ok <- rs & foldMapM check
809 return (acc <> ok)
810
811 -- | Sun, C.C., Hendrix, P., Ma, J. et al.
812 -- Chinese lexical database (CLD).
813 -- Behav Res 50, 2606–2629 (2018).
814 -- https://doi.org/10.3758/s13428-018-1038-3
815 readChineseLexicalDatabase :: IO ChineseDict
816 readChineseLexicalDatabase = do
817 withDataFile "data/langs/mandarin/chineselexicaldatabase2.1.csv" \fileHandle -> do
818 loop fileHandle mempty $
819 CSV.Incremental.decodeWithP parser decodeOpts CSV.HasHeader
820 where
821 decodeOpts = CSV.defaultDecodeOptions
822 parser :: CSV.Record -> CSV.Parser (DictKey, ChineseDictEntries)
823 parser v
824 | length v == 269 = do
825 key <- v .! 0
826 keyLength :: Int <- v .! 5
827 pinyin1 <- v .! 15
828 pinyin2 <- v .! 16
829 pinyin3 <- v .! 17
830 pinyin4 <- v .! 18
831 let chinesePinyins =
832 [pinyin1, pinyin2, pinyin3, pinyin4]
833 & List.take keyLength
834 freqPerMillion :: Double <- v .! 39
835 let chineseFrequency =
836 freqPerMillion
837 & toRational
838 & (/ 1000000)
839 & probability
840 & fromMaybe (error "readChineseLexicalDatabase: Frequency")
841 & Just
842 pure (key, mempty{chineseFrequency, chinesePinyins})
843 | otherwise = error $ "readChineseLexicalDatabase: wrong number of columns: " <> show v
844 check =
845 either (\x -> Sys.print x >> return mempty) \(key, val) ->
846 return $ ChineseDict $ Map.singleton key val
847 loop fileHandle !acc = \case
848 CSV.Incremental.Fail _ errMsg -> do
849 Sys.putStrLn errMsg
850 Sys.exitFailure
851 CSV.Incremental.Many rs k -> do
852 ok <- rs & foldMapM check
853 t <- feed k fileHandle
854 loop fileHandle (acc <> ok) t
855 CSV.Incremental.Done rs -> do
856 ok <- rs & foldMapM check
857 return (acc <> ok)
858
859 readCEDICT :: IO ChineseDict
860 readCEDICT = do
861 withDataFile "data/langs/mandarin/CEDICT/cedict_ts.u8" \cedictHandle -> do
862 let skipHeader = do
863 isEOF <- cedictHandle & Sys.hIsEOF
864 if isEOF
865 then return ()
866 else do
867 lineBS <- ByteString.hGetLine cedictHandle
868 let lineST = lineBS & ShortText.fromByteString & fromMaybe (error "invalid UTF-8")
869 let begin = lineST & ShortText.take 1
870 when (begin == "#") do
871 skipHeader
872 skipHeader
873 let loop !acc = do
874 isEOF <- cedictHandle & Sys.hIsEOF
875 if isEOF
876 then return acc
877 else do
878 line <- ByteString.hGetLine cedictHandle
879 let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
880 -- DescriptionNote: each line is formatted as: #(.+) (.+) \[(.+)] /(.*)/#'
881 let skipChar c = void $ MT.state $ ByteString.span (== fromIntegral (fromEnum c))
882 let skipPrefix p =
883 MT.modify' $
884 ByteString.stripPrefix p
885 >>> fromMaybe (error $ "skipPrefix fail to match: " <> show p)
886 let breakOnChar c = ByteString.break (== fromIntegral (fromEnum c))
887 let breakOnSpace = breakOnChar ' '
888 let skipSuffix p =
889 MT.modify' $
890 ByteString.stripSuffix p
891 >>> fromMaybe
892 ( error $
893 "skipSuffix: mismatch: "
894 <> show p
895 <> "\n on line: "
896 <> ShortText.unpack (decodeUtf8 line)
897 <> "\n escaped: "
898 <> show (ShortText.unpack (decodeUtf8 line))
899 )
900 let (dict, leftover) = (`MT.runState` line) do
901 _chineseTrad <- MT.state $ breakOnSpace >>> first decodeUtf8
902 skipChar ' '
903 chineseSimpl <- MT.state $ breakOnSpace >>> first decodeUtf8
904 skipChar ' '
905 skipPrefix "["
906 chinesePinyins <-
907 MT.state $
908 breakOnChar ']'
909 >>> first
910 ( \s ->
911 s
912 & ByteString.split (fromIntegral (fromEnum ' '))
913 -- CorrectnessWarning: convert pinyin to lowercase, and remove dashes
914 <&> (decodeUtf8 >>> ShortText.unpack >>> (<&> Char.toLower) >>> ShortText.pack)
915 & List.filter (/= "-")
916 )
917 skipPrefix "] /"
918 -- CorrectnessNote: some lines do not end with \r
919 -- hence make it optional.
920 MT.modify' \s -> s & ByteString.stripSuffix "\r" & fromMaybe s
921 skipSuffix "/"
922 chineseEnglish <- MT.gets \s -> s & ByteString.split (fromIntegral (fromEnum '/')) <&> decodeUtf8
923 MT.put mempty
924 let chinese = chineseSimpl
925 return $
926 ChineseDict $
927 Map.singleton chinese $
928 mempty
929 { chinesePinyins
930 , chineseEnglish
931 }
932 if not (ByteString.null leftover)
933 then error $ "parserLine: leftover: " <> show leftover
934 else loop (acc <> dict)
935 loop mempty
936
937 lookupPinyins :: ChineseDict -> ShortText -> [ShortText]
938 lookupPinyins (ChineseDict dict) word =
939 word
940 & (`Map.lookup` dict)
941 & fromMaybe (error $ "lookupPinyins: no entry for: {" <> wordString <> "}")
942 & chinesePinyins
943 & (\ps -> if null ps then error $ "lookupPinyins: empty entry for: " <> wordString else ps)
944 where
945 wordString = word & ShortText.unpack
946
947 pronunciation :: ChineseDict -> [Pron.Lexeme] -> [[Either Char Pron.Pron]]
948 pronunciation dict inp =
949 inp & Pron.lexemesChars & Text.pack & Text.words <&> wordToPron
950 where
951 wordToPron :: Text -> [Either Char Pron.Pron]
952 wordToPron wordText =
953 [ pronun chars charCateg charBlock
954 | (charCateg, uniBlockToChars) <-
955 wordText
956 & Text.unpack
957 & List.map (\c -> (Char.generalCategory c, (Char.unicodeBlock c, c)))
958 & Char.consecutiveGroups
959 <&> (<&> Char.consecutiveGroups)
960 , (charBlock, chars) <- uniBlockToChars
961 ]
962 & mconcat
963 where
964 pronun :: [Char] -> Char.GeneralCategory -> Maybe Char.UnicodeBlock -> [Either Char Pron.Pron]
965 pronun chars Char.DecimalNumber _ =
966 [ Right
967 Pron.Pron
968 { pronInput = [Pron.LexemeChar char]
969 , pronRule =
970 Pron.rule
971 { Pron.rulePron =
972 Pron.Pronunciations
973 { Pron.unPronunciations =
974 [ Pron.RuleLexemes [Pron.LexemeChar char] :=
975 Pron.Pronunciation
976 { Pron.pronunciationIPABroad = []
977 , Pron.pronunciationText = txt char
978 }
979 ]
980 }
981 }
982 }
983 | char <- chars
984 ]
985 where
986 txt c =
987 c
988 & ShortText.singleton
989 & lookupPinyins dict
990 <&> ( \p ->
991 p
992 & numberedPinyinToDiacriticPiniyn
993 & ShortText.toText
994 & Text.toLower
995 )
996 & Text.intercalate " "
997 pronun chars Char.OpenPunctuation _ =
998 [ Left c
999 | c <- chars
1000 ]
1001 pronun chars Char.ClosePunctuation _ =
1002 [ Left c
1003 | c <- chars
1004 ]
1005 pronun chars Char.OtherPunctuation _ =
1006 [ Left c
1007 | c <- chars
1008 ]
1009 pronun chars _ (Just Char.UnicodeBlockCJK{}) =
1010 [ Right
1011 Pron.Pron
1012 { pronInput = Pron.LexemeChar <$> chars
1013 , pronRule =
1014 Pron.rule
1015 { Pron.rulePron =
1016 Pron.Pronunciations
1017 { Pron.unPronunciations =
1018 [ Pron.RuleLexemes (Pron.LexemeChar <$> chars) :=
1019 Pron.Pronunciation
1020 { Pron.pronunciationIPABroad = []
1021 , Pron.pronunciationText = txt
1022 }
1023 ]
1024 }
1025 }
1026 }
1027 ]
1028 where
1029 txt =
1030 chars
1031 & ShortText.pack
1032 & lookupPinyins dict
1033 & List.map
1034 ( \pinyin ->
1035 pinyin
1036 & numberedPinyinToDiacriticPiniyn
1037 & ShortText.toText
1038 & Text.toLower
1039 )
1040 & Text.intercalate " "
1041 -- if List.length wordPinyins == Text.length wordText = wordPinyins
1042 -- where
1043 -- wordPinyins = chars
1044 -- & ShortText.pack
1045 -- & lookupPinyins dict
1046 pronun chars categ block = errorShow ("pronunciation" :: Text, chars, categ, block)
1047
1048 orderHTML dict@(ChineseDict d) = do
1049 dataPath <- Self.getDataDir <&> File.normalise
1050 return $ Blaze.renderMarkupBuilder do
1051 HTML.docTypeHtml do
1052 HTML.head do
1053 HTML.title $ ("Chinese Order" :: Text) & HTML.toHtml
1054 forM_
1055 ( [ "styles/Paper.css"
1056 , "styles/Rosetta/Common.css"
1057 , "styles/Rosetta/Reading.css"
1058 ]
1059 & list
1060 )
1061 \cssFile ->
1062 HTML.link
1063 ! HA.rel "stylesheet"
1064 ! HA.type_ "text/css"
1065 ! HA.href (dataPath </> cssFile & HTML.toValue)
1066 -- HTML.styleCSS $ pagesDifficulties & difficultyCSS
1067 HTML.body do
1068 let ChineseOrder o = dict & dictToWeights & dictOrder
1069 HTML.ol do
1070 forM_
1071 ( o
1072 & Map.take 5000
1073 & Map.elems
1074 & foldMap (weightsMap >>> Map.keys)
1075 )
1076 \k ->
1077 HTML.li
1078 ! HTML.styles
1079 [ "break-inside" := "avoid"
1080 , "list-style-position" := "inside"
1081 ]
1082 $ do
1083 HTML.span $ k & HTML.toHtml
1084 forM_ (d & Map.lookup k) \v -> do
1085 HTML.ul ! HTML.styles [] $ do
1086 HTML.li do
1087 HTML.span $ v & chinesePinyins <&> ShortText.toText & Text.unwords & HTML.toHtml
1088 HTML.li do
1089 HTML.span $ v & chineseDecomp k & Text.pack & HTML.toHtml
1090 HTML.li do
1091 HTML.span $ v & chineseEnglish <&> ShortText.toText & Text.intercalate "; " & HTML.toHtml
1092
1093 orderDOT :: ChineseDict -> IO BS.Builder
1094 orderDOT dict@(ChineseDict d) = do
1095 DOT.runDOT do
1096 -- dotComments [(BS.lazyByteString $ LazyText.encodeUtf8 $ pShow $ rangeToMstToGroupToClusters & Map.map Map.keys)]
1097 -- pTraceShow ("num of nodes", Map.size nodeToBranch, "num of branches", Map.size msf) $
1098 DOT.dotLine "digraph g"
1099
1100 {-
1101 dotBlock do
1102 dotLine "splines=\"ortho\""
1103 indexFrom1M_ (rangeToMstToGroupToClusters & Map.toList) \(srcR, mstToGroupToClusters) srcRI -> do
1104 let srcRB = "r" <> BS.intDec srcRI
1105 dotLine $ "subgraph cluster_" <> srcRB
1106 dotBlock do
1107 dotComments ["Create a node for the range " <> srcRB]
1108 dotNode
1109 srcRB
1110 [ ("shape", "box")
1111 , ("label", builderQuotedString (showHuman srcR))
1112 , ("color", "gray")
1113 , ("style", "filled")
1114 , ("fillcolor", "gray")
1115 ]
1116 dotLine "color=gray"
1117 dotBlock do
1118 dotLine "rank=same"
1119 dotComments ["Create the cluster nodes within the range " <> srcRB]
1120 forM_ (mstToGroupToClusters & Map.toList) \(mstI, groupToClusters) -> do
1121 forM_ (groupToClusters & Map.toList) \(srcGroup, srcClusters) -> do
1122 dotNodeCluster
1123 srcRI
1124 mstI
1125 srcGroup
1126 [
1127 ( "label"
1128 , builderQuotedString $
1129 (srcClusters & toList <&> showHuman & List.unlines)
1130 <> "\nT"
1131 <> printf "%03d" mstI
1132 <> "\nS"
1133 <> show scaleI
1134 -- <> {-maybe ""-} (("\n" <>) . showSimilarity) minSimil
1135 )
1136 , ("style", "filled")
1137 , -- , minSimil & {-maybe ("", "")-} (\s -> ("fillcolor", 1 + ((floor (runProbability s * 10)) `mod` 10) & BS.intDec))
1138 ("colorscheme", "ylorrd9")
1139 , ("shape", "box")
1140 ]
1141 dotComments ["Horizontally align the cluster nodes within the same range"]
1142 let row =
1143 [ (mstI, group)
1144 | (mstI, groupToClusters) <- mstToGroupToClusters & Map.toList
1145 , (group, _clusters) <- groupToClusters & Map.toList
1146 ]
1147 case row of
1148 [] -> return ()
1149 c@(firstMst, firstGroup) : cs -> do
1150 dotEdges
1151 [srcRB, srcRB <> "t" <> BS.intDec firstMst <> "c" <> BS.intDec firstGroup]
1152 [ ("style", "invis")
1153 ]
1154 cs & (`foldM_` c) \(srcMst, srcGroup) dst@(dstMst, dstGroup) -> do
1155 dotEdgesCluster
1156 [(srcRI, srcMst, srcGroup), (srcRI, dstMst, dstGroup)]
1157 [ ("weight", "10")
1158 , ("style", "invis")
1159 ]
1160 return dst
1161 indexFrom1M_ sortedMSF \mst mstI -> do
1162 dotComments ["Create the edges of the MST " <> BS.intDec mstI]
1163 -- pTraceShowM (mstI, List.length (Tree.flatten mst))
1164 let loop (Tree.Node MSTNode{mstNodeRangeCluster = src} dsts) = do
1165 forM_ dsts \dstNode@(Tree.Node MSTNode{mstNodeRangeCluster = dst, mstNodeSimilarity = simil} _) -> do
1166 -- let similB = BS.stringUtf8 $ showFFloat (Just 2) (simil & runProbability & fromRational @Double) ""
1167 let indexRangeCluster (r, c) =
1168 let clusterToGroup :: cluster :-> ClusterGroup =
1169 Map.fromList
1170 [ (cluster, group)
1171 | (group, clusters) <-
1172 rangeToMstToGroupToClusters
1173 & Map.lookup r
1174 & fromMaybe Map.empty
1175 & Map.lookup mstI
1176 & fromMaybe Map.empty
1177 & Map.toList
1178 , cluster <- clusters & Set.toList
1179 ]
1180 in ( 1 + Map.findIndex r rangeToMstToGroupToClusters
1181 , mstI
1182 , Map.lookup c clusterToGroup
1183 & fromMaybe (error (LazyText.unpack (pShow ("r", r, "c", c {-, "clusterToGroup", clusterToGroup, "rangeToMstToGroupToClusters", rangeToMstToGroupToClusters-}))))
1184 )
1185 dotEdgesCluster
1186 [ indexRangeCluster src
1187 , indexRangeCluster dst
1188 ]
1189 [ ("constraint", "false")
1190 , ("color", (floor (runProbability simil * 10)) `mod` 10 & BS.intDec)
1191 , ("colorscheme", "ylorrd9")
1192 , -- , ("label", similB)
1193 ("fontcolor", "blue")
1194 , ("dir", "both")
1195 , ("arrowhead", "dot")
1196 , ("arrowtail", "dot")
1197 ]
1198 loop dstNode
1199 loop mst
1200 dotRanges rangeToMstToGroupToClusters
1201 -}