]> Git — Sourcephile - julm/worksheets.git/blob - src/Language/Chinese.hs
update
[julm/worksheets.git] / src / Language / Chinese.hs
1 {-# LANGUAGE OverloadedLists #-}
2 {-# LANGUAGE OverloadedStrings #-}
3
4 module Language.Chinese where
5
6 import Control.Applicative (Alternative (..))
7 import Control.Monad.ST qualified as ST
8 import Control.Monad.Trans.State qualified as MT
9 import Data.Aeson qualified as JSON
10 import Data.Aeson.Key qualified as JSON.DictKey
11 import Data.Aeson.KeyMap qualified as JSON
12 import Data.ByteString (ByteString)
13 import Data.ByteString qualified as ByteString
14 import Data.ByteString.Builder qualified as BS
15 import Data.ByteString.Char8 qualified as ByteString.Char8
16 import Data.Char qualified as Char
17 import Data.Csv ((.!))
18 import Data.Csv qualified as CSV
19 import Data.Csv.Incremental qualified as CSV.Incremental
20 import Data.List qualified as List
21 import Data.Map.Strict qualified as Map
22 import Data.STRef qualified as ST
23 import Data.Set qualified as Set
24 import Data.Text qualified as Text
25 import Data.Text.Short qualified as ShortText
26 import Graph.DOT qualified as DOT
27 import Numeric.Decimal.BoundedArithmetic (arithError, arithM)
28 import Paths_worksheets qualified as Self
29 import System.Exit qualified as Sys
30 import System.FilePath qualified as Sys
31 import System.FilePath.Posix ((</>))
32 import System.FilePath.Posix qualified as File
33 import System.IO qualified as Sys
34 import Text.Read (read, readMaybe)
35 import Worksheets.Utils.HTML (className, classes, styles, (!))
36 import Worksheets.Utils.HTML qualified as HTML
37 import Prelude (error, fromIntegral, undefined, (/))
38
39 -- import Text.Blaze
40 import Text.Blaze.Html5 qualified as HTML
41 import Text.Blaze.Html5.Attributes qualified as HA
42 import Text.Blaze.Renderer.Utf8 qualified as Blaze
43
44 import Language.Pronunciation qualified as Pron
45 import Worksheets.Utils.Char qualified as Char
46 import Worksheets.Utils.JSON
47 import Worksheets.Utils.Prelude
48 import Worksheets.Utils.Probability
49
-- | A Chinese dictionary: maps a key (a single character or a
-- multi-character word, as 'ShortText') to everything known about it.
newtype ChineseDict = ChineseDict (Map ShortText ChineseDictEntries)
  deriving (Show)

-- | Unwrap the underlying 'Map'.
unChineseDict (ChineseDict x) = x

-- | Dictionaries merge key-wise; entries sharing a key are merged with
-- the 'Semigroup' instance of 'ChineseDictEntries'.
instance Semigroup ChineseDict where
  ChineseDict x <> ChineseDict y =
    ChineseDict (Map.unionWithKey merge x y)
    where
      -- Bang patterns force both entries before combining them.
      merge _k !xV !yV = xV <> yV
58
-- | Histogram of key lengths: maps each key length (in characters)
-- to the number of dictionary entries whose key has that length.
keyLengthToSumOfEntries :: ChineseDict -> Map Natural Natural
keyLengthToSumOfEntries (ChineseDict d) =
  Map.fromListWith (+) (countOne <$> Map.keys d)
  where
    -- Each key contributes 1 to the bucket of its own length.
    countOne k = (fromIntegral (ShortText.length k), 1)
67
-- | Total of all recorded frequencies in the dictionary, as a 'Double'.
-- Entries without a recorded frequency are skipped.
chineseFrequencySum :: ChineseDict -> Double
chineseFrequencySum (ChineseDict d) =
  fromRational @Double $
    runProbability $
      arithError $
        sum
          [ pure freq
          | entry <- Map.elems d
          , freq <- maybeToList (chineseFrequency entry)
          ]
78
-- | The empty dictionary is the unit of the key-wise merge.
instance Monoid ChineseDict where
  mempty = ChineseDict mempty
81
-- FIXME: use a sum type Char/Word
-- | Everything gathered about one dictionary key, merged from the
-- various data sources.
data ChineseDictEntries = ChineseDictEntries
  { chinesePinyins :: ![ShortText]
  -- ^ Numbered pinyin syllables (e.g. @"ma3"@), one per character.
  , -- TODO: , chineseIPA :: !IPA.IPA
    chineseEnglish :: ![ShortText]
  -- ^ English glosses.
  , chineseFrequency :: !(Maybe Probability)
  -- ^ Usage frequency, when a source provides one.
  , chineseComponents :: !(Maybe [Char])
  -- ^ Graphical decomposition into component characters, when known.
  , chineseStrokes :: !(Maybe Natural)
  -- ^ FIXME: use Natural
  }
  deriving (Generic, Show)

-- | Decomposition of @key@: its recorded 'chineseComponents' when
-- present; otherwise a multi-character key decomposes into its
-- characters, and a lone character decomposes to nothing.
chineseDecomp :: ShortText -> ChineseDictEntries -> [Char]
chineseDecomp key ChineseDictEntries{chineseComponents} =
  chineseComponents
    & fromMaybe (if 1 < ShortText.length key then ShortText.unpack key else [])
97
-- | Merge two entries for the same key: pinyins are left-biased,
-- glosses concatenate, frequency takes the first available, and the
-- decomposition/stroke sources must agree when both are present.
instance Semigroup ChineseDictEntries where
  x <> y =
    ChineseDictEntries
      { chinesePinyins = mergePinyins (chinesePinyins x) (chinesePinyins y)
      , chineseEnglish = chineseEnglish x <> chineseEnglish y
      , chineseFrequency = chineseFrequency x <|> chineseFrequency y
      , chineseComponents = mergeAgreeing "chineseComponents" (chineseComponents x) (chineseComponents y)
      , chineseStrokes = mergeAgreeing "chineseStrokes" (chineseStrokes x) (chineseStrokes y)
      }
    where
      -- Prefer the left operand's pinyins; fall back to the right's
      -- only when the left has none (mismatches keep the left).
      mergePinyins ps qs
        | null ps = qs
        | otherwise = ps
      -- Combine two optional values that must agree when both known.
      mergeAgreeing fieldName mA mB = case (mA, mB) of
        (Nothing, _) -> mB
        (_, Nothing) -> mA
        (Just a, Just b)
          | a == b -> mA
          | otherwise ->
              error $ "Semigroup: " <> fieldName <> ": mismatch: " <> show a <> " <> " <> show b
-- | The neutral entry: no information recorded at all.
instance Monoid ChineseDictEntries where
  mempty =
    ChineseDictEntries
      { chinesePinyins = mempty
      , chineseEnglish = mempty
      , chineseFrequency = Nothing
      , chineseComponents = Nothing
      , chineseStrokes = Nothing
      }
148
-- | Read per-word frequencies from the LoachWang JSON data file:
-- each key of the JSON object becomes a dictionary key whose
-- 'chineseFrequency' is its associated number.
readLoachWangFreq :: IO ChineseDict
readLoachWangFreq = do
  readJSON "data/langs/mandarin/LoachWang/word_freq.json" $
    JSON.withObject "WordFreq" \kv ->
      ChineseDict
        . Map.fromAscList
        <$> forM (kv & JSON.toMap & Map.toList) \(k, v) ->
          v & JSON.withScientific "Frequency" \freqSci -> do
            -- NOTE(review): 'arithError' turns an out-of-range
            -- frequency into a runtime error — confirm inputs are in [0,1].
            let freq = freqSci & toRational & probability & arithError
            return
              ( k & JSON.DictKey.toShortText
              , mempty{chineseFrequency = Just freq}
              )
162
-- | Read per-character stroke counts from the LoachWang JSON data file.
-- NOTE(review): currently unused by 'readChineseDict' (commented out
-- there); the CHISE data is used for strokes instead.
readLoachWangStrokes :: IO ChineseDict
readLoachWangStrokes =
  readJSON ("data/langs/mandarin/LoachWang/char_strokes.json") $
    JSON.withObject "WordFreq" \kv ->
      ChineseDict
        . Map.fromAscList
        <$> forM (kv & JSON.toMap & Map.toList) \(k, v) -> do
          strokes <- JSON.parseJSON v
          return
            ( k & JSON.DictKey.toShortText
            , mempty{chineseStrokes = Just strokes}
            )
175
-- | Circled-number characters ① to ⑳, used by the CHISE IDS data to
-- stand for unencoded components; they get special benefit/cost
-- treatment in 'dictToWeights'.
circledNumbers :: Set ShortText
circledNumbers = ["①", "②", "③", "④", "⑤", "⑥", "⑦", "⑧", "⑨", "⑩", "⑪", "⑫", "⑬", "⑭", "⑮", "⑯", "⑰", "⑱", "⑲", "⑳"]
178
-- | Read per-character stroke counts from the CHISE @ucs-strokes.txt@
-- data file. Each line is tab-separated:
-- @U+XXXX \<char\> \<count\>[,\<count\>…]@; lines not matching that shape
-- are skipped.
readChiseStrokes :: IO ChineseDict
readChiseStrokes =
  withDataFile "data/langs/mandarin/cjkvi-ids/ucs-strokes.txt" \dataHandle -> do
    let loop accDict@(ChineseDict !acc) = do
          isEOF <- dataHandle & Sys.hIsEOF
          if isEOF
            then return accDict
            else do
              line <- ByteString.Char8.hGetLine dataHandle
              let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
              let splitOnChar c = ByteString.split (fromIntegral (fromEnum c))
              case line & splitOnChar '\t' of
                [unicodeLit, decodeUtf8 -> unicodeChar, chineseStrokesLit]
                  | unicodeLit & ByteString.isPrefixOf "U+" ->
                      loop $
                        ChineseDict $
                          acc & Map.insert unicodeChar mempty{chineseStrokes}
                  where
                    chineseStrokes :: Maybe Natural =
                      chineseStrokesLit
                        & splitOnChar ','
                        <&> ( decodeUtf8
                                >>> ShortText.unpack
                                >>> readMaybe @Natural
                            )
                        -- CorrectnessWarning: conservative bias toward the greatest number of strokes.
                        -- Fixed: 'List.maximum' is partial and crashed on an
                        -- empty strokes field; a total fold yields Nothing instead
                        -- ('Nothing' is the minimum of 'Maybe', so nonempty
                        -- inputs give the same result as before).
                        & foldr max Nothing
                _ -> loop accDict
    loop mempty
209
-- | Read character decompositions from the CHISE IDS (Ideographic
-- Description Sequence) data file into 'chineseComponents'.
-- Each line is tab-separated: @U+XXXX \<char\> \<decomp\>[\[codes\]]…@;
-- lines not matching that shape are skipped.
readChiseDecomp :: IO ChineseDict
readChiseDecomp =
  withDataFile "data/langs/mandarin/cjkvi-ids/ids.txt" $ (`loop` mempty)
  where
    loop dataHandle accDict@(ChineseDict !acc) = do
      isEOF <- dataHandle & Sys.hIsEOF
      if isEOF
        then return accDict
        else do
          line <- ByteString.Char8.hGetLine dataHandle
          let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
          let splitOnChar c = ByteString.split (fromIntegral (fromEnum c))
          case line & splitOnChar '\t' of
            unicodeLit : (decodeUtf8 -> unicodeChar) : decomps
              | unicodeLit & ByteString.isPrefixOf "U+" ->
                  let
                    -- Each decomposition, paired with its optional
                    -- trailing source codes (the "[…]"-suffixed letters).
                    decompWithCodes :: [([Char], Set Char)] =
                      decomps <&> \decomp ->
                        case decomp & ByteString.stripSuffix "]" of
                          Nothing ->
                            ( decomp & decodeUtf8 & ShortText.unpack
                            , mempty
                            )
                          Just decompWithCode ->
                            ( decompChars
                                & ShortText.unpack
                                -- ExplanationNote: remove the final '['
                                & List.init
                            , decompCodes & ShortText.unpack & fromList
                            )
                            where
                              (decompChars, decompCodes) =
                                decompWithCode
                                  & decodeUtf8
                                  & ShortText.breakEnd (== '[')
                    chineseComponents =
                      -- FIXME: CorrectnessWarning: maybe select using the Codes
                      -- instead of just taking the first.
                      -- Fixed: 'List.head' was partial; a line with no
                      -- decomposition now yields no components instead of crashing.
                      case decompWithCodes of
                        [] -> Nothing
                        (firstDecomp, _codes) : _ ->
                          firstDecomp
                            -- ExplanationNote: remove IDCs (Ideographic Description Characters)
                            -- namely "⿰" (U+2FF0) to "⿻" (U+2FFB).
                            & List.filter (\c -> not ('\x2FF0' <= c && c <= '\x2FFB'))
                            -- Drop self-referential components.
                            & List.filter (\c -> ShortText.singleton c /= unicodeChar)
                            & Just
                  in
                    loop dataHandle $
                      ChineseDict $
                        acc & Map.insert unicodeChar mempty{chineseComponents}
            _ -> loop dataHandle accDict
261
-- | Assemble the full dictionary by reading and merging all enabled
-- data sources; 'fold' sequences the 'IO' actions and merges their
-- results with the key-wise 'Semigroup' of 'ChineseDict'.
readChineseDict :: IO ChineseDict
readChineseDict =
  fold $
    list
      [ return extraEntries
      , readLoachWangFreq
      , readChiseStrokes
      , readChiseDecomp
      , -- , readChineseLexicalDatabase
        readCEDICT
        -- , readLoachWangStrokes
        -- , readOutlierDecomp
      ]
-- | Hand-maintained entries supplementing the external data sources:
-- the middle dot, a few species/botany words, the ASCII digits
-- (with pinyins of their Chinese readings), and single Latin letters
-- and symbols given a nominal stroke count of 1.
extraEntries :: ChineseDict
extraEntries =
  ChineseDict $
    [
      [
        ( "·"
        , mempty
            { chineseStrokes = Just 1
            , chineseComponents = Just ['·']
            }
        )
      ,
        ( "库蠓属"
        , mempty
            { chinesePinyins = ["ku4", "meng3", "shi4"]
            , chineseEnglish = ["culicoides"]
            }
        )
      ,
        ( "瘦果"
        , mempty
            { chinesePinyins = ["shou4", "guo3"]
            , chineseEnglish = ["achene"]
            }
        )
      ,
        ( "美洲野马"
        , mempty
            { chinesePinyins = ["mei3", "zhou1", "ye3", "ma3"]
            , chineseEnglish = ["mustang horse"]
            }
        )
      ,
        ( "钩粉蝶"
        , mempty
            { chinesePinyins = ["gou1", "fen3", "die2"]
            , chineseEnglish = ["brimstone butterfly"]
            }
        )
      ,
        ( "核果"
        , mempty
            { chinesePinyins = ["he2", "guo3"]
            , chineseEnglish = ["drupe"]
            }
        )
      ,
        ( "0"
        , mempty
            { chineseStrokes = Just 14
            , chinesePinyins = ["ling2"] -- 零
            }
        )
      ,
        ( "1"
        , mempty
            { chineseStrokes = Just 2
            , chinesePinyins = ["yi1"]
            }
        )
      ,
        ( "2"
        , mempty
            { chineseStrokes = Just 2
            , chinesePinyins = ["er4"]
            }
        )
      ,
        ( "3"
        , mempty
            { chineseStrokes = Just 2
            , chinesePinyins = ["san1"]
            }
        )
      ,
        ( "4"
        , mempty
            { chineseStrokes = Just 2
            , chinesePinyins = ["si4"]
            }
        )
      ,
        ( "5"
        , mempty
            { chineseStrokes = Just 2
            , chinesePinyins = ["wu3"]
            }
        )
      ,
        ( "6"
        , mempty
            { chineseStrokes = Just 1
            , chinesePinyins = ["liu4"]
            }
        )
      ,
        ( "7"
        , mempty
            { chineseStrokes = Just 1
            , chinesePinyins = ["qi1"]
            }
        )
      ,
        ( "8"
        , mempty
            { chineseStrokes = Just 2
            , chinesePinyins = ["ba1"]
            }
        )
      ,
        ( "9"
        , mempty
            { chineseStrokes = Just 1
            , chinesePinyins = ["jiu3"]
            }
        )
      ]
    , -- Single Latin letters and a few symbols, so they never block a
      -- lookup: nominal stroke count 1, no pinyin.
      [ ( letter & ShortText.singleton
        , mempty
            { chineseStrokes = Just 1
            , chinesePinyins = []
            }
        )
      | letter <-
          [ ['A' .. 'Z']
          , ['a' .. 'z']
          , ['π', '□', '○']
          ]
            & mconcat
      ]
    ]
      & mconcat
      & Map.fromList
408
-- | Read the LoachWang word-order JSON file as a raw JSON array.
readLoachWangWordOrder :: IO JSON.Array
readLoachWangWordOrder =
  readJSON "data/langs/mandarin/LoachWang/loach_word_order.json" $
    JSON.withArray "LoachWordOrder" return
413
-- | Read Outlier-style character decompositions from JSON: each key
-- maps to an array of component strings, concatenated into
-- 'chineseComponents'.
-- NOTE(review): currently unused by 'readChineseDict' (commented out there).
readOutlierDecomp :: IO ChineseDict
readOutlierDecomp =
  readJSON "data/langs/mandarin/LoachWang/outlier_decomp.json" $
    JSON.withObject "OutlierDecomp" \kv ->
      ChineseDict
        . Map.fromAscList
        <$> forM (kv & JSON.toMap & Map.toList) \(k, v) ->
          v & JSON.withArray "Decomp" \decomp ->
            forM
              (decomp & toList)
              ( \sub ->
                  sub & JSON.withText "Atom" \atom ->
                    return $ atom & Text.unpack
              )
              <&> \atoms ->
                ( k & JSON.DictKey.toShortText
                , mempty{chineseComponents = Just $ atoms & mconcat}
                )
432
-- | Invert the decomposition relation: map each component character
-- to the dictionary keys that use it as a component.
reverseDecomp :: ChineseDict -> Map {-def-} ShortText {-refs-} [ShortText]
reverseDecomp (ChineseDict dict) =
  Map.fromListWith
    (<>)
    [ ShortText.singleton component := [key]
    | (key, entry) <- Map.toList dict
    , component <- chineseDecomp key entry
    ]
440
-- | Learning benefit of knowing a key (a probability-derived gain).
type Benefit = Probability
-- | Learning cost of a key (roughly, an accumulated stroke count).
type Cost = Natural
-- | Ordering weight: benefit divided by cost.
type Weight = Probability
type DictKey = ShortText

-- | Benefit and cost computed per dictionary key.
data Weights = Weights
  { weightsMap :: Map DictKey (Benefit, Cost)
  }
  deriving (Show)
449
-- |
-- Compute a learning benefit and cost for every dictionary key,
-- following:
--
-- Zhao, K., zhao, D., Yang, J., Ma, W., & Yu, B. (2019).
-- Efficient Learning Strategy of Chinese Characters Based on
-- Usage Frequency and Structural Relationship.
-- 2019 IEEE 4th International Conference on Big Data Analytics (ICBDA).
-- doi:10.1109/icbda.2019.8713245
-- https://sci-hub.se/10.1109/icbda.2019.8713245
--
-- Benefit is the key's own frequency plus the (cost-scaled) benefits
-- of the keys that use it as a component; cost is the stroke count,
-- summed over components for composite characters and words.
-- NOTE(review): recursion follows the decomposition graph — assumes it
-- is acyclic after the self-reference deletions; a cycle would loop.
dictToWeights :: ChineseDict -> Weights
dictToWeights cn@(ChineseDict dict) = ST.runST do
  -- Explanation: memo tables, to avoid recomputing benefits and costs
  keyToBenefitST :: ST.STRef st (Map DictKey Benefit) <- ST.newSTRef Map.empty
  keyToCostST :: ST.STRef st (Map DictKey Cost) <- ST.newSTRef Map.empty
  -- component -> keys that contain it
  let refsMap :: Map ShortText [ShortText] = reverseDecomp cn
  let
    -- Memoized benefit of one key.
    keyToBenefitLoop :: DictKey -> ST.ST st Benefit
    keyToBenefitLoop key = do
      -- traceShowM ("keyToBenefitLoop"::String, key)
      keyToBenefitMap <- keyToBenefitST & ST.readSTRef
      if Set.member key circledNumbers
        then
          -- Explanation: circledNumbers induce a cost but zero benefit
          return proba0
        else
          keyToBenefitMap
            & Map.lookup key
            & \case
              Just bene -> return bene
              Nothing -> do
                -- Keys referencing this one (excluding itself).
                let refs :: [ShortText] =
                      refsMap
                        & Map.lookup key
                        & fromMaybe []
                        -- & fromMaybe []
                        & List.delete key
                -- traceShowM ("keyToBenefitLoop"::String, key, refs)
                -- Own frequency; absent entries count as zero.
                let freq =
                      dict
                        & Map.lookup key
                        & maybe proba0 (chineseFrequency >>> fromMaybe proba0)
                refsBC <- forM refs \ref -> do
                  bene <- keyToBenefitLoop ref
                  cost <- keyToCostLoop ref
                  return (bene, cost)
                cost <- keyToCostLoop key
                -- benefit = freq + cost * sum(refBenefit / refCost),
                -- skipping zero-cost referrers to avoid division by zero.
                bene <-
                  arithM $
                    probability $
                      runProbability freq
                        + toRational cost
                          * sum
                            [ runProbability b / toRational c
                            | (b, c) <- refsBC
                            , c /= 0
                            ]
                ST.modifySTRef keyToBenefitST $ Map.insert key bene
                return bene
    -- Memoized cost of one key.
    keyToCostLoop :: DictKey -> ST.ST st Cost
    keyToCostLoop key = do
      -- traceShowM ("keyToCostLoop"::String, key)
      keyToCostMap <- keyToCostST & ST.readSTRef
      if Set.member key circledNumbers
        then
          -- Explanation: circledNumbers induce a cost given by their number
          return $ dict & Map.lookup key & maybe 1 (chineseStrokes >>> fromMaybe 1)
        else
          keyToCostMap
            & Map.lookup key
            & \case
              Just cost -> return cost
              Nothing -> do
                let entry = dict & Map.lookup key & fromMaybe mempty
                cost <-
                  -- single char
                  if ShortText.length key == 1
                    then do
                      let deps =
                            entry
                              & chineseDecomp key
                              & List.delete (ShortText.unpack key & List.head)
                      -- & (`Set.difference` (ShortText.unpack key & Set.fromList))
                      if null deps
                        then -- atom char: its own stroke count (default 1)
                          return $
                            entry
                              & chineseStrokes
                              & fromMaybe 1
                        else -- & fromMaybe (error $ "dictToWeights: missing chineseStrokes for: " <> ShortText.unpack key)
                        do
                          -- composite char: sum of component costs
                          -- traceShowM ("composite"::String, show key, show <$> Set.toList deps
                          -- , deps & (`Set.difference` (ShortText.unpack key & Set.fromList))
                          -- )
                          deps
                            & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton))
                            <&> getSum
                    else -- word: sum of per-character costs
                    do
                      -- traceShowM ("word"::String, key)
                      key
                        & ShortText.unpack
                        & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton))
                        <&> getSum
                ST.modifySTRef keyToCostST $ Map.insert key cost
                return cost
  weightsMap <-
    dict & Map.traverseWithKey \key _entry -> do
      -- traceShowM ("dictToWeights"::String, key)
      bene <- keyToBenefitLoop key
      cost <- keyToCostLoop key
      return (bene, cost)
  return
    Weights
      { weightsMap
      }
564
-- | Keys grouped by weight, in decreasing weight order ('Down').
newtype ChineseOrder = ChineseOrder (Map (Down Weight) Weights)

-- | Unwrap the underlying 'Map'.
unChineseOrder (ChineseOrder x) = x

-- | Order keys by weight = benefit / cost (highest first); keys that
-- share a weight are grouped into one 'Weights'.
-- NOTE(review): a zero 'Cost' would make @runProbability b / toRational c@
-- divide by zero — confirm 'dictToWeights' never yields cost 0.
dictOrder :: Weights -> ChineseOrder
dictOrder w =
  [ ( probability (runProbability b / toRational c) & arithError & Down
    , Weights
        { weightsMap = Map.singleton k v
        }
    )
  | (k, v@(b, c)) <- w & weightsMap & Map.toList
  ]
    -- Keys are unique within a 'Weights', so 'unionWith' never needs
    -- its combining function; 'error' documents that invariant.
    & Map.fromListWith (\x y -> Weights{weightsMap = Map.unionWith (error "dictOrder") (weightsMap x) (weightsMap y)})
    & ChineseOrder
578
579 dictOrderIO = readChineseDict <&> (dictToWeights >>> dictOrder)
580
-- order100 :: IO [(DictKey, Maybe ChineseDictEntries)]
-- | Debug helper: the keys of the 100 highest-weighted groups, each
-- paired with its pinyins and English glosses when the dictionary has
-- an entry for it.
order100 = do
  dict@(ChineseDict d) <- readChineseDict
  let ChineseOrder o = dict & dictToWeights & dictOrder
  return $
    o
      & Map.take 100
      & Map.elems
      & foldMap (weightsMap >>> Map.keys)
      <&> \k -> (k, d & Map.lookup k <&> \r -> (chinesePinyins r, chineseEnglish r))
591
-- | Debug helper: group weight buckets by how many keys share the same
-- weight (left-biased 'Map' union merges buckets of equal size).
dictOrderBySameWeight (ChineseOrder o) =
  [ ( Map.size (weightsMap v)
    , Map.singleton k v
    )
  | (k, v) <- o & Map.toList
  ]
    & Map.fromListWith (<>)
599
600 {-
601 data UnfoldedGraph node lbl edge = UnfoldedGraph
602 { unfoldedVertexToEdges :: !(Array.Array Vertex (lbl, Map edge IntSet))
603 , unfoldedNodeToVertex :: !(Map node Vertex)
604 }
605 deriving (Show, Generic)
606 unfoldGMany ::
607 forall node edge nodeLabel.
608 Ord node =>
609 (node -> (nodeLabel, Map edge (Set node))) ->
610 [node] ->
611 UnfoldedGraph node nodeLabel edge
612 unfoldGMany gen roots = ST.runST do
613 nodeToVertexST :: ST.STRef st (Map node Vertex) <- ST.newSTRef Map.empty
614 vertexToEdgesST :: ST.STRef st [(Vertex, (nodeLabel, Map edge IntSet))] <- ST.newSTRef []
615 lastVertexST <- ST.newSTRef firstVertex
616 let newVertex = do
617 v <- ST.readSTRef lastVertexST
618 ST.writeSTRef lastVertexST (v + 1)
619 return v
620 let
621 nodeToVertexDepthFirst :: node -> ST.ST st Vertex
622 nodeToVertexDepthFirst src =
623 nodeToVertexST
624 & ST.readSTRef
625 <&> Map.lookup src
626 >>= \case
627 -- DescriptionNote: `src` has already been seen,
628 -- just return its unique `Vertex`.
629 Just v -> return v
630 -- DescriptionNote: `src` has never been seen yet,
631 -- allocate a `newVertex` for it
632 Nothing -> do
633 dst <- newVertex
634 ST.modifySTRef nodeToVertexST $ Map.insert src dst
635 let (nodeLabel, edgeToDsts) = gen src
636 edgeToDstsV :: Map edge IntSet <-
637 edgeToDsts & Map.traverseWithKey \_edge dsts ->
638 forM (dsts & Set.toList) nodeToVertexDepthFirst
639 <&> IntSet.fromList
640 let res = (dst, (nodeLabel, edgeToDstsV))
641 ST.modifySTRef vertexToEdgesST (res :)
642 return dst
643 forM_ roots nodeToVertexDepthFirst
644 nodeToVertex <- ST.readSTRef nodeToVertexST
645 vertexToEdges <- ST.readSTRef vertexToEdgesST
646 lastId <- ST.readSTRef lastVertexST
647 return
648 UnfoldedGraph
649 { unfoldedVertexToEdges = vertexToEdges & Array.array (firstVertex, lastId - 1)
650 , unfoldedNodeToVertex = nodeToVertex
651 }
652 where
653 firstVertex :: Vertex = 0
654 -}
655
656 -- mostComp d =
657 -- [
658 -- | d & reverseDecomp & Map.toList
659 -- ] & Map.fromListWith (<>)
660
-- | The five Mandarin tones; 'ChineseTone5' is the neutral tone
-- (no diacritic). 'Enum' maps 'ChineseTone1' to 0.
data ChineseTone
  = ChineseTone1
  | ChineseTone2
  | ChineseTone3
  | ChineseTone4
  | ChineseTone5
  deriving (Eq, Ord, Show, Enum)
668
-- | Convert a numbered pinyin syllable (e.g. @"ma3"@, @"lu:3"@, @"lv3"@)
-- to diacritic pinyin (e.g. @"mǎ"@, @"lǚ"@).
--
-- Standard placement rules: the tone mark goes on @a@ or @e@ when
-- present, on the @o@ of @ou@, otherwise on the last vowel.
-- (NOTE(review): "Piniyn" in the name is a typo, kept for callers.)
numberedPinyinToDiacriticPiniyn :: ShortText -> ShortText
numberedPinyinToDiacriticPiniyn numPin =
  case pinTone of
    Nothing -> pinRoot
    Just tone ->
      pinRoot
        & ShortText.unpack
        & addUmlaut
        & setDiacritic
        & ShortText.pack
      where
        -- "u:" and "v" are the conventional ASCII spellings of 'ü'.
        addUmlaut ('u' : ':' : cs) = 'ü' : addUmlaut cs
        addUmlaut ('v' : cs) = 'ü' : addUmlaut cs
        addUmlaut ('U' : ':' : cs) = 'Ü' : addUmlaut cs
        addUmlaut ('V' : cs) = 'Ü' : addUmlaut cs
        addUmlaut (c : cs) = c : addUmlaut cs
        addUmlaut [] = []
        -- CorrectnessNote: a, e or the o in ou always get the marker
        setDiacritic (c@'a' : cs) = convert tone c : cs
        setDiacritic (c@'e' : cs) = convert tone c : cs
        setDiacritic (c@'o' : 'u' : cs) = convert tone c : 'u' : cs
        setDiacritic (c : cs)
          -- CorrectnessNote: if no a, e, or ou found, the tone mark goes on the last vowel
          | List.elem c vowels && all (`List.notElem` vowels) cs = convert tone c : cs
          | otherwise = c : setDiacritic cs
        setDiacritic [] = []
        vowels = lowerVowels <> (Char.toUpper <$> lowerVowels)
        lowerVowels = ['a', 'e', 'i', 'o', 'u', 'ü'] & list
  where
    (pinRoot, pinTone) ::
      (ShortText, Maybe ChineseTone) =
        numPin & ShortText.spanEnd Char.isDigit & second \case
          "" -> Nothing
          -- NOTE(review): 'read' is safe here ('spanEnd isDigit'
          -- guarantees digits), but 'toEnum' still errors on a tone
          -- number outside 1..5 — confirm inputs.
          ds -> ds & ShortText.unpack & read & (\x -> x - (1 :: Int)) & toEnum & Just
    -- Put the tone diacritic on one vowel.
    -- Fixed: the former case ladder ended in @_ -> undefined@ (partial);
    -- this table-driven version is total and returns the character
    -- unchanged when no diacritic applies.
    convert :: ChineseTone -> Char -> Char
    convert ChineseTone5 c = c -- neutral tone: no mark
    convert t c =
      case List.lookup c tonedVowels of
        Nothing -> c
        Just marks ->
          -- 'fromEnum t' indexes tones 1..4 as 0..3.
          case marks & List.drop (fromEnum t) of
            m : _ -> m
            [] -> c
      where
        -- For each vowel, its forms under tones 1, 2, 3, 4.
        tonedVowels :: [(Char, [Char])]
        tonedVowels =
          [ ('a', "āáǎà")
          , ('e', "ēéěè")
          , ('i', "īíǐì")
          , ('o', "ōóǒò")
          , ('u', "ūúǔù")
          , ('ü', "ǖǘǚǜ")
          , ('A', "ĀÁǍÀ")
          , ('E', "ĒÉĚÈ")
          , ('I', "ĪÍǏÌ")
          , ('O', "ŌÓǑÒ")
          , ('U', "ŪÚǓÙ")
          , ('Ü', "ǕǗǙǛ")
          ]
766
-- | HSK 3.0 proficiency levels 1 to 6; 'fromEnum' + 1 gives the level
-- number used in the data-file names (see 'readHSK').
data HskLevel
  = HskLevel301
  | HskLevel302
  | HskLevel303
  | HskLevel304
  | HskLevel305
  | HskLevel306
  deriving (Eq, Ord, Enum, Show)
775
776 -- instance CSV.FromRecord ChineseDictEntries
777 -- instance CSV.ToRecord ChineseDictEntries
778 -- instance CSV.FromNamedRecord ChineseDictEntries
779 -- instance CSV.ToNamedRecord ChineseDictEntries
780 -- instance CSV.DefaultOrdered ChineseDictEntries
781
-- | Feed the next chunk (at most 4096 bytes) of a file to an
-- incremental-CSV continuation; feeds the empty string at end of file
-- to signal completion.
feed :: (ByteString -> r) -> Sys.Handle -> Sys.IO r
feed k csvFile = do
  isEOF <- Sys.hIsEOF csvFile
  if isEOF
    then return (k "")
    else k <$> ByteString.hGetSome csvFile 4096
787
-- | Read one HSK level's word list from its CSV file
-- (columns: word, space-separated pinyins, English gloss).
-- Rows with the wrong column count are skipped; CSV-level failures
-- abort the program.
readHSK :: HskLevel -> IO ChineseDict
readHSK chineseHSK = do
  withDataFile ("data/langs/mandarin/HSK/hsk" <> show hskIndex Sys.<.> "csv") \fileHandle -> do
    loop fileHandle mempty $
      CSV.Incremental.decodeWithP parser decodeOpts CSV.NoHeader
  where
    -- Levels are numbered from 1 in the file names.
    hskIndex = chineseHSK & fromEnum & (+ 1)
    decodeOpts = CSV.defaultDecodeOptions
    parser :: CSV.Record -> CSV.Parser (ShortText, ChineseDictEntries)
    parser v
      | length v == 3 = do
          chinese <- v .! 0
          chinesePinyins <- v .! 1 <&> ShortText.split (== ' ')
          chineseEnglish <- v .! 2 <&> pure
          let chineseFrequency = Nothing
          let chineseComponents = Nothing
          let chineseStrokes = Nothing
          pure (chinese, ChineseDictEntries{..})
      | otherwise = empty
    -- Per-row failures are printed and skipped.
    check =
      either (\x -> Sys.print x >> return mempty) \(chinese, e) ->
        return $ ChineseDict $ Map.singleton chinese e
    -- Pump the incremental decoder, accumulating a dictionary.
    loop fileHandle !acc = \case
      CSV.Incremental.Fail _ errMsg -> do
        Sys.putStrLn errMsg
        Sys.exitFailure
      CSV.Incremental.Many rs k -> do
        ok <- rs & foldMapM check
        t <- feed k fileHandle
        loop fileHandle (acc <> ok) t
      CSV.Incremental.Done rs -> do
        ok <- rs & foldMapM check
        return (acc <> ok)
821
-- | Sun, C.C., Hendrix, P., Ma, J. et al.
-- Chinese lexical database (CLD).
-- Behav Res 50, 2606–2629 (2018).
-- https://doi.org/10.3758/s13428-018-1038-3
--
-- Reads word, length, per-character pinyins and frequency-per-million
-- from the CLD CSV (column indices are hard-coded against v2.1).
readChineseLexicalDatabase :: IO ChineseDict
readChineseLexicalDatabase = do
  withDataFile "data/langs/mandarin/chineselexicaldatabase2.1.csv" \fileHandle -> do
    loop fileHandle mempty $
      CSV.Incremental.decodeWithP parser decodeOpts CSV.HasHeader
  where
    decodeOpts = CSV.defaultDecodeOptions
    parser :: CSV.Record -> CSV.Parser (DictKey, ChineseDictEntries)
    parser v
      | length v == 269 = do
          key <- v .! 0
          keyLength :: Int <- v .! 5
          pinyin1 <- v .! 15
          pinyin2 <- v .! 16
          pinyin3 <- v .! 17
          pinyin4 <- v .! 18
          -- Only the word's first 'keyLength' pinyin columns are used.
          let chinesePinyins =
                [pinyin1, pinyin2, pinyin3, pinyin4]
                  & List.take keyLength
          freqPerMillion :: Double <- v .! 39
          -- Normalize per-million counts to a probability.
          let chineseFrequency =
                freqPerMillion
                  & toRational
                  & (/ 1000000)
                  & probability
                  & fromMaybe (error "readChineseLexicalDatabase: Frequency")
                  & Just
          pure (key, mempty{chineseFrequency, chinesePinyins})
      | otherwise = error $ "readChineseLexicalDatabase: wrong number of columns: " <> show v
    -- Per-row failures are printed and skipped.
    check =
      either (\x -> Sys.print x >> return mempty) \(key, val) ->
        return $ ChineseDict $ Map.singleton key val
    -- Pump the incremental decoder, accumulating a dictionary.
    loop fileHandle !acc = \case
      CSV.Incremental.Fail _ errMsg -> do
        Sys.putStrLn errMsg
        Sys.exitFailure
      CSV.Incremental.Many rs k -> do
        ok <- rs & foldMapM check
        t <- feed k fileHandle
        loop fileHandle (acc <> ok) t
      CSV.Incremental.Done rs -> do
        ok <- rs & foldMapM check
        return (acc <> ok)
869
-- | Read the CC-CEDICT dictionary file: skip the leading @#@ comment
-- header, then parse each entry line with a small byte-level 'MT.State'
-- parser, keeping the simplified form, the pinyins and the glosses.
readCEDICT :: IO ChineseDict
readCEDICT = do
  withDataFile "data/langs/mandarin/CEDICT/cedict_ts.u8" \cedictHandle -> do
    -- Consume consecutive lines starting with '#'.
    let skipHeader = do
          isEOF <- cedictHandle & Sys.hIsEOF
          if isEOF
            then return ()
            else do
              lineBS <- ByteString.hGetLine cedictHandle
              let lineST = lineBS & ShortText.fromByteString & fromMaybe (error "invalid UTF-8")
              let begin = lineST & ShortText.take 1
              when (begin == "#") do
                skipHeader
    skipHeader
    let loop !acc = do
          isEOF <- cedictHandle & Sys.hIsEOF
          if isEOF
            then return acc
            else do
              line <- ByteString.hGetLine cedictHandle
              let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
              -- DescriptionNote: each line is formatted as: #(.+) (.+) \[(.+)] /(.*)/#'
              -- i.e.: traditional simplified [pinyins] /gloss/gloss/…/
              let skipChar c = void $ MT.state $ ByteString.span (== fromIntegral (fromEnum c))
              let skipPrefix p =
                    MT.modify' $
                      ByteString.stripPrefix p
                        >>> fromMaybe (error $ "skipPrefix fail to match: " <> show p)
              let breakOnChar c = ByteString.break (== fromIntegral (fromEnum c))
              let breakOnSpace = breakOnChar ' '
              let skipSuffix p =
                    MT.modify' $
                      ByteString.stripSuffix p
                        >>> fromMaybe
                          ( error $
                              "skipSuffix: mismatch: "
                                <> show p
                                <> "\n on line: "
                                <> ShortText.unpack (decodeUtf8 line)
                                <> "\n escaped: "
                                <> show (ShortText.unpack (decodeUtf8 line))
                          )
              -- State is the unconsumed remainder of the line.
              let (dict, leftover) = (`MT.runState` line) do
                    _chineseTrad <- MT.state $ breakOnSpace >>> first decodeUtf8
                    skipChar ' '
                    chineseSimpl <- MT.state $ breakOnSpace >>> first decodeUtf8
                    skipChar ' '
                    skipPrefix "["
                    chinesePinyins <-
                      MT.state $
                        breakOnChar ']'
                          >>> first
                            ( \s ->
                                s
                                  & ByteString.split (fromIntegral (fromEnum ' '))
                                  -- CorrectnessWarning: convert pinyin to lowercase, and remove dashes
                                  <&> (decodeUtf8 >>> ShortText.unpack >>> (<&> Char.toLower) >>> ShortText.pack)
                                  & List.filter (/= "-")
                            )
                    skipPrefix "] /"
                    -- CorrectnessNote: some lines do not end with \r
                    -- hence make it optional.
                    MT.modify' \s -> s & ByteString.stripSuffix "\r" & fromMaybe s
                    skipSuffix "/"
                    chineseEnglish <- MT.gets \s -> s & ByteString.split (fromIntegral (fromEnum '/')) <&> decodeUtf8
                    MT.put mempty
                    -- Only the simplified form is kept as the key.
                    let chinese = chineseSimpl
                    return $
                      ChineseDict $
                        Map.singleton chinese $
                          mempty
                            { chinesePinyins
                            , chineseEnglish
                            }
              if not (ByteString.null leftover)
                then error $ "parserLine: leftover: " <> show leftover
                else loop (acc <> dict)
    loop mempty
947
-- | Look up the pinyin syllables of a word.
-- 'error's when the word is absent or has no recorded pinyins.
lookupPinyins :: ChineseDict -> ShortText -> [ShortText]
lookupPinyins (ChineseDict dict) word =
  case Map.lookup word dict of
    Nothing -> error $ "lookupPinyins: no entry for: {" <> wordString <> "}"
    Just entry ->
      case chinesePinyins entry of
        [] -> error $ "lookupPinyins: empty entry for: " <> wordString
        pinyins -> pinyins
  where
    wordString = word & ShortText.unpack
957
-- | Annotate each whitespace-separated word of the input lexemes with
-- its pronunciation: decimal digits and CJK runs get a diacritic-pinyin
-- 'Pron.Pron' (via 'lookupPinyins'), punctuation passes through as
-- 'Left' characters, and anything else aborts via 'errorShow'.
pronunciation :: ChineseDict -> [Pron.Lexeme] -> [[Either Char Pron.Pron]]
pronunciation dict inp =
  inp & Pron.lexemesChars & Text.pack & Text.words <&> wordToPron
  where
    wordToPron :: Text -> [Either Char Pron.Pron]
    wordToPron wordText =
      -- Group the word's characters first by Unicode general category,
      -- then by Unicode block, and pronounce each run.
      [ pronun chars charCateg charBlock
      | (charCateg, uniBlockToChars) <-
          wordText
            & Text.unpack
            & List.map (\c -> (Char.generalCategory c, (Char.unicodeBlock c, c)))
            & Char.consecutiveGroups
            <&> (<&> Char.consecutiveGroups)
      , (charBlock, chars) <- uniBlockToChars
      ]
        & mconcat
      where
        pronun :: [Char] -> Char.GeneralCategory -> Maybe Char.UnicodeBlock -> [Either Char Pron.Pron]
        -- Digits: one 'Pron.Pron' per digit (the dictionary's
        -- 'extraEntries' provide pinyins for "0".."9").
        pronun chars Char.DecimalNumber _ =
          [ Right
              Pron.Pron
                { pronInput = [Pron.LexemeChar char]
                , pronRule =
                    Pron.rule
                      { Pron.rulePron =
                          Pron.Pronunciations
                            { Pron.unPronunciations =
                                [ Pron.RuleLexemes [Pron.LexemeChar char] :=
                                    Pron.Pronunciation
                                      { Pron.pronunciationIPABroad = []
                                      , Pron.pronunciationText = txt char
                                      }
                                ]
                            }
                      }
                }
          | char <- chars
          ]
          where
            txt c =
              c
                & ShortText.singleton
                & lookupPinyins dict
                <&> ( \p ->
                        p
                          & numberedPinyinToDiacriticPiniyn
                          & ShortText.toText
                          & Text.toLower
                    )
                & Text.intercalate " "
        -- Punctuation passes through unpronounced.
        pronun chars Char.OpenPunctuation _ =
          [ Left c
          | c <- chars
          ]
        pronun chars Char.ClosePunctuation _ =
          [ Left c
          | c <- chars
          ]
        pronun chars Char.OtherPunctuation _ =
          [ Left c
          | c <- chars
          ]
        -- A CJK run is pronounced as one word.
        pronun chars _ (Just Char.UnicodeBlockCJK{}) =
          [ Right
              Pron.Pron
                { pronInput = Pron.LexemeChar <$> chars
                , pronRule =
                    Pron.rule
                      { Pron.rulePron =
                          Pron.Pronunciations
                            { Pron.unPronunciations =
                                [ Pron.RuleLexemes (Pron.LexemeChar <$> chars) :=
                                    Pron.Pronunciation
                                      { Pron.pronunciationIPABroad = []
                                      , Pron.pronunciationText = txt
                                      }
                                ]
                            }
                      }
                }
          ]
          where
            txt =
              chars
                & ShortText.pack
                & lookupPinyins dict
                & List.map
                  ( \pinyin ->
                      pinyin
                        & numberedPinyinToDiacriticPiniyn
                        & ShortText.toText
                        & Text.toLower
                  )
                & Text.intercalate " "
        -- if List.length wordPinyins == Text.length wordText = wordPinyins
        -- where
        -- wordPinyins = chars
        -- & ShortText.pack
        -- & lookupPinyins dict
        pronun chars categ block = errorShow ("pronunciation" :: Text, chars, categ, block)
1058
-- | Render the weight-ordered dictionary as an HTML page: one ranked
-- line per key with pinyins, components (cross-linked by rank) and
-- glosses. @limit@ truncates the list to the first @n@ entries.
orderHTML :: Maybe Int -> ChineseDict -> IO _
orderHTML limit dict@(ChineseDict d) = do
  dataPath <- Self.getDataDir <&> File.normalise
  return $ Blaze.renderMarkupBuilder do
    HTML.docTypeHtml do
      HTML.head do
        HTML.title $ ("Chinese Order" <> maybe "" (\l -> " (first " <> show l <> ")") limit) & HTML.toHtml
        forM_
          ( [ "styles/Paper.css"
            , "styles/Rosetta/Common.css"
            , "styles/Rosetta/Reading.css"
            ]
              & list
          )
          \cssFile ->
            HTML.link
              ! HA.rel "stylesheet"
              ! HA.type_ "text/css"
              ! HA.href (dataPath </> cssFile & HTML.toValue)
      -- HTML.styleCSS $ pagesDifficulties & difficultyCSS
      HTML.body do
        -- Ranked keys, best weight first.
        let order =
              dict
                & dictToWeights
                & dictOrder
                & unChineseOrder
                & Map.elems
                & foldMap (weightsMap >>> Map.keys)
                -- Explanation: several entries can share the same 'Weight'
                -- hence compute ranks after merging.
                & ol1
        -- NOTE(review): 'Map.fromListWith undefined' bottoms on a
        -- duplicate key — relies on ranks being unique per key.
        let keyToRank = [(key, rank) | (rank, key) <- order] & Map.fromListWith undefined
        let rankId rank = "entry-" <> HTML.toValue (show rank)
        -- In-page anchor link to an entry by its rank.
        let rankLink rank =
              HTML.a
                ! styles
                  [ "color" := "blue"
                  , "text-decoration" := "none"
                  ]
                ! HA.href ("#" <> rankId rank)
                $ do
                  "#"
                  rank & HTML.toHtml
        HTML.div ! styles ["display" := "flex", "flex-direction" := "column"] $ do
          forM_
            ( order
                & maybe id List.take limit
            )
            \(rank :: Int, k) ->
              forM_ (d & Map.lookup k) \v -> do
                HTML.span
                  ! styles ["text-align" := "justify", "break-inside" := "avoid"]
                  ! HA.id (rankId rank)
                  $ do
                    rankLink rank
                    ". "
                    HTML.span $ k & HTML.toHtml
                    -- Pinyins, when any: "[pin yin] "
                    unless (v & chinesePinyins & List.null) do
                      "["
                      HTML.span $ v & chinesePinyins <&> ShortText.toText & Text.unwords & HTML.toHtml
                      "] "
                    -- Components, when any, each linked to its own rank.
                    unless (v & chineseDecomp k & List.null) do
                      "("
                      let decomps =
                            [ (component, componentRankMaybe) :: (ShortText, Maybe Int)
                            | component <- chineseDecomp k v <&> ShortText.singleton
                            , let componentRankMaybe = keyToRank & Map.lookup component
                            ]
                      HTML.span do
                        decomps
                          <&> ( \(component, componentRankMaybe) -> do
                                  HTML.span do
                                    component & HTML.toHtml
                                    case componentRankMaybe of
                                      Nothing -> ""
                                      Just componentRank
                                        | Set.member component circledNumbers -> ""
                                        | otherwise -> rankLink componentRank
                              )
                          & List.intersperse " "
                          & mconcat
                      ") "
                    -- English glosses, greyed out.
                    HTML.span ! styles ["color" := "#666"] $ do
                      v & chineseEnglish <&> ShortText.toText & Text.intercalate "; " & HTML.toHtml
1143
-- | Render the dictionary as a Graphviz DOT document.
--
-- NOTE(review): currently a stub — only the @digraph g@ header line is
-- emitted (no braces, no node/edge body), so the produced output is not yet
-- a complete DOT graph.  The large block comment below is kept verbatim: it
-- is the work-in-progress body, apparently adapted from an MST/cluster
-- visualisation (it references names like @rangeToMstToGroupToClusters@ and
-- @sortedMSF@ that are not in scope here).
orderDOT :: ChineseDict -> IO BS.Builder
orderDOT dict@(ChineseDict d) = do
  DOT.runDOT do
    -- dotComments [(BS.lazyByteString $ LazyText.encodeUtf8 $ pShow $ rangeToMstToGroupToClusters & Map.map Map.keys)]
    -- pTraceShow ("num of nodes", Map.size nodeToBranch, "num of branches", Map.size msf) $
    DOT.dotLine "digraph g"

{-
    dotBlock do
      dotLine "splines=\"ortho\""
      indexFrom1M_ (rangeToMstToGroupToClusters & Map.toList) \(srcR, mstToGroupToClusters) srcRI -> do
        let srcRB = "r" <> BS.intDec srcRI
        dotLine $ "subgraph cluster_" <> srcRB
        dotBlock do
          dotComments ["Create a node for the range " <> srcRB]
          dotNode
            srcRB
            [ ("shape", "box")
            , ("label", builderQuotedString (showHuman srcR))
            , ("color", "gray")
            , ("style", "filled")
            , ("fillcolor", "gray")
            ]
          dotLine "color=gray"
          dotBlock do
            dotLine "rank=same"
            dotComments ["Create the cluster nodes within the range " <> srcRB]
            forM_ (mstToGroupToClusters & Map.toList) \(mstI, groupToClusters) -> do
              forM_ (groupToClusters & Map.toList) \(srcGroup, srcClusters) -> do
                dotNodeCluster
                  srcRI
                  mstI
                  srcGroup
                  [
                    ( "label"
                    , builderQuotedString $
                        (srcClusters & toList <&> showHuman & List.unlines)
                          <> "\nT"
                          <> printf "%03d" mstI
                          <> "\nS"
                          <> show scaleI
                          -- <> {-maybe ""-} (("\n" <>) . showSimilarity) minSimil
                    )
                  , ("style", "filled")
                  , -- , minSimil & {-maybe ("", "")-} (\s -> ("fillcolor", 1 + ((floor (runProbability s * 10)) `mod` 10) & BS.intDec))
                    ("colorscheme", "ylorrd9")
                  , ("shape", "box")
                  ]
            dotComments ["Horizontally align the cluster nodes within the same range"]
            let row =
                  [ (mstI, group)
                  | (mstI, groupToClusters) <- mstToGroupToClusters & Map.toList
                  , (group, _clusters) <- groupToClusters & Map.toList
                  ]
            case row of
              [] -> return ()
              c@(firstMst, firstGroup) : cs -> do
                dotEdges
                  [srcRB, srcRB <> "t" <> BS.intDec firstMst <> "c" <> BS.intDec firstGroup]
                  [ ("style", "invis")
                  ]
                cs & (`foldM_` c) \(srcMst, srcGroup) dst@(dstMst, dstGroup) -> do
                  dotEdgesCluster
                    [(srcRI, srcMst, srcGroup), (srcRI, dstMst, dstGroup)]
                    [ ("weight", "10")
                    , ("style", "invis")
                    ]
                  return dst
      indexFrom1M_ sortedMSF \mst mstI -> do
        dotComments ["Create the edges of the MST " <> BS.intDec mstI]
        -- pTraceShowM (mstI, List.length (Tree.flatten mst))
        let loop (Tree.Node MSTNode{mstNodeRangeCluster = src} dsts) = do
              forM_ dsts \dstNode@(Tree.Node MSTNode{mstNodeRangeCluster = dst, mstNodeSimilarity = simil} _) -> do
                -- let similB = BS.stringUtf8 $ showFFloat (Just 2) (simil & runProbability & fromRational @Double) ""
                let indexRangeCluster (r, c) =
                      let clusterToGroup :: cluster :-> ClusterGroup =
                            Map.fromList
                              [ (cluster, group)
                              | (group, clusters) <-
                                  rangeToMstToGroupToClusters
                                    & Map.lookup r
                                    & fromMaybe Map.empty
                                    & Map.lookup mstI
                                    & fromMaybe Map.empty
                                    & Map.toList
                              , cluster <- clusters & Set.toList
                              ]
                       in ( 1 + Map.findIndex r rangeToMstToGroupToClusters
                          , mstI
                          , Map.lookup c clusterToGroup
                              & fromMaybe (error (LazyText.unpack (pShow ("r", r, "c", c {-, "clusterToGroup", clusterToGroup, "rangeToMstToGroupToClusters", rangeToMstToGroupToClusters-}))))
                          )
                dotEdgesCluster
                  [ indexRangeCluster src
                  , indexRangeCluster dst
                  ]
                  [ ("constraint", "false")
                  , ("color", (floor (runProbability simil * 10)) `mod` 10 & BS.intDec)
                  , ("colorscheme", "ylorrd9")
                  , -- , ("label", similB)
                    ("fontcolor", "blue")
                  , ("dir", "both")
                  , ("arrowhead", "dot")
                  , ("arrowtail", "dot")
                  ]
                loop dstNode
        loop mst
    dotRanges rangeToMstToGroupToClusters
-}