1 {-# LANGUAGE BangPatterns #-}
2 {-# LANGUAGE DeriveGeneric #-}
3 {-# LANGUAGE PackageImports #-}
4 {-# OPTIONS_GHC -Wno-unused-imports #-}
6 module Worksheets.Writing.Rosetta where
8 import Control.Applicative (Alternative (..), Applicative (..))
9 import Control.Monad (when)
10 import Control.Monad.Trans.State qualified as MT
11 import Data.ByteString (ByteString)
12 import Data.ByteString qualified as ByteString
13 import Data.ByteString.Builder (Builder)
14 import Data.ByteString.Short qualified as ShortByteString
15 import Data.Char qualified as Char
16 import Data.Csv ((.!))
17 import Data.Csv qualified as CSV
18 import Data.Csv.Incremental qualified as CSV.Incremental
19 import Data.Foldable (Foldable (..))
21 import Data.List qualified as List
22 import Data.Map.Strict qualified as Map
23 import Data.Maybe (Maybe (..))
24 import Data.Monoid (Ap (..), Monoid (..))
25 import Data.Text (Text)
26 import Data.Text qualified as Text
27 import Data.Text.Encoding qualified as Text
28 import Data.Text.Lazy qualified as Text.Lazy
29 import Data.Text.Short (ShortText)
30 import Data.Text.Short qualified as ShortText
31 import Debug.Pretty.Simple (pTraceShow, pTraceShowId, pTraceShowM)
32 import GHC.Generics (Generic)
33 import Paths_worksheets qualified as Self
34 import System.Exit qualified as Sys
35 import System.FilePath qualified as Sys
36 import System.FilePath.Posix ((</>))
37 import System.FilePath.Posix qualified as File
38 import System.IO qualified as Sys
40 import Text.Blaze.Html5 qualified as H
41 import Text.Blaze.Html5.Attributes qualified as HA
42 import Text.Blaze.Renderer.Utf8 qualified as Blaze
43 import Text.Show (Show (..))
46 import "base" Prelude (error, fromIntegral)
48 newtype ChineseDict = ChineseDict (Map ShortText [ChineseDictEntry])
50 instance Semigroup ChineseDict where
51 ChineseDict x <> ChineseDict y =
52 ChineseDict (Map.unionWithKey merge x y)
54 merge _k !xV !yV = xV <> yV
57 -- & traceString (List.unlines $
58 -- [ "Semigroup ChineseDict: key collision: " <> ShortText.unpack k <> " ("<>show k<>")"
59 -- , xV & pShowNoColor & Text.Lazy.unpack
60 -- , yV & pShowNoColor & Text.Lazy.unpack
62 instance Monoid ChineseDict where
63 mempty = ChineseDict mempty
64 data ChineseDictEntry = ChineseDictEntry
65 { chinese :: !ShortText
66 , pinyins :: ![ShortText]
67 , english :: ![ShortText]
68 , hskLevel :: !(Maybe HskLevel)
70 deriving (Generic, Show)
79 deriving (Eq, Ord, Enum, Show)
81 -- instance CSV.FromRecord ChineseDictEntry
82 -- instance CSV.ToRecord ChineseDictEntry
83 -- instance CSV.FromNamedRecord ChineseDictEntry
84 -- instance CSV.ToNamedRecord ChineseDictEntry
85 -- instance CSV.DefaultOrdered ChineseDictEntry
87 feed :: (ByteString -> r) -> Sys.Handle -> Sys.IO r
89 Sys.hIsEOF csvFile >>= \case
91 False -> k <$> ByteString.hGetSome csvFile 4096
93 readHSK :: HskLevel -> IO ChineseDict
95 Sys.withFile ("data/langs/mandarin/hsk_csv/hsk" <> show hskIndex Sys.<.> "csv") Sys.ReadMode \hskHandle -> do
96 loop hskHandle mempty $
97 CSV.Incremental.decodeWithP parser decodeOpts CSV.NoHeader
99 hskIndex = hskLevel & fromEnum & (+ 1)
100 decodeOpts = CSV.defaultDecodeOptions
101 parser :: CSV.Record -> CSV.Parser ChineseDictEntry
105 pinyins <- v .! 1 <&> pure
106 english <- v .! 2 <&> pure
107 pure ChineseDictEntry{hskLevel = Just hskLevel, ..}
110 either (\x -> Sys.print x >> return mempty) \e@ChineseDictEntry{chinese} ->
111 return $ ChineseDict $ Map.singleton chinese [e]
112 loop hskHandle !acc = \case
113 CSV.Incremental.Fail _ errMsg -> do
116 CSV.Incremental.Many rs k -> do
117 ok <- rs & foldMapM check
118 t <- feed k hskHandle
119 loop hskHandle (acc <> ok) t
120 CSV.Incremental.Done rs -> do
121 ok <- rs & foldMapM check
124 readCEDICT :: IO ChineseDict
126 Sys.withFile ("data/langs/mandarin/cedict/cedict_ts.u8") Sys.ReadMode \cedictHandle -> do
128 isEOF <- cedictHandle & Sys.hIsEOF
132 lineBS <- ByteString.hGetLine cedictHandle
133 let lineST = lineBS & ShortText.fromByteString & fromMaybe (error "invalid UTF-8")
134 let begin = lineST & ShortText.take 1
135 when (begin == "#") do
139 isEOF <- cedictHandle & Sys.hIsEOF
143 line <- ByteString.hGetLine cedictHandle
144 let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
145 -- DescriptionNote: each line is formatted as: #(.+) (.+) \[(.+)] /(.*)/#'
146 let skipChar c = void $ MT.state $ ByteString.span (== fromIntegral (fromEnum c))
149 ByteString.stripPrefix p
150 >>> fromMaybe (error $ "skipPrefix fail to match: " <> show p)
151 let breakOnChar c = ByteString.break (== fromIntegral (fromEnum c))
152 let breakOnSpace = breakOnChar ' '
155 ByteString.stripSuffix p
158 "skipSuffix: mismatch: "
161 <> ShortText.unpack (decodeUtf8 line)
163 <> show (ShortText.unpack (decodeUtf8 line))
165 let (dict, leftover) = (`MT.runState` line) do
166 chineseTrad <- MT.state $ breakOnSpace >>> first decodeUtf8
168 chineseSimpl <- MT.state $ breakOnSpace >>> first decodeUtf8
174 >>> first (\s -> s & ByteString.split (fromIntegral (fromEnum ' ')) <&> decodeUtf8)
176 -- CorrectnessNote: some lines do not end with \r
177 -- hence make it optional.
178 MT.modify' \s -> s & ByteString.stripSuffix "\r" & fromMaybe s
180 english <- MT.gets \s -> s & ByteString.split (fromIntegral (fromEnum '/')) <&> decodeUtf8
182 let chinese = chineseSimpl
185 Map.singleton chinese $
193 if not (ByteString.null leftover)
194 then error $ "parserLine: leftover: " <> show leftover
195 else loop (acc <> dict)
198 readChineseDict :: IO ChineseDict
200 0 & toEnum & enumFrom & foldMapM readHSK
204 manualDict = ChineseDict $ Map.fromListWith (<>) [ (chinese, e) | e@ChineseDictEntry{chinese} <- manualEntries ]
209 , english = ["heart"]
215 , english = ["fruit"]
245 , english = ["empty"]
250 , pinyins = ["zhōng"]
251 , english = ["middle"]
257 , english = ["shelf"]
269 , english = ["machine"]
276 readHSK :: Sys.IO [HSK]
278 Sys.withFile "data/langs/mandarin/hsk.csv" Sys.ReadMode \csvFile -> do
279 let loopH !_ (CSV.Incremental.FailH _ errMsg) = do Sys.putStrLn errMsg; Sys.exitFailure
280 loopH acc (CSV.Incremental.PartialH k) = feed k csvFile >>= loopH acc
281 loopH _acc (CSV.Incremental.DoneH !h p) = p
282 p <- loopH [] CSV.Incremental.decodeByName
283 let loop !_ (CSV.Incremental.Fail _ errMsg) = do Sys.putStrLn errMsg; Sys.exitFailure
284 loop acc (CSV.Incremental.Many rs k) = feed k csvFile >>= loop (acc <> rs)
285 loop acc (CSV.Incremental.Done rs) = loop (acc <> rs)
293 | LangueMandarinPinyin
295 deriving (Eq, Ord, Show)
297 data Rosetta = Rosetta
298 { rosettaPictures :: [File.FilePath]
299 , rosettaEntries :: [RosettaEntry]
301 deriving (Eq, Ord, Show)
303 data RosettaEntry = RosettaEntry
304 { rosettaEntryGlyphsText :: ShortText
305 , rosettaEntryGlyphsLangue :: Langue
307 deriving (Eq, Ord, Show)
309 rosetta :: ChineseDict -> Text -> Rosetta -> IO Builder
310 rosetta (ChineseDict chineseDict) title Rosetta{..} = do
311 -- FIXME: this absolute path is not reproducible out of my system
312 dataPath <- Self.getDataDir <&> File.normalise
313 return $ Blaze.renderMarkupBuilder do
316 H.title $ title & H.toHtml
318 ! HA.rel "stylesheet"
319 ! HA.type_ "text/css"
320 ! HA.href (toValue $ dataPath </> "styles/rosetta.css")
323 H.div ! classes ["rosetta"] $ do
324 forM_ (List.zip rosettaPictures rosettaEntries) \(rosettaPicture, RosettaEntry{..}) -> do
325 -- pTraceShowM (rosettaPicture, RosettaEntry{..})
327 H.div ! classes ["rosetta-row", "rosetta-row-" <> show rosettaEntryGlyphsLangue] $ do
328 H.div ! classes ["rosetta-cell", "rosetta-cell-picture"] $ do
329 H.img ! HA.src ("file://" <> dataPath </> "images" </> "thumbnails" </> rosettaPicture & toValue)
330 H.div ! classes ["rosetta-cell", "rosetta-cell-words"] $ do
331 let atomWidth = case rosettaEntryGlyphsLangue of
332 LangueMandarin -> "1.5cm"
334 let cellSpace = \case
335 ' ' -> "writing-words-cell-space"
337 H.div ! classes ["writing-words"] $ do
338 forM_ (rosettaEntryGlyphsText & ShortText.split (== ' ')) \writingWord -> do
339 let atomLength = writingWord & ShortText.length & show
342 [ "writing-words-word"
345 when (rosettaEntryGlyphsLangue == LangueMandarin) do
348 [ "writing-words-row"
349 , "writing-words-" <> show LangueMandarinPinyin
351 ! HA.style ("grid-template-columns: repeat(" <> atomLength <> ", " <> atomWidth <> ");" & toValue)
353 forM_ (writingWord & ShortText.unpack) \writingChar -> do
354 let ChineseDictEntry{pinyins} =
356 & Map.lookup (ShortText.singleton writingChar)
357 & fromMaybe (error $ "chineseDict missing: " <> [writingChar])
359 H.div ! classes ["writing-words-cell", cellSpace writingChar] $ do
360 forM_ pinyins H.toHtml
363 [ "writing-words-row"
364 , "writing-words-" <> show rosettaEntryGlyphsLangue
365 , "writing-words-row-model"
367 ! HA.style ("grid-template-columns: repeat(" <> atomLength <> ", " <> atomWidth <> ");" & toValue)
369 forM_ (writingWord & ShortText.unpack) \writingChar -> do
370 H.div ! classes ["writing-words-cell", cellSpace writingChar] $ do
371 fromString [writingChar]
375 [ "writing-words-row"
376 , "writing-words-" <> show rosettaEntryGlyphsLangue
377 , "writing-words-row-input"
379 ! HA.style ("grid-template-columns: repeat(" <> atomLength <> ", " <> atomWidth <> ");" & toValue)
381 forM_ (writingWord & ShortText.unpack) \writingChar -> do
382 H.div ! classes ["writing-words-cell", cellSpace writingChar] $ do
383 case rosettaEntryGlyphsLangue of
384 LangueMandarin -> fromString [writingChar]