{-# LANGUAGE BangPatterns #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE PackageImports #-} {-# OPTIONS_GHC -Wno-unused-imports #-} module Worksheets.Writing.Rosetta where import Control.Applicative (Alternative (..), Applicative (..)) import Control.Monad (when) import Control.Monad.Trans.State qualified as MT import Data.ByteString (ByteString) import Data.ByteString qualified as ByteString import Data.ByteString.Builder (Builder) import Data.ByteString.Short qualified as ShortByteString import Data.Char qualified as Char import Data.Csv ((.!)) import Data.Csv qualified as CSV import Data.Csv.Incremental qualified as CSV.Incremental import Data.Foldable (Foldable (..)) import Data.Int (Int) import Data.List qualified as List import Data.Map.Strict qualified as Map import Data.Maybe (Maybe (..)) import Data.Monoid (Ap (..), Monoid (..)) import Data.Text (Text) import Data.Text qualified as Text import Data.Text.Encoding qualified as Text import Data.Text.Lazy qualified as Text.Lazy import Data.Text.Short (ShortText) import Data.Text.Short qualified as ShortText import Debug.Pretty.Simple (pTraceShow, pTraceShowId, pTraceShowM) import GHC.Generics (Generic) import Paths_worksheets qualified as Self import System.Exit qualified as Sys import System.FilePath qualified as Sys import System.FilePath.Posix (()) import System.FilePath.Posix qualified as File import System.IO qualified as Sys import Text.Blaze import Text.Blaze.Html5 qualified as H import Text.Blaze.Html5.Attributes qualified as HA import Text.Blaze.Renderer.Utf8 qualified as Blaze import Text.Show (Show (..)) import Utils.Blaze import Prelude import "base" Prelude (error, fromIntegral) newtype ChineseDict = ChineseDict (Map ShortText [ChineseDictEntry]) deriving (Show) instance Semigroup ChineseDict where ChineseDict x <> ChineseDict y = ChineseDict (Map.unionWithKey merge x y) where merge _k !xV !yV = xV <> yV -- xV -- & traceString (List.unlines $ -- [ "Semigroup ChineseDict: key collision: " <> ShortText.unpack k <> " ("<>show k<>")" -- , xV & pShowNoColor & Text.Lazy.unpack -- , yV & pShowNoColor & Text.Lazy.unpack -- ]) instance Monoid ChineseDict where mempty = ChineseDict mempty data ChineseDictEntry = ChineseDictEntry { chinese :: !ShortText , pinyins :: ![ShortText] , english :: ![ShortText] , hskLevel :: !(Maybe HskLevel) } deriving (Generic, Show) data HskLevel = HskLevel301 | HskLevel302 | HskLevel303 | HskLevel304 | HskLevel305 | HskLevel306 deriving (Eq, Ord, Enum, Show) -- instance CSV.FromRecord ChineseDictEntry -- instance CSV.ToRecord ChineseDictEntry -- instance CSV.FromNamedRecord ChineseDictEntry -- instance CSV.ToNamedRecord ChineseDictEntry -- instance CSV.DefaultOrdered ChineseDictEntry feed :: (ByteString -> r) -> Sys.Handle -> Sys.IO r feed k csvFile = do Sys.hIsEOF csvFile >>= \case True -> return $ k "" False -> k <$> ByteString.hGetSome csvFile 4096 readHSK :: HskLevel -> IO ChineseDict readHSK hskLevel = do Sys.withFile ("data/langs/mandarin/hsk_csv/hsk" <> show hskIndex Sys.<.> "csv") Sys.ReadMode \hskHandle -> do loop hskHandle mempty $ CSV.Incremental.decodeWithP parser decodeOpts CSV.NoHeader where hskIndex = hskLevel & fromEnum & (+ 1) decodeOpts = CSV.defaultDecodeOptions parser :: CSV.Record -> CSV.Parser ChineseDictEntry parser v | length v == 3 = do chinese <- v .! 0 pinyins <- v .! 1 <&> pure english <- v .! 2 <&> pure pure ChineseDictEntry{hskLevel = Just hskLevel, ..} | otherwise = empty check = either (\x -> Sys.print x >> return mempty) \e@ChineseDictEntry{chinese} -> return $ ChineseDict $ Map.singleton chinese [e] loop hskHandle !acc = \case CSV.Incremental.Fail _ errMsg -> do Sys.putStrLn errMsg Sys.exitFailure CSV.Incremental.Many rs k -> do ok <- rs & foldMapM check t <- feed k hskHandle loop hskHandle (acc <> ok) t CSV.Incremental.Done rs -> do ok <- rs & foldMapM check return (acc <> ok) readCEDICT :: IO ChineseDict readCEDICT = do Sys.withFile ("data/langs/mandarin/cedict/cedict_ts.u8") Sys.ReadMode \cedictHandle -> do let skipHeader = do isEOF <- cedictHandle & Sys.hIsEOF if isEOF then return () else do lineBS <- ByteString.hGetLine cedictHandle let lineST = lineBS & ShortText.fromByteString & fromMaybe (error "invalid UTF-8") let begin = lineST & ShortText.take 1 when (begin == "#") do skipHeader skipHeader let loop !acc = do isEOF <- cedictHandle & Sys.hIsEOF if isEOF then return acc else do line <- ByteString.hGetLine cedictHandle let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8") -- DescriptionNote: each line is formatted as: #(.+) (.+) \[(.+)] /(.*)/#' let skipChar c = void $ MT.state $ ByteString.span (== fromIntegral (fromEnum c)) let skipPrefix p = MT.modify' $ ByteString.stripPrefix p >>> fromMaybe (error $ "skipPrefix fail to match: " <> show p) let breakOnChar c = ByteString.break (== fromIntegral (fromEnum c)) let breakOnSpace = breakOnChar ' ' let skipSuffix p = MT.modify' $ ByteString.stripSuffix p >>> fromMaybe ( error $ "skipSuffix: mismatch: " <> show p <> "\n on line: " <> ShortText.unpack (decodeUtf8 line) <> "\n escaped: " <> show (ShortText.unpack (decodeUtf8 line)) ) let (dict, leftover) = (`MT.runState` line) do chineseTrad <- MT.state $ breakOnSpace >>> first decodeUtf8 skipChar ' ' chineseSimpl <- MT.state $ breakOnSpace >>> first decodeUtf8 skipChar ' ' skipPrefix "[" pinyins <- MT.state $ breakOnChar ']' >>> first (\s -> s & ByteString.split (fromIntegral (fromEnum ' ')) <&> decodeUtf8) skipPrefix "] /" -- CorrectnessNote: some lines do not end with \r -- hence make it optional. MT.modify' \s -> s & ByteString.stripSuffix "\r" & fromMaybe s skipSuffix "/" english <- MT.gets \s -> s & ByteString.split (fromIntegral (fromEnum '/')) <&> decodeUtf8 MT.put mempty let chinese = chineseSimpl return $ ChineseDict $ Map.singleton chinese $ pure $ ChineseDictEntry { chinese , pinyins , english , hskLevel = Nothing } if not (ByteString.null leftover) then error $ "parserLine: leftover: " <> show leftover else loop (acc <> dict) loop mempty readChineseDict :: IO ChineseDict readChineseDict = do 0 & toEnum & enumFrom & foldMapM readHSK {- where manualDict = ChineseDict $ Map.fromListWith (<>) [ (chinese, e) | e@ChineseDictEntry{chinese} <- manualEntries ] manualEntries = [ ChineseDictEntry { chinese = "心" , pinyins = ["xīn"] , english = ["heart"] , hskLevel = Nothing } , ChineseDictEntry { chinese = "果" , pinyins = ["guǒ"] , english = ["fruit"] , hskLevel = Nothing } , ChineseDictEntry { chinese = "冰" , pinyins = ["bīng"] , english = ["ice"] , hskLevel = Nothing } , ChineseDictEntry { chinese = "淇" , pinyins = ["qí"] , english = [] , hskLevel = Nothing } , ChineseDictEntry { chinese = "淋" , pinyins = ["lín"] , english = [] , hskLevel = Nothing } , ChineseDictEntry { chinese = "天" , pinyins = ["tiān"] , english = ["sky"] , hskLevel = Nothing } , ChineseDictEntry { chinese = "空" , pinyins = ["kōng"] , english = ["empty"] , hskLevel = Nothing } , ChineseDictEntry { chinese = "中" , pinyins = ["zhōng"] , english = ["middle"] , hskLevel = Nothing } , ChineseDictEntry { chinese = "架" , pinyins = ["jià"] , english = ["shelf"] , hskLevel = Nothing } , ChineseDictEntry { chinese = "飞" , pinyins = ["fēi"] , english = ["fly"] , hskLevel = Nothing } , ChineseDictEntry { chinese = "机" , pinyins = ["jī"] , english = ["machine"] , hskLevel = Nothing } ] -} {- readHSK :: Sys.IO [HSK] readHSK = do Sys.withFile "data/langs/mandarin/hsk.csv" Sys.ReadMode \csvFile -> do let loopH !_ (CSV.Incremental.FailH _ errMsg) = do Sys.putStrLn errMsg; Sys.exitFailure loopH acc (CSV.Incremental.PartialH k) = feed k csvFile >>= loopH acc loopH _acc (CSV.Incremental.DoneH !h p) = p p <- loopH [] CSV.Incremental.decodeByName let loop !_ (CSV.Incremental.Fail _ errMsg) = do Sys.putStrLn errMsg; Sys.exitFailure loop acc (CSV.Incremental.Many rs k) = feed k csvFile >>= loop (acc <> rs) loop acc (CSV.Incremental.Done rs) = loop (acc <> rs) loop [] p -} data Langue = LangueFrançais | LangueAnglais | LangueMandarin | LangueMandarinPinyin | LanguePhonetic deriving (Eq, Ord, Show) data Rosetta = Rosetta { rosettaPictures :: [File.FilePath] , rosettaEntries :: [RosettaEntry] } deriving (Eq, Ord, Show) data RosettaEntry = RosettaEntry { rosettaEntryGlyphsText :: ShortText , rosettaEntryGlyphsLangue :: Langue } deriving (Eq, Ord, Show) rosetta :: ChineseDict -> Text -> Rosetta -> IO Builder rosetta (ChineseDict chineseDict) title Rosetta{..} = do -- FIXME: this absolute path is not reproducible out of my system dataPath <- Self.getDataDir <&> File.normalise return $ Blaze.renderMarkupBuilder do H.docTypeHtml do H.head do H.title $ title & H.toHtml H.link ! HA.rel "stylesheet" ! HA.type_ "text/css" ! HA.href (toValue $ dataPath "styles/rosetta.css") H.body do "\n" H.div ! classes ["rosetta"] $ do forM_ (List.zip rosettaPictures rosettaEntries) \(rosettaPicture, RosettaEntry{..}) -> do -- pTraceShowM (rosettaPicture, RosettaEntry{..}) "\n" H.div ! classes ["rosetta-row", "rosetta-row-" <> show rosettaEntryGlyphsLangue] $ do H.div ! classes ["rosetta-cell", "rosetta-cell-picture"] $ do H.img ! HA.src ("file://" <> dataPath "images" "thumbnails" rosettaPicture & toValue) H.div ! classes ["rosetta-cell", "rosetta-cell-words"] $ do let atomWidth = case rosettaEntryGlyphsLangue of LangueMandarin -> "1.5cm" _ -> "1cm" let cellSpace = \case ' ' -> "writing-words-cell-space" _ -> "" H.div ! classes ["writing-words"] $ do forM_ (rosettaEntryGlyphsText & ShortText.split (== ' ')) \writingWord -> do let atomLength = writingWord & ShortText.length & show H.div ! classes [ "writing-words-word" ] $ do when (rosettaEntryGlyphsLangue == LangueMandarin) do H.div ! classes [ "writing-words-row" , "writing-words-" <> show LangueMandarinPinyin ] ! HA.style ("grid-template-columns: repeat(" <> atomLength <> ", " <> atomWidth <> ");" & toValue) $ do forM_ (writingWord & ShortText.unpack) \writingChar -> do let ChineseDictEntry{pinyins} = chineseDict & Map.lookup (ShortText.singleton writingChar) & fromMaybe (error $ "chineseDict missing: " <> [writingChar]) & List.head -- FIXME H.div ! classes ["writing-words-cell", cellSpace writingChar] $ do forM_ pinyins H.toHtml H.div ! classes [ "writing-words-row" , "writing-words-" <> show rosettaEntryGlyphsLangue , "writing-words-row-model" ] ! HA.style ("grid-template-columns: repeat(" <> atomLength <> ", " <> atomWidth <> ");" & toValue) $ do forM_ (writingWord & ShortText.unpack) \writingChar -> do H.div ! classes ["writing-words-cell", cellSpace writingChar] $ do fromString [writingChar] "\n" H.div ! classes [ "writing-words-row" , "writing-words-" <> show rosettaEntryGlyphsLangue , "writing-words-row-input" ] ! HA.style ("grid-template-columns: repeat(" <> atomLength <> ", " <> atomWidth <> ");" & toValue) $ do forM_ (writingWord & ShortText.unpack) \writingChar -> do H.div ! classes ["writing-words-cell", cellSpace writingChar] $ do case rosettaEntryGlyphsLangue of LangueMandarin -> fromString [writingChar] _ -> " " "\n" "\n"