{-# LANGUAGE OverloadedLists #-}
{-# LANGUAGE OverloadedStrings #-}

module Language.Chinese where

import Control.Applicative (Alternative (..))
import Control.Monad.ST qualified as ST
import Control.Monad.Trans.State qualified as MT
import Data.Aeson qualified as JSON
import Data.Aeson.Key qualified as JSON.DictKey
import Data.Aeson.KeyMap qualified as JSON
import Data.ByteString (ByteString)
import Data.ByteString qualified as ByteString
import Data.ByteString.Builder qualified as BS
import Data.Char qualified as Char
import Data.Csv ((.!))
import Data.Csv qualified as CSV
import Data.Csv.Incremental qualified as CSV.Incremental
import Data.List qualified as List
import Data.Map.Strict qualified as Map
import Data.STRef qualified as ST
import Data.Set qualified as Set
import Data.Text qualified as Text
import Data.Text.Short qualified as ShortText
import Graph.DOT qualified as DOT
import Numeric.Decimal.BoundedArithmetic (arithError, arithM)
import Paths_worksheets qualified as Self
import System.Exit qualified as Sys
import System.FilePath qualified as Sys
import System.FilePath.Posix ((</>))
import System.FilePath.Posix qualified as File
import System.IO qualified as Sys
import Text.Read (read, readMaybe)
import Worksheets.Utils.HTML (className, classes, styles, (!))
import Worksheets.Utils.HTML qualified as HTML
import Prelude (error, fromIntegral, undefined, (/))

-- import Text.Blaze
import Text.Blaze.Html5 qualified as HTML
import Text.Blaze.Html5.Attributes qualified as HA
import Text.Blaze.Renderer.Utf8 qualified as Blaze

import Language.Pronunciation qualified as Pron
import Worksheets.Utils.Char qualified as Char
import Worksheets.Utils.JSON
import Worksheets.Utils.Prelude
import Worksheets.Utils.Probability

newtype ChineseDict = ChineseDict (Map ShortText ChineseDictEntries)
  deriving (Show)

unChineseDict (ChineseDict x) = x

instance Semigroup ChineseDict where
  ChineseDict x <> ChineseDict y = ChineseDict (Map.unionWithKey merge x y)
   where
    merge _k !xV !yV = xV <> yV

keyLengthToSumOfEntries :: ChineseDict -> Map Natural Natural
keyLengthToSumOfEntries (ChineseDict d) =
  [ ( k & ShortText.length & fromIntegral
    , 1
    )
  | k <- d & Map.keys
  ]
    & Map.fromListWith (+)

chineseFrequencySum :: ChineseDict -> Double
chineseFrequencySum (ChineseDict d) =
  [ pure freq
  | e <- d & Map.elems
  , freq <- e & chineseFrequency & maybeToList
  ]
    & sum
    & arithError
    & runProbability
    & fromRational @Double

instance Monoid ChineseDict where
  mempty = ChineseDict mempty

-- FIXME: use a sum type Char/Word
data ChineseDictEntries = ChineseDictEntries
  { chinesePinyins :: ![ShortText]
  , -- TODO: , chineseIPA :: !IPA.IPA
    chineseEnglish :: ![ShortText]
  , chineseFrequency :: !(Maybe Probability)
  , chineseComponents :: !(Maybe [Char])
  , chineseStrokes :: !(Maybe Natural)
  -- ^ FIXME: use Natural
  }
  deriving (Generic, Show)

chineseDecomp :: ShortText -> ChineseDictEntries -> [Char]
chineseDecomp key ChineseDictEntries{chineseComponents} =
  chineseComponents
    & fromMaybe
      (if 1 < ShortText.length key then ShortText.unpack key else [])

instance Semigroup ChineseDictEntries where
  x <> y =
    ChineseDictEntries
      { chinesePinyins =
          if null (chinesePinyins x)
            then chinesePinyins y
            else
              if null (chinesePinyins y) || chinesePinyins x == chinesePinyins y
                then chinesePinyins x
                else
                  -- traceString
                  --   ( mconcat $
                  --       [ "Semigroup: chinesePinyins: mismatch: "
                  --       , show (chinesePinyins x)
                  --       , " <> "
                  --       , show (chinesePinyins y)
                  --       ]
                  --   )
                  (chinesePinyins x)
      , chineseEnglish = chineseEnglish x <> chineseEnglish y
      , chineseFrequency =
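          -- ExplanationNote: `(<|>)` on `Maybe` is left-biased, so when both
          -- operands carry a frequency the left one (whichever source was
          -- merged first in `readChineseDict`) is kept, e.g. (illustrative
          -- values only): `Just 0.1 <|> Just 0.2 == Just 0.1`.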
chineseFrequency x <|> chineseFrequency y , chineseComponents = case chineseComponents x of Nothing -> chineseComponents y Just xS -> case chineseComponents y of Nothing -> chineseComponents x Just yS | xS == yS -> chineseComponents x | otherwise -> error $ "Semigroup: chineseComponents: mismatch: " <> show xS <> " <> " <> show yS , chineseStrokes = case chineseStrokes x of Nothing -> chineseStrokes y Just xS -> case chineseStrokes y of Nothing -> chineseStrokes x Just yS | xS == yS -> chineseStrokes x | otherwise -> error $ "Semigroup: chineseStrokes: mismatch: " <> show xS <> " <> " <> show yS } instance Monoid ChineseDictEntries where mempty = ChineseDictEntries { chinesePinyins = mempty , chineseEnglish = mempty , chineseFrequency = Nothing , chineseComponents = Nothing , chineseStrokes = Nothing } readLoachWangFreq :: IO ChineseDict readLoachWangFreq = do readJSON "data/langs/mandarin/LoachWang/word_freq.json" $ JSON.withObject "WordFreq" \kv -> ChineseDict . Map.fromAscList <$> forM (kv & JSON.toMap & Map.toList) \(k, v) -> v & JSON.withScientific "Frequency" \freqSci -> do let freq = freqSci & toRational & probability & arithError return ( k & JSON.DictKey.toShortText , mempty{chineseFrequency = Just freq} ) readLoachWangStrokes :: IO ChineseDict readLoachWangStrokes = readJSON ("data/langs/mandarin/LoachWang/char_strokes.json") $ JSON.withObject "WordFreq" \kv -> ChineseDict . Map.fromAscList <$> forM (kv & JSON.toMap & Map.toList) \(k, v) -> do strokes <- JSON.parseJSON v return ( k & JSON.DictKey.toShortText , mempty{chineseStrokes = Just strokes} ) readChiseStrokes :: IO ChineseDict readChiseStrokes = withDataFile "data/langs/mandarin/cjkvi-ids/ucs-strokes.txt" \dataHandle -> do let loop accDict@(ChineseDict !acc) = do isEOF <- dataHandle & Sys.hIsEOF if isEOF then return accDict else do line <- ByteString.hGetLine dataHandle let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8") let splitOnChar c = ByteString.split (fromIntegral (fromEnum c)) case line & splitOnChar '\t' of [unicodeLit, unicodeChar, chineseStrokesLit] | unicodeLit & ByteString.isPrefixOf "U+" -> loop $ ChineseDict $ acc & Map.insert (unicodeChar & decodeUtf8) mempty{chineseStrokes} where chineseStrokes :: Maybe Natural = chineseStrokesLit & splitOnChar ',' <&> ( decodeUtf8 >>> ShortText.unpack >>> readMaybe @Natural ) -- CorrectnessWarning: conservative bias toward the greatest number of strokes. 
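-- ExplanationNote: e.g. a (hypothetical) line "U+4E0D\t不\t4,5" yields
-- [Just 4, Just 5] here; `List.maximum` then keeps `Just 5`, the larger
-- of the alternative stroke counts.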
& List.maximum <&> fromIntegral _ -> loop accDict loop mempty readChiseDecomp :: IO ChineseDict readChiseDecomp = withDataFile "data/langs/mandarin/cjkvi-ids/ids.txt" $ (`loop` mempty) where loop dataHandle accDict@(ChineseDict !acc) = do isEOF <- dataHandle & Sys.hIsEOF if isEOF then return accDict else do line <- ByteString.hGetLine dataHandle let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8") let splitOnChar c = ByteString.split (fromIntegral (fromEnum c)) case line & splitOnChar '\t' of unicodeLit : (decodeUtf8 -> unicodeChar) : decomps | unicodeLit & ByteString.isPrefixOf "U+" -> let decompWithCodes :: [([Char], Set Char)] = decomps <&> \decomp -> case decomp & ByteString.stripSuffix "]" of Nothing -> ( decomp & decodeUtf8 & ShortText.unpack , mempty ) Just decompWithCode -> ( decompChars & ShortText.unpack -- ExplanationNote: remove the final '[' & List.init , decompCodes & ShortText.unpack & fromList ) where (decompChars, decompCodes) = decompWithCode & decodeUtf8 & ShortText.breakEnd (== '[') chineseComponents = decompWithCodes -- FIXME: CorrectnessWarning: maybe select using the Codes -- instead of just taking the first. & List.head & fst -- ExplanationNote: remove IDCs (Ideographic Description Characters) -- namely "⿰" (U+2FF0) to "⿻" (U+2FFB). & List.filter (\c -> not ('\x2FF0' <= c && c <= '\x2FFB')) & List.filter (\c -> ShortText.singleton c /= unicodeChar) & Just in loop dataHandle $ ChineseDict $ acc & Map.insert unicodeChar mempty{chineseComponents} _ -> loop dataHandle accDict readChineseDict :: IO ChineseDict readChineseDict = fold $ list [ return extraEntries , readLoachWangFreq , readChiseStrokes , readChiseDecomp , -- , readChineseLexicalDatabase readCEDICT -- , readLoachWangStrokes -- , readOutlierDecomp ] extraEntries :: ChineseDict extraEntries = ChineseDict $ [ [ ( "·" , mempty { chineseStrokes = Just 1 , chineseComponents = Just ['·'] } ) , ( "库蠓属" , mempty { chinesePinyins = ["ku4", "meng3", "shi4"] , chineseEnglish = ["culicoides"] } ) , ( "瘦果" , mempty { chinesePinyins = ["shou4", "guo3"] , chineseEnglish = ["achene"] } ) , ( "美洲野马" , mempty { chinesePinyins = ["mei3", "zhou1", "ye3", "ma3"] , chineseEnglish = ["mustang horse"] } ) , ( "钩粉蝶" , mempty { chinesePinyins = ["gou1", "fen3", "die2"] , chineseEnglish = ["brimstone butterfly"] } ) , ( "核果" , mempty { chinesePinyins = ["he2", "guo3"] , chineseEnglish = ["drupe"] } ) , ( "0" , mempty { chineseStrokes = Just 14 , chinesePinyins = ["ling2"] -- 零 } ) , ( "1" , mempty { chineseStrokes = Just 2 , chinesePinyins = ["yi1"] } ) , ( "2" , mempty { chineseStrokes = Just 2 , chinesePinyins = ["er4"] } ) , ( "3" , mempty { chineseStrokes = Just 2 , chinesePinyins = ["san1"] } ) , ( "4" , mempty { chineseStrokes = Just 2 , chinesePinyins = ["si4"] } ) , ( "5" , mempty { chineseStrokes = Just 2 , chinesePinyins = ["wu3"] } ) , ( "6" , mempty { chineseStrokes = Just 1 , chinesePinyins = ["liu4"] } ) , ( "7" , mempty { chineseStrokes = Just 1 , chinesePinyins = ["qi1"] } ) , ( "8" , mempty { chineseStrokes = Just 2 , chinesePinyins = ["ba1"] } ) , ( "9" , mempty { chineseStrokes = Just 1 , chinesePinyins = ["jiu3"] } ) ] , [ ( letter & ShortText.singleton , mempty { chineseStrokes = Just 1 , chinesePinyins = [] } ) | letter <- [ ['A' .. 'Z'] , ['a' .. 
'z'] , ['π', '□', '○'] ] & mconcat ] ] & mconcat & Map.fromList readLoachWangWordOrder :: IO JSON.Array readLoachWangWordOrder = readJSON "data/langs/mandarin/LoachWang/loach_word_order.json" $ JSON.withArray "LoachWordOrder" \arr -> return arr readOutlierDecomp :: IO ChineseDict readOutlierDecomp = readJSON "data/langs/mandarin/LoachWang/outlier_decomp.json" $ JSON.withObject "OutlierDecomp" \kv -> ChineseDict . Map.fromAscList <$> forM (kv & JSON.toMap & Map.toList) \(k, v) -> v & JSON.withArray "Decomp" \decomp -> forM (decomp & toList) ( \sub -> sub & JSON.withText "Atom" \atom -> return $ atom & Text.unpack ) <&> \atoms -> ( k & JSON.DictKey.toShortText , mempty{chineseComponents = Just $ atoms & mconcat} ) reverseDecomp :: ChineseDict -> Map {-def-} ShortText {-refs-} [ShortText] reverseDecomp (ChineseDict dict) = [ sub := [key] | (key, entry) <- dict & Map.toList , sub <- entry & chineseDecomp key <&> ShortText.singleton ] & Map.fromListWith (<>) type Benefit = Probability type Cost = Natural type Weight = Probability type DictKey = ShortText data Weights = Weights { weightsMap :: Map DictKey (Benefit, Cost) } deriving (Show) -- | -- Zhao, K., zhao, D., Yang, J., Ma, W., & Yu, B. (2019). -- Efficient Learning Strategy of Chinese Characters Based on -- Usage Frequency and Structural Relationship. -- 2019 IEEE 4th International Conference on Big Data Analytics (ICBDA). -- doi:10.1109/icbda.2019.8713245 -- https://sci-hub.se/10.1109/icbda.2019.8713245 dictToWeights :: ChineseDict -> Weights dictToWeights cn@(ChineseDict dict) = ST.runST do -- ExplanationNote: memo tables, to avoid recomputing benefits and costs keyToBenefitST :: ST.STRef st (Map DictKey Benefit) <- ST.newSTRef Map.empty keyToCostST :: ST.STRef st (Map DictKey Cost) <- ST.newSTRef Map.empty let refsMap :: Map ShortText [ShortText] = reverseDecomp cn let keyToBenefitLoop :: DictKey -> ST.ST st Benefit keyToBenefitLoop key = do -- traceShowM ("keyToBenefitLoop"::String, key) keyToBenefitMap <- keyToBenefitST & ST.readSTRef keyToBenefitMap & Map.lookup key & \case Just bene -> return bene Nothing -> do let refs :: [ShortText] = refsMap & Map.lookup key & fromMaybe [] -- & fromMaybe [] & List.delete key -- traceShowM ("keyToBenefitLoop"::String, key, refs) let freq = dict & Map.lookup key & maybe proba0 (chineseFrequency >>> fromMaybe proba0) refsBC <- forM refs \ref -> do bene <- keyToBenefitLoop ref cost <- keyToCostLoop ref return (bene, cost) cost <- keyToCostLoop key bene <- arithM $ probability $ runProbability freq + toRational cost * sum [ runProbability b / toRational c | (b, c) <- refsBC , c /= 0 ] ST.modifySTRef keyToBenefitST $ Map.insert key bene return bene keyToCostLoop :: DictKey -> ST.ST st Cost keyToCostLoop key = do -- traceShowM ("keyToCostLoop"::String, key) keyToCostMap <- keyToCostST & ST.readSTRef keyToCostMap & Map.lookup key & \case Just cost -> return cost Nothing -> do let entry = dict & Map.lookup key & fromMaybe mempty cost <- -- single char if ShortText.length key == 1 then do let deps = entry & chineseDecomp key & List.delete (ShortText.unpack key & List.head) -- & (`Set.difference` (ShortText.unpack key & Set.fromList)) if null deps then -- atom char return $ entry & chineseStrokes & fromMaybe 1 else -- & fromMaybe (error $ "dictToWeights: missing chineseStrokes for: " <> ShortText.unpack key) do -- composite char -- traceShowM ("composite"::String, show key, show <$> Set.toList deps -- , deps & (`Set.difference` (ShortText.unpack key & Set.fromList)) -- ) deps & foldMapM (\dep -> Sum <$> 
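-- ExplanationNote: recurse into each component, so the cost of a composite
-- character is the sum of its components' costs, bottoming out at the stroke
-- counts of atomic components (defaulting to 1 when unknown); e.g. cost 好 =
-- cost 女 + cost 子, assuming the usual ⿰女子 decomposition is in the data.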
keyToCostLoop (dep & ShortText.singleton)) <&> getSum else -- word do -- traceShowM ("word"::String, key) key & ShortText.unpack & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton)) <&> getSum ST.modifySTRef keyToCostST $ Map.insert key cost return cost weightsMap <- dict & Map.traverseWithKey \key _entry -> do -- traceShowM ("dictToWeights"::String, key) bene <- keyToBenefitLoop key cost <- keyToCostLoop key return (bene, cost) return Weights { weightsMap } newtype ChineseOrder = ChineseOrder (Map (Down Weight) Weights) unChineseOrder (ChineseOrder x) = x dictOrder :: Weights -> ChineseOrder dictOrder w = [ ( probability (runProbability b / toRational c) & arithError & Down , Weights { weightsMap = Map.singleton k v } ) | (k, v@(b, c)) <- w & weightsMap & Map.toList ] & Map.fromListWith (\x y -> Weights{weightsMap = Map.unionWith (error "dictOrder") (weightsMap x) (weightsMap y)}) & ChineseOrder dictOrderIO = readChineseDict <&> (dictToWeights >>> dictOrder) -- order100 :: IO [(DictKey, Maybe ChineseDictEntries)] order100 = do dict@(ChineseDict d) <- readChineseDict let ChineseOrder o = dict & dictToWeights & dictOrder return $ o & Map.take 100 & Map.elems & foldMap (weightsMap >>> Map.keys) <&> \k -> (k, d & Map.lookup k <&> \r -> (chinesePinyins r, chineseEnglish r)) dictOrderBySameWeight (ChineseOrder o) = [ ( Map.size (weightsMap v) , Map.singleton k v ) | (k, v) <- o & Map.toList ] & Map.fromListWith (<>) {- data UnfoldedGraph node lbl edge = UnfoldedGraph { unfoldedVertexToEdges :: !(Array.Array Vertex (lbl, Map edge IntSet)) , unfoldedNodeToVertex :: !(Map node Vertex) } deriving (Show, Generic) unfoldGMany :: forall node edge nodeLabel. Ord node => (node -> (nodeLabel, Map edge (Set node))) -> [node] -> UnfoldedGraph node nodeLabel edge unfoldGMany gen roots = ST.runST do nodeToVertexST :: ST.STRef st (Map node Vertex) <- ST.newSTRef Map.empty vertexToEdgesST :: ST.STRef st [(Vertex, (nodeLabel, Map edge IntSet))] <- ST.newSTRef [] lastVertexST <- ST.newSTRef firstVertex let newVertex = do v <- ST.readSTRef lastVertexST ST.writeSTRef lastVertexST (v + 1) return v let nodeToVertexDepthFirst :: node -> ST.ST st Vertex nodeToVertexDepthFirst src = nodeToVertexST & ST.readSTRef <&> Map.lookup src >>= \case -- DescriptionNote: `src` has already been seen, -- just return its unique `Vertex`. 
Just v -> return v -- DescriptionNote: `src` has never been seen yet, -- allocate a `newVertex` for it Nothing -> do dst <- newVertex ST.modifySTRef nodeToVertexST $ Map.insert src dst let (nodeLabel, edgeToDsts) = gen src edgeToDstsV :: Map edge IntSet <- edgeToDsts & Map.traverseWithKey \_edge dsts -> forM (dsts & Set.toList) nodeToVertexDepthFirst <&> IntSet.fromList let res = (dst, (nodeLabel, edgeToDstsV)) ST.modifySTRef vertexToEdgesST (res :) return dst forM_ roots nodeToVertexDepthFirst nodeToVertex <- ST.readSTRef nodeToVertexST vertexToEdges <- ST.readSTRef vertexToEdgesST lastId <- ST.readSTRef lastVertexST return UnfoldedGraph { unfoldedVertexToEdges = vertexToEdges & Array.array (firstVertex, lastId - 1) , unfoldedNodeToVertex = nodeToVertex } where firstVertex :: Vertex = 0 -} -- mostComp d = -- [ -- | d & reverseDecomp & Map.toList -- ] & Map.fromListWith (<>) data ChineseTone = ChineseTone1 | ChineseTone2 | ChineseTone3 | ChineseTone4 | ChineseTone5 deriving (Eq, Ord, Show, Enum) numberedPinyinToDiacriticPiniyn :: ShortText -> ShortText numberedPinyinToDiacriticPiniyn numPin = case pinTone of Nothing -> pinRoot Just tone -> pinRoot & ShortText.unpack & addUmlaut & setDiacritic & ShortText.pack where addUmlaut ('u' : ':' : cs) = 'ü' : addUmlaut cs addUmlaut ('v' : cs) = 'ü' : addUmlaut cs addUmlaut ('U' : ':' : cs) = 'Ü' : addUmlaut cs addUmlaut ('V' : cs) = 'Ü' : addUmlaut cs addUmlaut (c : cs) = c : addUmlaut cs addUmlaut [] = [] -- CorrectnessNote: a, e or the o in ou always get the marker setDiacritic (c@'a' : cs) = convert tone c : cs setDiacritic (c@'e' : cs) = convert tone c : cs setDiacritic (c@'o' : 'u' : cs) = convert tone c : 'u' : cs setDiacritic (c : cs) -- CorrectnessNote: if no a, e, or ou found, the tone mark goes on the last vowel | List.elem c vowels && all (`List.notElem` vowels) cs = convert tone c : cs | otherwise = c : setDiacritic cs setDiacritic [] = [] vowels = lowerVowels <> (Char.toUpper <$> lowerVowels) lowerVowels = ['a', 'e', 'i', 'o', 'u', 'ü'] & list where (pinRoot, pinTone) :: (ShortText, Maybe ChineseTone) = numPin & ShortText.spanEnd Char.isDigit & second \case "" -> Nothing ds -> ds & ShortText.unpack & read & (\x -> x - (1 :: Int)) & toEnum & Just convert :: ChineseTone -> Char -> Char convert t c = case t of ChineseTone1 -> case c of 'a' -> 'ā' 'e' -> 'ē' 'i' -> 'ī' 'o' -> 'ō' 'u' -> 'ū' 'ü' -> 'ǖ' 'A' -> 'Ā' 'E' -> 'Ē' 'I' -> 'Ī' 'O' -> 'Ō' 'U' -> 'Ū' 'Ü' -> 'Ǖ' _ -> undefined ChineseTone2 -> case c of 'a' -> 'á' 'e' -> 'é' 'i' -> 'í' 'o' -> 'ó' 'u' -> 'ú' 'ü' -> 'ǘ' 'A' -> 'Á' 'E' -> 'É' 'I' -> 'Í' 'O' -> 'Ó' 'U' -> 'Ú' 'Ü' -> 'Ǘ' _ -> undefined ChineseTone3 -> case c of 'a' -> 'ǎ' 'e' -> 'ě' 'i' -> 'ǐ' 'o' -> 'ǒ' 'u' -> 'ǔ' 'ü' -> 'ǚ' 'A' -> 'Ǎ' 'E' -> 'Ě' 'I' -> 'Ǐ' 'O' -> 'Ǒ' 'U' -> 'Ǔ' 'Ü' -> 'Ǚ' _ -> undefined ChineseTone4 -> case c of 'a' -> 'à' 'e' -> 'è' 'i' -> 'ì' 'o' -> 'ò' 'u' -> 'ù' 'ü' -> 'ǜ' 'A' -> 'À' 'E' -> 'È' 'I' -> 'Ì' 'O' -> 'Ò' 'U' -> 'Ù' 'Ü' -> 'Ǜ' _ -> undefined ChineseTone5 -> c data HskLevel = HskLevel301 | HskLevel302 | HskLevel303 | HskLevel304 | HskLevel305 | HskLevel306 deriving (Eq, Ord, Enum, Show) -- instance CSV.FromRecord ChineseDictEntries -- instance CSV.ToRecord ChineseDictEntries -- instance CSV.FromNamedRecord ChineseDictEntries -- instance CSV.ToNamedRecord ChineseDictEntries -- instance CSV.DefaultOrdered ChineseDictEntries feed :: (ByteString -> r) -> Sys.Handle -> Sys.IO r feed k csvFile = do Sys.hIsEOF csvFile >>= \case True -> return $ k "" False -> k <$> ByteString.hGetSome csvFile 4096 
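-- ExplanationNote: a minimal sketch (not used elsewhere in this module) that
-- applies the numbered-to-diacritic conversion to a whole space-separated
-- pinyin string; the helper name `diacriticizePinyins` is illustrative only,
-- e.g. diacriticizePinyins "ni3 hao3" == "nǐ hǎo".
diacriticizePinyins :: ShortText -> ShortText
diacriticizePinyins pinyins =
  pinyins
    & ShortText.split (== ' ')
    <&> numberedPinyinToDiacriticPiniyn
    & List.intersperse " "
    & mconcat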
readHSK :: HskLevel -> IO ChineseDict readHSK chineseHSK = do withDataFile ("data/langs/mandarin/HSK/hsk" <> show hskIndex Sys.<.> "csv") \fileHandle -> do loop fileHandle mempty $ CSV.Incremental.decodeWithP parser decodeOpts CSV.NoHeader where hskIndex = chineseHSK & fromEnum & (+ 1) decodeOpts = CSV.defaultDecodeOptions parser :: CSV.Record -> CSV.Parser (ShortText, ChineseDictEntries) parser v | length v == 3 = do chinese <- v .! 0 chinesePinyins <- v .! 1 <&> ShortText.split (== ' ') chineseEnglish <- v .! 2 <&> pure let chineseFrequency = Nothing let chineseComponents = Nothing let chineseStrokes = Nothing pure (chinese, ChineseDictEntries{..}) | otherwise = empty check = either (\x -> Sys.print x >> return mempty) \(chinese, e) -> return $ ChineseDict $ Map.singleton chinese e loop fileHandle !acc = \case CSV.Incremental.Fail _ errMsg -> do Sys.putStrLn errMsg Sys.exitFailure CSV.Incremental.Many rs k -> do ok <- rs & foldMapM check t <- feed k fileHandle loop fileHandle (acc <> ok) t CSV.Incremental.Done rs -> do ok <- rs & foldMapM check return (acc <> ok) -- | Sun, C.C., Hendrix, P., Ma, J. et al. -- Chinese lexical database (CLD). -- Behav Res 50, 2606–2629 (2018). -- https://doi.org/10.3758/s13428-018-1038-3 readChineseLexicalDatabase :: IO ChineseDict readChineseLexicalDatabase = do withDataFile "data/langs/mandarin/chineselexicaldatabase2.1.csv" \fileHandle -> do loop fileHandle mempty $ CSV.Incremental.decodeWithP parser decodeOpts CSV.HasHeader where decodeOpts = CSV.defaultDecodeOptions parser :: CSV.Record -> CSV.Parser (DictKey, ChineseDictEntries) parser v | length v == 269 = do key <- v .! 0 keyLength :: Int <- v .! 5 pinyin1 <- v .! 15 pinyin2 <- v .! 16 pinyin3 <- v .! 17 pinyin4 <- v .! 18 let chinesePinyins = [pinyin1, pinyin2, pinyin3, pinyin4] & List.take keyLength freqPerMillion :: Double <- v .! 
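-- ExplanationNote: column 39 of the CLD record is read as a per-million
-- frequency; dividing by 1000000 below rescales it into a Probability.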
39 let chineseFrequency = freqPerMillion & toRational & (/ 1000000) & probability & fromMaybe (error "readChineseLexicalDatabase: Frequency") & Just pure (key, mempty{chineseFrequency, chinesePinyins}) | otherwise = error $ "readChineseLexicalDatabase: wrong number of columns: " <> show v check = either (\x -> Sys.print x >> return mempty) \(key, val) -> return $ ChineseDict $ Map.singleton key val loop fileHandle !acc = \case CSV.Incremental.Fail _ errMsg -> do Sys.putStrLn errMsg Sys.exitFailure CSV.Incremental.Many rs k -> do ok <- rs & foldMapM check t <- feed k fileHandle loop fileHandle (acc <> ok) t CSV.Incremental.Done rs -> do ok <- rs & foldMapM check return (acc <> ok) readCEDICT :: IO ChineseDict readCEDICT = do withDataFile "data/langs/mandarin/CEDICT/cedict_ts.u8" \cedictHandle -> do let skipHeader = do isEOF <- cedictHandle & Sys.hIsEOF if isEOF then return () else do lineBS <- ByteString.hGetLine cedictHandle let lineST = lineBS & ShortText.fromByteString & fromMaybe (error "invalid UTF-8") let begin = lineST & ShortText.take 1 when (begin == "#") do skipHeader skipHeader let loop !acc = do isEOF <- cedictHandle & Sys.hIsEOF if isEOF then return acc else do line <- ByteString.hGetLine cedictHandle let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8") -- DescriptionNote: each line is formatted as: #(.+) (.+) \[(.+)] /(.*)/#' let skipChar c = void $ MT.state $ ByteString.span (== fromIntegral (fromEnum c)) let skipPrefix p = MT.modify' $ ByteString.stripPrefix p >>> fromMaybe (error $ "skipPrefix fail to match: " <> show p) let breakOnChar c = ByteString.break (== fromIntegral (fromEnum c)) let breakOnSpace = breakOnChar ' ' let skipSuffix p = MT.modify' $ ByteString.stripSuffix p >>> fromMaybe ( error $ "skipSuffix: mismatch: " <> show p <> "\n on line: " <> ShortText.unpack (decodeUtf8 line) <> "\n escaped: " <> show (ShortText.unpack (decodeUtf8 line)) ) let (dict, leftover) = (`MT.runState` line) do _chineseTrad <- MT.state $ breakOnSpace >>> first decodeUtf8 skipChar ' ' chineseSimpl <- MT.state $ breakOnSpace >>> first decodeUtf8 skipChar ' ' skipPrefix "[" chinesePinyins <- MT.state $ breakOnChar ']' >>> first ( \s -> s & ByteString.split (fromIntegral (fromEnum ' ')) -- CorrectnessWarning: convert pinyin to lowercase, and remove dashes <&> (decodeUtf8 >>> ShortText.unpack >>> (<&> Char.toLower) >>> ShortText.pack) & List.filter (/= "-") ) skipPrefix "] /" -- CorrectnessNote: some lines do not end with \r -- hence make it optional. 
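-- ExplanationNote: for an (illustrative) entry such as
--   "你好 你好 [ni3 hao3] /hello/hi/"
-- the remaining state here is "hello/hi/" (possibly followed by '\r');
-- the optional '\r' and the trailing '/' are stripped next, and the
-- glosses are split on '/' into ["hello", "hi"].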
MT.modify' \s -> s & ByteString.stripSuffix "\r" & fromMaybe s skipSuffix "/" chineseEnglish <- MT.gets \s -> s & ByteString.split (fromIntegral (fromEnum '/')) <&> decodeUtf8 MT.put mempty let chinese = chineseSimpl return $ ChineseDict $ Map.singleton chinese $ mempty { chinesePinyins , chineseEnglish } if not (ByteString.null leftover) then error $ "parserLine: leftover: " <> show leftover else loop (acc <> dict) loop mempty lookupPinyins :: ChineseDict -> ShortText -> [ShortText] lookupPinyins (ChineseDict dict) word = word & (`Map.lookup` dict) & fromMaybe (error $ "lookupPinyins: no entry for: {" <> wordString <> "}") & chinesePinyins & (\ps -> if null ps then error $ "lookupPinyins: empty entry for: " <> wordString else ps) where wordString = word & ShortText.unpack pronunciation :: ChineseDict -> [Pron.Lexeme] -> [[Either Char Pron.Pron]] pronunciation dict inp = inp & Pron.lexemesChars & Text.pack & Text.words <&> wordToPron where wordToPron :: Text -> [Either Char Pron.Pron] wordToPron wordText = [ pronun chars charCateg charBlock | (charCateg, uniBlockToChars) <- wordText & Text.unpack & List.map (\c -> (Char.generalCategory c, (Char.unicodeBlock c, c))) & Char.consecutiveGroups <&> (<&> Char.consecutiveGroups) , (charBlock, chars) <- uniBlockToChars ] & mconcat where pronun :: [Char] -> Char.GeneralCategory -> Maybe Char.UnicodeBlock -> [Either Char Pron.Pron] pronun chars Char.DecimalNumber _ = [ Right Pron.Pron { pronInput = [Pron.LexemeChar char] , pronRule = Pron.rule { Pron.rulePron = Pron.Pronunciations { Pron.unPronunciations = [ Pron.RuleLexemes [Pron.LexemeChar char] := Pron.Pronunciation { Pron.pronunciationIPABroad = [] , Pron.pronunciationText = txt char } ] } } } | char <- chars ] where txt c = c & ShortText.singleton & lookupPinyins dict <&> ( \p -> p & numberedPinyinToDiacriticPiniyn & ShortText.toText & Text.toLower ) & Text.intercalate " " pronun chars Char.OpenPunctuation _ = [ Left c | c <- chars ] pronun chars Char.ClosePunctuation _ = [ Left c | c <- chars ] pronun chars Char.OtherPunctuation _ = [ Left c | c <- chars ] pronun chars _ (Just Char.UnicodeBlockCJK{}) = [ Right Pron.Pron { pronInput = Pron.LexemeChar <$> chars , pronRule = Pron.rule { Pron.rulePron = Pron.Pronunciations { Pron.unPronunciations = [ Pron.RuleLexemes (Pron.LexemeChar <$> chars) := Pron.Pronunciation { Pron.pronunciationIPABroad = [] , Pron.pronunciationText = txt } ] } } } ] where txt = chars & ShortText.pack & lookupPinyins dict & List.map ( \pinyin -> pinyin & numberedPinyinToDiacriticPiniyn & ShortText.toText & Text.toLower ) & Text.intercalate " " -- if List.length wordPinyins == Text.length wordText = wordPinyins -- where -- wordPinyins = chars -- & ShortText.pack -- & lookupPinyins dict pronun chars categ block = errorShow ("pronunciation" :: Text, chars, categ, block) orderHTML dict@(ChineseDict d) = do dataPath <- Self.getDataDir <&> File.normalise return $ Blaze.renderMarkupBuilder do HTML.docTypeHtml do HTML.head do HTML.title $ ("Chinese Order" :: Text) & HTML.toHtml forM_ ( [ "styles/Paper.css" , "styles/Rosetta/Common.css" , "styles/Rosetta/Reading.css" ] & list ) \cssFile -> HTML.link ! HA.rel "stylesheet" ! HA.type_ "text/css" ! HA.href (dataPath cssFile & HTML.toValue) -- HTML.styleCSS $ pagesDifficulties & difficultyCSS HTML.body do let ChineseOrder o = dict & dictToWeights & dictOrder HTML.ol do forM_ ( o & Map.take 5000 & Map.elems & foldMap (weightsMap >>> Map.keys) ) \k -> HTML.li ! 
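-- ExplanationNote: "break-inside: avoid" keeps an entry from being split
-- across print columns or pages, and "list-style-position: inside" draws
-- the ordinal marker inside the item's own box.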
HTML.styles [ "break-inside" := "avoid" , "list-style-position" := "inside" ] $ do HTML.span $ k & HTML.toHtml forM_ (d & Map.lookup k) \v -> do HTML.ul ! HTML.styles [] $ do HTML.li do HTML.span $ v & chinesePinyins <&> ShortText.toText & Text.unwords & HTML.toHtml HTML.li do HTML.span $ v & chineseDecomp k & Text.pack & HTML.toHtml HTML.li do HTML.span $ v & chineseEnglish <&> ShortText.toText & Text.intercalate "; " & HTML.toHtml orderDOT :: ChineseDict -> IO BS.Builder orderDOT dict@(ChineseDict d) = do DOT.runDOT do -- dotComments [(BS.lazyByteString $ LazyText.encodeUtf8 $ pShow $ rangeToMstToGroupToClusters & Map.map Map.keys)] -- pTraceShow ("num of nodes", Map.size nodeToBranch, "num of branches", Map.size msf) $ DOT.dotLine "digraph g" {- dotBlock do dotLine "splines=\"ortho\"" indexFrom1M_ (rangeToMstToGroupToClusters & Map.toList) \(srcR, mstToGroupToClusters) srcRI -> do let srcRB = "r" <> BS.intDec srcRI dotLine $ "subgraph cluster_" <> srcRB dotBlock do dotComments ["Create a node for the range " <> srcRB] dotNode srcRB [ ("shape", "box") , ("label", builderQuotedString (showHuman srcR)) , ("color", "gray") , ("style", "filled") , ("fillcolor", "gray") ] dotLine "color=gray" dotBlock do dotLine "rank=same" dotComments ["Create the cluster nodes within the range " <> srcRB] forM_ (mstToGroupToClusters & Map.toList) \(mstI, groupToClusters) -> do forM_ (groupToClusters & Map.toList) \(srcGroup, srcClusters) -> do dotNodeCluster srcRI mstI srcGroup [ ( "label" , builderQuotedString $ (srcClusters & toList <&> showHuman & List.unlines) <> "\nT" <> printf "%03d" mstI <> "\nS" <> show scaleI -- <> {-maybe ""-} (("\n" <>) . showSimilarity) minSimil ) , ("style", "filled") , -- , minSimil & {-maybe ("", "")-} (\s -> ("fillcolor", 1 + ((floor (runProbability s * 10)) `mod` 10) & BS.intDec)) ("colorscheme", "ylorrd9") , ("shape", "box") ] dotComments ["Horizontally align the cluster nodes within the same range"] let row = [ (mstI, group) | (mstI, groupToClusters) <- mstToGroupToClusters & Map.toList , (group, _clusters) <- groupToClusters & Map.toList ] case row of [] -> return () c@(firstMst, firstGroup) : cs -> do dotEdges [srcRB, srcRB <> "t" <> BS.intDec firstMst <> "c" <> BS.intDec firstGroup] [ ("style", "invis") ] cs & (`foldM_` c) \(srcMst, srcGroup) dst@(dstMst, dstGroup) -> do dotEdgesCluster [(srcRI, srcMst, srcGroup), (srcRI, dstMst, dstGroup)] [ ("weight", "10") , ("style", "invis") ] return dst indexFrom1M_ sortedMSF \mst mstI -> do dotComments ["Create the edges of the MST " <> BS.intDec mstI] -- pTraceShowM (mstI, List.length (Tree.flatten mst)) let loop (Tree.Node MSTNode{mstNodeRangeCluster = src} dsts) = do forM_ dsts \dstNode@(Tree.Node MSTNode{mstNodeRangeCluster = dst, mstNodeSimilarity = simil} _) -> do -- let similB = BS.stringUtf8 $ showFFloat (Just 2) (simil & runProbability & fromRational @Double) "" let indexRangeCluster (r, c) = let clusterToGroup :: cluster :-> ClusterGroup = Map.fromList [ (cluster, group) | (group, clusters) <- rangeToMstToGroupToClusters & Map.lookup r & fromMaybe Map.empty & Map.lookup mstI & fromMaybe Map.empty & Map.toList , cluster <- clusters & Set.toList ] in ( 1 + Map.findIndex r rangeToMstToGroupToClusters , mstI , Map.lookup c clusterToGroup & fromMaybe (error (LazyText.unpack (pShow ("r", r, "c", c {-, "clusterToGroup", clusterToGroup, "rangeToMstToGroupToClusters", rangeToMstToGroupToClusters-})))) ) dotEdgesCluster [ indexRangeCluster src , indexRangeCluster dst ] [ ("constraint", "false") , ("color", (floor (runProbability 
simil * 10)) `mod` 10 & BS.intDec) , ("colorscheme", "ylorrd9") , -- , ("label", similB) ("fontcolor", "blue") , ("dir", "both") , ("arrowhead", "dot") , ("arrowtail", "dot") ] loop dstNode loop mst dotRanges rangeToMstToGroupToClusters -}
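-- ExplanationNote: a minimal end-to-end sketch (assuming the data files are
-- available at runtime): load the merged dictionary, look one word up, and
-- print its diacritic reading; the helper name `printReading` is illustrative
-- and not used elsewhere in this module.
printReading :: ShortText -> IO ()
printReading word = do
  dict <- readChineseDict
  let reading =
        lookupPinyins dict word
          <&> numberedPinyinToDiacriticPiniyn
          & List.intersperse " "
          & mconcat
  Sys.putStrLn (ShortText.unpack word <> "  " <> ShortText.unpack reading)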