module Language where import Data.List qualified as List import Data.Set qualified as Set import Data.Text qualified as Text import Data.Text.Short qualified as ShortText import Worksheets.Utils.Char qualified as Char import Worksheets.Utils.IPA qualified as IPA import Worksheets.Utils.Prelude import Prelude (error) data Langue = LangueFrançais | LangueAnglais | LangueMandarin | LangueMandarinPinyin | LanguePhonetic deriving (Eq, Ord, Show) instance HasTypeDefault Langue where typeDefault = LangueAnglais langues :: Set Langue langues = [ LangueFrançais , LangueAnglais , LangueMandarin , LangueMandarinPinyin , LanguePhonetic ] & Set.fromList data CharMeta = CharMeta { charMetaChar :: Char , charMetaUnicodeCategory :: Char.GeneralCategory , charMetaUnicodeBlock :: Maybe Char.UnicodeBlock } deriving (Eq, Ord, Show) metanizer :: Text -> [CharMeta] metanizer t = t & Text.unpack <&> \c -> CharMeta { charMetaChar = c , charMetaUnicodeCategory = c & Char.generalCategory , charMetaUnicodeBlock = c & Char.unicodeBlock } data Token = Token { tokenText :: ShortText -- Char , tokenMeta :: (Char.GeneralCategory, Maybe Char.UnicodeBlock) } deriving (Eq, Ord, Show) tokenizer :: Text -> [Token] tokenizer t = t & Text.unpack <&> \c -> Token { tokenText = c & ShortText.singleton , tokenMeta = ( c & Char.generalCategory , c & Char.unicodeBlock ) } rosettaTokenizer :: ShortText -> [Token] rosettaTokenizer s = s & ShortText.unpack & group where group [] = [] group (inpHead : inpTail) = tok : group rest where tok = Token { tokenText = inpHead : txt & ShortText.pack , tokenMeta } tokenMeta = ( inpHead & Char.generalCategory , inpHead & Char.unicodeBlock ) (txt, rest) = inpTail & List.span \c -> (Char.generalCategory c, Char.unicodeBlock c) == tokenMeta groupByHoriz :: [Token] -> [[Token]] groupByHoriz = group where group [] = [] group (inpHead : inpTail) = case inpHead of Token{tokenMeta = (Char.Space, _)} -> group rest where (_skipSpaces, rest) = inpTail & List.span onSep tok -> (tok : nonSeps) : group rest where (nonSeps, rest) = inpTail & List.break onSep where onSep = \case Token{tokenText, tokenMeta = (Char.Space, _)} | tokenText & ShortText.unpack & all (== '\xA0') -> False | otherwise -> True _ -> False splitWords :: [Token] -> [[Token]] splitWords = group where group :: [Token] -> [[Token]] group [] = [] group (inpHead : inpTail) = case inpHead of Token{tokenText = ShortText.unpack >>> all (== '\xA0') -> True, tokenMeta = (Char.Space, _)} -> group rest where (_skipSpaces, rest) = inpTail & List.span onSep tok -> (tok : nonSeps) : group rest where (nonSeps, rest) = inpTail & List.break onSep where onSep = \case Token{tokenText = ShortText.unpack >>> all (== '\xA0') -> True, tokenMeta = (Char.Space, _)} -> True _ -> False {- -- | CorrectnessNote: beware than the tokenMeta is just preserved, -- it does not correspond to the pronunciation unicode code points. chinesePronunciation :: ChineseDict -> [Token] -> [Token] chinesePronunciation chineseDict toks = toks & List.concatMap \tok -> let tokText = tok & tokenText in let tokString = tokText & ShortText.unpack in case tok & tokenMeta of (_, Just Char.UnicodeBlockCJK{}) -> pinyins <&> \tokenText -> tok{tokenText} where pinyins :: [ShortText] pinyins | tokString & all Char.isNumber = tokString & List.concatMap \char -> char & ShortText.singleton & lookupPinyins chineseDict | List.length tokTextPins == ShortText.length tokText = tokTextPins | otherwise = error "chinesePronunciation: pinyins length mismatch" tokTextPins = tokText & lookupPinyins chineseDict (_, _) -> tokString <&> \_c -> tok{tokenText = ""} -} rosettaWordChars :: [Token] -> [Token] rosettaWordChars toks = toks & List.concatMap \tok -> let tokText = tok & tokenText in let tokString = tokText & ShortText.unpack in tokString <&> \char -> tok{tokenText = char & ShortText.singleton}