module Worksheets.Text where import Control.Monad (when) import Data.ByteString.Builder (Builder) import Data.Char qualified as Char import Data.Foldable (foldMap) import Data.List qualified as List import Data.Map.Strict qualified as Map import Data.Maybe (Maybe (..)) import Data.Text (Text) import Data.Text qualified as Text import Paths_worksheets qualified as Self import System.FilePath.Posix (()) import System.FilePath.Posix qualified as File import Text.Blaze import Text.Blaze.Html5 qualified as H import Text.Blaze.Html5.Attributes qualified as HA import Text.Blaze.Renderer.Utf8 qualified as Blaze import Text.Show (Show (..)) import Prelude type Texte = [TexteAtome] data TexteAtome = TexteAtome { texteAtomeLangue :: Langue , texteAtomeGlyphs :: Text , texteAtomePhonetic :: Maybe Text } deriving (Eq, Ord, Show) data Langue = LangueFrançais | LangueAnglais | LangueMandarin | LangueMandarinPinyin | LanguePhonetic deriving (Eq, Ord, Show) splitOnWords :: Text -> [Text] splitOnWords = splitOnChar splitterOnWords splitOnChar :: (Char -> Bool) -> Text -> [Text] splitOnChar f = go where go t = case f `Text.span` t of (l, r) | Text.null l && Text.null r -> [] | Text.null l -> Text.singleton (Text.head t) : go (Text.tail t) | Text.null r -> [l] | otherwise -> l : go r splitterOnWords :: Char -> Bool splitterOnWords c = case Char.generalCategory c of Char.UppercaseLetter -> False Char.LowercaseLetter -> False Char.TitlecaseLetter -> False Char.ModifierLetter -> False Char.OtherLetter -> False Char.NonSpacingMark -> True Char.SpacingCombiningMark -> True Char.EnclosingMark -> True Char.DecimalNumber -> False Char.LetterNumber -> False Char.OtherNumber -> False Char.ConnectorPunctuation -> True Char.DashPunctuation -> True Char.OpenPunctuation -> True Char.ClosePunctuation -> True Char.InitialQuote -> True Char.FinalQuote -> True Char.OtherPunctuation -> True Char.MathSymbol -> True Char.CurrencySymbol -> True Char.ModifierSymbol -> True Char.OtherSymbol -> True Char.Space -> True Char.LineSeparator -> True Char.ParagraphSeparator -> True Char.Control -> True Char.Format -> True Char.Surrogate -> True Char.PrivateUse -> True Char.NotAssigned -> True data CharClass = CharClassLetter | CharClassMark | CharClassNumber | CharClassPunctuation | CharClassQuote | CharClassSymbol | CharClassSeparator | CharClassOther deriving (Eq, Ord, Show) charClassifiers = [ (CharClassLetter, isLetter) , (CharClassMark, isMark) , (CharClassNumber, isNumber) , (CharClassPunctuation, isPunctuation) , (CharClassQuote, isQuote) , (CharClassSymbol, isSymbol) , (CharClassSeparator, isSeparator) , (CharClassOther, isOther) ] where isLetter Char.UppercaseLetter = True isLetter Char.LowercaseLetter = True isLetter Char.TitlecaseLetter = True isLetter Char.ModifierLetter = True isLetter Char.OtherLetter = True isLetter _ = False isMark Char.NonSpacingMark = True isMark Char.SpacingCombiningMark = True isMark Char.EnclosingMark = True isMark _ = False isNumber Char.DecimalNumber = True isNumber Char.LetterNumber = True isNumber Char.OtherNumber = True isNumber _ = False isPunctuation Char.ConnectorPunctuation = True isPunctuation Char.DashPunctuation = True isPunctuation Char.OpenPunctuation = True isPunctuation Char.ClosePunctuation = True isPunctuation Char.OtherPunctuation = True isPunctuation _ = False isQuote Char.InitialQuote = True isQuote Char.FinalQuote = True isQuote _ = False isSymbol Char.MathSymbol = True isSymbol Char.CurrencySymbol = True isSymbol Char.ModifierSymbol = True isSymbol Char.OtherSymbol = True isSymbol _ = False isSeparator Char.Space = True isSeparator Char.LineSeparator = True isSeparator Char.ParagraphSeparator = True isSeparator _ = False isOther Char.Control = True isOther Char.Format = True isOther Char.Surrogate = True isOther Char.PrivateUse = True isOther Char.NotAssigned = True isOther _ = False classifyText = go charClassifiers where go _ t | Text.null t = [] go [] t = (CharClassOther, Text.singleton (Text.head t)) : go charClassifiers (Text.tail t) go ((clsD, clsF) : fs) t = case Text.span (clsF . Char.generalCategory) t of (l, r) | Text.null l -> go fs t | otherwise -> (clsD, l) : go charClassifiers r isClassUnderscore (CharClassPunctuation, Text.all (== '_') -> True) = True isClassUnderscore _ = False isClassSeparator = \case (CharClassSeparator, _) -> True _ -> False splitOnUnderscore = splitCharClass isClassUnderscore splitOnSeparator = splitCharClass isClassSeparator splitCharClass :: ((CharClass, Text) -> Bool) -> [(CharClass, Text)] -> [[(CharClass, Text)]] splitCharClass isSep = go where go [] = [] go ts = case ts & List.span isSep of ([], _r) -> case ts & List.break isSep of (l, r) -> l : go r (_l, r) -> go r zipTexte :: Texte -> Texte -> [(TexteAtome, TexteAtome)] zipTexte = List.zip -- m0 = "开心果 冰淇淋 很 好吃" -- p0 = "kāi_xīn_guǒ bīng_qí_lín hěn hào_chī" splitMandarin t = t & classifyText & splitCharClass isClassSeparator <&> foldMap \(_cls, c) -> c & Text.unpack splitPinyin t = t & classifyText & splitCharClass isClassSeparator <&> foldMap \ct@(_cls, c) -> [c | not (isClassUnderscore ct)] -- z0 = List.zipWith (List.zip) m0 p0