module Language where

import Data.List qualified as List
import Data.Set qualified as Set
import Data.Text qualified as Text
import Data.Text.Short qualified as ShortText
import Worksheets.Utils.Char qualified as Char
import Worksheets.Utils.IPA qualified as IPA
import Worksheets.Utils.Prelude
import Prelude (error)

data Langue
  = LangueFrançais
  | LangueAnglais
  | LangueMandarin
  | LangueMandarinPinyin
  | LanguePhonetic
  deriving (Eq, Ord, Show)
instance HasTypeDefault Langue where
  typeDefault = LangueAnglais
langues :: Set Langue
langues =
  [ LangueFrançais
  , LangueAnglais
  , LangueMandarin
  , LangueMandarinPinyin
  , LanguePhonetic
  ]
    & Set.fromList

data CharMeta = CharMeta
  { charMetaChar :: Char
  , charMetaUnicodeCategory :: Char.GeneralCategory
  , charMetaUnicodeBlock :: Maybe Char.UnicodeBlock
  }
  deriving (Eq, Ord, Show)

metanizer :: Text -> [CharMeta]
metanizer t =
  t & Text.unpack <&> \c ->
    CharMeta
      { charMetaChar = c
      , charMetaUnicodeCategory = c & Char.generalCategory
      , charMetaUnicodeBlock = c & Char.unicodeBlock
      }

data Token = Token
  { tokenText :: ShortText -- Char
  , tokenMeta :: (Char.GeneralCategory, Maybe Char.UnicodeBlock)
  }
  deriving (Eq, Ord, Show)

tokenizer :: Text -> [Token]
tokenizer t =
  t & Text.unpack <&> \c ->
    Token
      { tokenText = c & ShortText.singleton
      , tokenMeta =
          ( c & Char.generalCategory
          , c & Char.unicodeBlock
          )
      }

rosettaTokenizer :: ShortText -> [Token]
rosettaTokenizer s = s & ShortText.unpack & group
  where
    group [] = []
    group (inpHead : inpTail) = tok : group rest
      where
        tok =
          Token
            { tokenText = inpHead : txt & ShortText.pack
            , tokenMeta
            }
        tokenMeta =
          ( inpHead & Char.generalCategory
          , inpHead & Char.unicodeBlock
          )
        (txt, rest) =
          inpTail & List.span \c ->
            (Char.generalCategory c, Char.unicodeBlock c) == tokenMeta

groupByHoriz :: [Token] -> [[Token]]
groupByHoriz = group
  where
    group [] = []
    group (inpHead : inpTail) =
      case inpHead of
        Token{tokenMeta = (Char.Space, _)} -> group rest
          where
            (_skipSpaces, rest) = inpTail & List.span onSep
        tok -> (tok : nonSeps) : group rest
          where
            (nonSeps, rest) = inpTail & List.break onSep
      where
        onSep = \case
          Token{tokenText, tokenMeta = (Char.Space, _)}
            | tokenText & ShortText.unpack & all (== '\xA0') -> False
            | otherwise -> True
          _ -> False
splitWords :: [Token] -> [[Token]]
splitWords = group
  where
    group :: [Token] -> [[Token]]
    group [] = []
    group (inpHead : inpTail) =
      case inpHead of
        Token{tokenText = ShortText.unpack >>> all (== '\xA0') -> True, tokenMeta = (Char.Space, _)} -> group rest
          where
            (_skipSpaces, rest) = inpTail & List.span onSep
        tok -> (tok : nonSeps) : group rest
          where
            (nonSeps, rest) = inpTail & List.break onSep
      where
        onSep = \case
          Token{tokenText = ShortText.unpack >>> all (== '\xA0') -> True, tokenMeta = (Char.Space, _)} -> True
          _ -> False

{-
-- | CorrectnessNote: beware than the tokenMeta is just preserved,
-- it does not correspond to the pronunciation unicode code points.
chinesePronunciation :: ChineseDict -> [Token] -> [Token]
chinesePronunciation chineseDict toks =
  toks & List.concatMap \tok ->
    let tokText = tok & tokenText
    in let tokString = tokText & ShortText.unpack
       in case tok & tokenMeta of
            (_, Just Char.UnicodeBlockCJK{}) -> pinyins <&> \tokenText -> tok{tokenText}
              where
                pinyins :: [ShortText]
                pinyins
                  | tokString & all Char.isNumber =
                      tokString & List.concatMap \char ->
                        char & ShortText.singleton & lookupPinyins chineseDict
                  | List.length tokTextPins == ShortText.length tokText = tokTextPins
                  | otherwise = error "chinesePronunciation: pinyins length mismatch"
                tokTextPins = tokText & lookupPinyins chineseDict
            (_, _) -> tokString <&> \_c -> tok{tokenText = ""}
-}
rosettaWordChars :: [Token] -> [Token]
rosettaWordChars toks =
  toks & List.concatMap \tok ->
    let tokText = tok & tokenText
    in let tokString = tokText & ShortText.unpack
       in tokString <&> \char ->
            tok{tokenText = char & ShortText.singleton}