{-# LANGUAGE OverloadedLists #-}
{-# LANGUAGE OverloadedStrings #-}

module Language.Chinese where

import Control.Applicative (Alternative (..))
import Control.Monad.ST qualified as ST
import Control.Monad.Trans.State qualified as MT
import Data.Aeson qualified as JSON
import Data.Aeson.Key qualified as JSON.DictKey
import Data.Aeson.KeyMap qualified as JSON
import Data.ByteString (ByteString)
import Data.ByteString qualified as ByteString
import Data.ByteString.Builder qualified as BS
import Data.Char qualified as Char
import Data.Csv ((.!))
import Data.Csv qualified as CSV
import Data.Csv.Incremental qualified as CSV.Incremental
import Data.List qualified as List
import Data.Map.Strict qualified as Map
import Data.STRef qualified as ST
import Data.Set qualified as Set
import Data.Text qualified as Text
import Data.Text.Short qualified as ShortText
import Graph.DOT qualified as DOT
import Numeric.Decimal.BoundedArithmetic (arithError, arithM)
import Paths_worksheets qualified as Self
import System.Exit qualified as Sys
import System.FilePath qualified as Sys
import System.FilePath.Posix ((</>))
import System.FilePath.Posix qualified as File
import System.IO qualified as Sys
import Text.Read (read, readMaybe)
import Worksheets.Utils.HTML (className, classes, styles, (!))
import Worksheets.Utils.HTML qualified as HTML
import Prelude (error, fromIntegral, undefined, (/))

-- import Text.Blaze
import Text.Blaze.Html5 qualified as HTML
import Text.Blaze.Html5.Attributes qualified as HA
import Text.Blaze.Renderer.Utf8 qualified as Blaze

import Language.Pronunciation qualified as Pron
import Worksheets.Utils.Char qualified as Char
import Worksheets.Utils.JSON
import Worksheets.Utils.Prelude
import Worksheets.Utils.Probability

newtype ChineseDict = ChineseDict (Map ShortText ChineseDictEntries)
  deriving (Show)

unChineseDict (ChineseDict x) = x

instance Semigroup ChineseDict where
  ChineseDict x <> ChineseDict y = ChineseDict (Map.unionWithKey merge x y)
   where
    merge _k !xV !yV = xV <> yV

keyLengthToSumOfEntries :: ChineseDict -> Map Natural Natural
keyLengthToSumOfEntries (ChineseDict d) =
  [ ( k & ShortText.length & fromIntegral
    , 1
    )
  | k <- d & Map.keys
  ]
    & Map.fromListWith (+)

chineseFrequencySum :: ChineseDict -> Double
chineseFrequencySum (ChineseDict d) =
  [ pure freq
  | e <- d & Map.elems
  , freq <- e & chineseFrequency & maybeToList
  ]
    & sum
    & arithError
    & runProbability
    & fromRational @Double

instance Monoid ChineseDict where
  mempty = ChineseDict mempty

-- FIXME: use a sum type Char/Word
data ChineseDictEntries = ChineseDictEntries
  { chinesePinyins :: ![ShortText]
  , -- TODO: , chineseIPA :: !IPA.IPA
    chineseEnglish :: ![ShortText]
  , chineseFrequency :: !(Maybe Probability)
  , chineseComponents :: !(Maybe [Char])
  , chineseStrokes :: !(Maybe Natural)
  -- ^ FIXME: use Natural
  }
  deriving (Generic, Show)

chineseDecomp :: ShortText -> ChineseDictEntries -> [Char]
chineseDecomp key ChineseDictEntries{chineseComponents} =
  chineseComponents
    & fromMaybe
      (if 1 < ShortText.length key then ShortText.unpack key else [])

instance Semigroup ChineseDictEntries where
  x <> y =
    ChineseDictEntries
      { chinesePinyins =
          if null (chinesePinyins x)
            then chinesePinyins y
            else
              if null (chinesePinyins y) || chinesePinyins x == chinesePinyins y
                then chinesePinyins x
                else
                  -- traceString
                  --   ( mconcat $
                  --       [ "Semigroup: chinesePinyins: mismatch: "
                  --       , show (chinesePinyins x)
                  --       , " <> "
                  --       , show (chinesePinyins y)
                  --       ]
                  --   )
                  (chinesePinyins x)
      , chineseEnglish = chineseEnglish x <> chineseEnglish y
      , chineseFrequency =
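          -- ExplanationNote: `(<|>)` on `Maybe` is left-biased, so when both
          -- operands carry a frequency the left one (whichever source was
          -- merged first in `readChineseDict`) is kept, e.g. (illustrative
          -- values only): `Just 0.1 <|> Just 0.2 == Just 0.1`.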
chineseFrequency x <|> chineseFrequency y , chineseComponents = case chineseComponents x of Nothing -> chineseComponents y Just xS -> case chineseComponents y of Nothing -> chineseComponents x Just yS | xS == yS -> chineseComponents x | otherwise -> error $ "Semigroup: chineseComponents: mismatch: " <> show xS <> " <> " <> show yS , chineseStrokes = case chineseStrokes x of Nothing -> chineseStrokes y Just xS -> case chineseStrokes y of Nothing -> chineseStrokes x Just yS | xS == yS -> chineseStrokes x | otherwise -> error $ "Semigroup: chineseStrokes: mismatch: " <> show xS <> " <> " <> show yS } instance Monoid ChineseDictEntries where mempty = ChineseDictEntries { chinesePinyins = mempty , chineseEnglish = mempty , chineseFrequency = Nothing , chineseComponents = Nothing , chineseStrokes = Nothing } readLoachWangFreq :: IO ChineseDict readLoachWangFreq = do readJSON "data/langs/mandarin/LoachWang/word_freq.json" $ JSON.withObject "WordFreq" \kv -> ChineseDict . Map.fromAscList <$> forM (kv & JSON.toMap & Map.toList) \(k, v) -> v & JSON.withScientific "Frequency" \freqSci -> do let freq = freqSci & toRational & probability & arithError return ( k & JSON.DictKey.toShortText , mempty{chineseFrequency = Just freq} ) readLoachWangStrokes :: IO ChineseDict readLoachWangStrokes = readJSON ("data/langs/mandarin/LoachWang/char_strokes.json") $ JSON.withObject "WordFreq" \kv -> ChineseDict . Map.fromAscList <$> forM (kv & JSON.toMap & Map.toList) \(k, v) -> do strokes <- JSON.parseJSON v return ( k & JSON.DictKey.toShortText , mempty{chineseStrokes = Just strokes} ) readChiseStrokes :: IO ChineseDict readChiseStrokes = withDataFile "data/langs/mandarin/cjkvi-ids/ucs-strokes.txt" \dataHandle -> do let loop accDict@(ChineseDict !acc) = do isEOF <- dataHandle & Sys.hIsEOF if isEOF then return accDict else do line <- ByteString.hGetLine dataHandle let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8") let splitOnChar c = ByteString.split (fromIntegral (fromEnum c)) case line & splitOnChar '\t' of [unicodeLit, unicodeChar, chineseStrokesLit] | unicodeLit & ByteString.isPrefixOf "U+" -> loop $ ChineseDict $ acc & Map.insert (unicodeChar & decodeUtf8) mempty{chineseStrokes} where chineseStrokes :: Maybe Natural = chineseStrokesLit & splitOnChar ',' <&> ( decodeUtf8 >>> ShortText.unpack >>> readMaybe @Natural ) -- CorrectnessWarning: conservative bias toward the greatest number of strokes. 
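-- ExplanationNote: e.g. a (hypothetical) line "U+4E0D\t不\t4,5" yields
-- [Just 4, Just 5] here; `List.maximum` then keeps `Just 5`, the larger
-- of the alternative stroke counts.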
& List.maximum <&> fromIntegral _ -> loop accDict loop mempty readChiseDecomp :: IO ChineseDict readChiseDecomp = withDataFile "data/langs/mandarin/cjkvi-ids/ids.txt" $ (`loop` mempty) where loop dataHandle accDict@(ChineseDict !acc) = do isEOF <- dataHandle & Sys.hIsEOF if isEOF then return accDict else do line <- ByteString.hGetLine dataHandle let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8") let splitOnChar c = ByteString.split (fromIntegral (fromEnum c)) case line & splitOnChar '\t' of unicodeLit : (decodeUtf8 -> unicodeChar) : decomps | unicodeLit & ByteString.isPrefixOf "U+" -> let decompWithCodes :: [([Char], Set Char)] = decomps <&> \decomp -> case decomp & ByteString.stripSuffix "]" of Nothing -> ( decomp & decodeUtf8 & ShortText.unpack , mempty ) Just decompWithCode -> ( decompChars & ShortText.unpack -- ExplanationNote: remove the final '[' & List.init , decompCodes & ShortText.unpack & fromList ) where (decompChars, decompCodes) = decompWithCode & decodeUtf8 & ShortText.breakEnd (== '[') chineseComponents = decompWithCodes -- FIXME: CorrectnessWarning: maybe select using the Codes -- instead of just taking the first. & List.head & fst -- ExplanationNote: remove IDCs (Ideographic Description Characters) -- namely "⿰" (U+2FF0) to "⿻" (U+2FFB). & List.filter (\c -> not ('\x2FF0' <= c && c <= '\x2FFB')) & List.filter (\c -> ShortText.singleton c /= unicodeChar) & Just in loop dataHandle $ ChineseDict $ acc & Map.insert unicodeChar mempty{chineseComponents} _ -> loop dataHandle accDict readChineseDict :: IO ChineseDict readChineseDict = fold $ list [ return extraEntries , readLoachWangFreq , readChiseStrokes , readChiseDecomp , -- , readChineseLexicalDatabase readCEDICT -- , readLoachWangStrokes -- , readOutlierDecomp ] extraEntries :: ChineseDict extraEntries = ChineseDict $ [ [ ( "·" , mempty { chineseStrokes = Just 1 , chineseComponents = Just ['·'] } ) , ( "库蠓属" , mempty { chinesePinyins = ["ku4", "meng3", "shi4"] , chineseEnglish = ["culicoides"] } ) , ( "瘦果" , mempty { chinesePinyins = ["shou4", "guo3"] , chineseEnglish = ["achene"] } ) , ( "美洲野马" , mempty { chinesePinyins = ["mei3", "zhou1", "ye3", "ma3"] , chineseEnglish = ["mustang horse"] } ) , ( "钩粉蝶" , mempty { chinesePinyins = ["gou1", "fen3", "die2"] , chineseEnglish = ["brimstone butterfly"] } ) , ( "核果" , mempty { chinesePinyins = ["he2", "guo3"] , chineseEnglish = ["drupe"] } ) , ( "0" , mempty { chineseStrokes = Just 14 , chinesePinyins = ["ling2"] -- 零 } ) , ( "1" , mempty { chineseStrokes = Just 2 , chinesePinyins = ["yi1"] } ) , ( "2" , mempty { chineseStrokes = Just 2 , chinesePinyins = ["er4"] } ) , ( "3" , mempty { chineseStrokes = Just 2 , chinesePinyins = ["san1"] } ) , ( "4" , mempty { chineseStrokes = Just 2 , chinesePinyins = ["si4"] } ) , ( "5" , mempty { chineseStrokes = Just 2 , chinesePinyins = ["wu3"] } ) , ( "6" , mempty { chineseStrokes = Just 1 , chinesePinyins = ["liu4"] } ) , ( "7" , mempty { chineseStrokes = Just 1 , chinesePinyins = ["qi1"] } ) , ( "8" , mempty { chineseStrokes = Just 2 , chinesePinyins = ["ba1"] } ) , ( "9" , mempty { chineseStrokes = Just 1 , chinesePinyins = ["jiu3"] } ) ] , [ ( letter & ShortText.singleton , mempty { chineseStrokes = Just 1 , chinesePinyins = [] } ) | letter <- [ ['A' .. 'Z'] , ['a' .. 
'z'] , ['π', '□', '○'] ] & mconcat ] ] & mconcat & Map.fromList readLoachWangWordOrder :: IO JSON.Array readLoachWangWordOrder = readJSON "data/langs/mandarin/LoachWang/loach_word_order.json" $ JSON.withArray "LoachWordOrder" \arr -> return arr readOutlierDecomp :: IO ChineseDict readOutlierDecomp = readJSON "data/langs/mandarin/LoachWang/outlier_decomp.json" $ JSON.withObject "OutlierDecomp" \kv -> ChineseDict . Map.fromAscList <$> forM (kv & JSON.toMap & Map.toList) \(k, v) -> v & JSON.withArray "Decomp" \decomp -> forM (decomp & toList) ( \sub -> sub & JSON.withText "Atom" \atom -> return $ atom & Text.unpack ) <&> \atoms -> ( k & JSON.DictKey.toShortText , mempty{chineseComponents = Just $ atoms & mconcat} ) reverseDecomp :: ChineseDict -> Map {-def-} ShortText {-refs-} [ShortText] reverseDecomp (ChineseDict dict) = [ sub := [key] | (key, entry) <- dict & Map.toList , sub <- entry & chineseDecomp key <&> ShortText.singleton ] & Map.fromListWith (<>) type Benefit = Probability type Cost = Natural type Weight = Probability type DictKey = ShortText data Weights = Weights { weightsMap :: Map DictKey (Benefit, Cost) } deriving (Show) -- | -- Zhao, K., zhao, D., Yang, J., Ma, W., & Yu, B. (2019). -- Efficient Learning Strategy of Chinese Characters Based on -- Usage Frequency and Structural Relationship. -- 2019 IEEE 4th International Conference on Big Data Analytics (ICBDA). -- doi:10.1109/icbda.2019.8713245 -- https://sci-hub.se/10.1109/icbda.2019.8713245 dictToWeights :: ChineseDict -> Weights dictToWeights cn@(ChineseDict dict) = ST.runST do -- ExplanationNote: memo tables, to avoid recomputing benefits and costs keyToBenefitST :: ST.STRef st (Map DictKey Benefit) <- ST.newSTRef Map.empty keyToCostST :: ST.STRef st (Map DictKey Cost) <- ST.newSTRef Map.empty let refsMap :: Map ShortText [ShortText] = reverseDecomp cn let keyToBenefitLoop :: DictKey -> ST.ST st Benefit keyToBenefitLoop key = do -- traceShowM ("keyToBenefitLoop"::String, key) keyToBenefitMap <- keyToBenefitST & ST.readSTRef keyToBenefitMap & Map.lookup key & \case Just bene -> return bene Nothing -> do let refs :: [ShortText] = refsMap & Map.lookup key & fromMaybe [] -- & fromMaybe [] & List.delete key -- traceShowM ("keyToBenefitLoop"::String, key, refs) let freq = dict & Map.lookup key & maybe proba0 (chineseFrequency >>> fromMaybe proba0) refsBC <- forM refs \ref -> do bene <- keyToBenefitLoop ref cost <- keyToCostLoop ref return (bene, cost) cost <- keyToCostLoop key bene <- arithM $ probability $ runProbability freq + toRational cost * sum [ runProbability b / toRational c | (b, c) <- refsBC , c /= 0 ] ST.modifySTRef keyToBenefitST $ Map.insert key bene return bene keyToCostLoop :: DictKey -> ST.ST st Cost keyToCostLoop key = do -- traceShowM ("keyToCostLoop"::String, key) keyToCostMap <- keyToCostST & ST.readSTRef keyToCostMap & Map.lookup key & \case Just cost -> return cost Nothing -> do let entry = dict & Map.lookup key & fromMaybe mempty cost <- -- single char if ShortText.length key == 1 then do let deps = entry & chineseDecomp key & List.delete (ShortText.unpack key & List.head) -- & (`Set.difference` (ShortText.unpack key & Set.fromList)) if null deps then -- atom char return $ entry & chineseStrokes & fromMaybe 1 else -- & fromMaybe (error $ "dictToWeights: missing chineseStrokes for: " <> ShortText.unpack key) do -- composite char -- traceShowM ("composite"::String, show key, show <$> Set.toList deps -- , deps & (`Set.difference` (ShortText.unpack key & Set.fromList)) -- ) deps & foldMapM (\dep -> Sum <$> 
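-- ExplanationNote: recurse into each component, so the cost of a composite
-- character is the sum of its components' costs, bottoming out at the stroke
-- counts of atomic components (defaulting to 1 when unknown); e.g. cost 好 =
-- cost 女 + cost 子, assuming the usual ⿰女子 decomposition is in the data.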
keyToCostLoop (dep & ShortText.singleton)) <&> getSum else -- word do -- traceShowM ("word"::String, key) key & ShortText.unpack & foldMapM (\dep -> Sum <$> keyToCostLoop (dep & ShortText.singleton)) <&> getSum ST.modifySTRef keyToCostST $ Map.insert key cost return cost weightsMap <- dict & Map.traverseWithKey \key _entry -> do -- traceShowM ("dictToWeights"::String, key) bene <- keyToBenefitLoop key cost <- keyToCostLoop key return (bene, cost) return Weights { weightsMap } newtype ChineseOrder = ChineseOrder (Map (Down Weight) Weights) unChineseOrder (ChineseOrder x) = x dictOrder :: Weights -> ChineseOrder dictOrder w = [ ( probability (runProbability b / toRational c) & arithError & Down , Weights { weightsMap = Map.singleton k v } ) | (k, v@(b, c)) <- w & weightsMap & Map.toList ] & Map.fromListWith (\x y -> Weights{weightsMap = Map.unionWith (error "dictOrder") (weightsMap x) (weightsMap y)}) & ChineseOrder dictOrderIO = readChineseDict <&> (dictToWeights >>> dictOrder) -- order100 :: IO [(DictKey, Maybe ChineseDictEntries)] order100 = do dict@(ChineseDict d) <- readChineseDict let ChineseOrder o = dict & dictToWeights & dictOrder return $ o & Map.take 100 & Map.elems & foldMap (weightsMap >>> Map.keys) <&> \k -> (k, d & Map.lookup k <&> \r -> (chinesePinyins r, chineseEnglish r)) dictOrderBySameWeight (ChineseOrder o) = [ ( Map.size (weightsMap v) , Map.singleton k v ) | (k, v) <- o & Map.toList ] & Map.fromListWith (<>) {- data UnfoldedGraph node lbl edge = UnfoldedGraph { unfoldedVertexToEdges :: !(Array.Array Vertex (lbl, Map edge IntSet)) , unfoldedNodeToVertex :: !(Map node Vertex) } deriving (Show, Generic) unfoldGMany :: forall node edge nodeLabel. Ord node => (node -> (nodeLabel, Map edge (Set node))) -> [node] -> UnfoldedGraph node nodeLabel edge unfoldGMany gen roots = ST.runST do nodeToVertexST :: ST.STRef st (Map node Vertex) <- ST.newSTRef Map.empty vertexToEdgesST :: ST.STRef st [(Vertex, (nodeLabel, Map edge IntSet))] <- ST.newSTRef [] lastVertexST <- ST.newSTRef firstVertex let newVertex = do v <- ST.readSTRef lastVertexST ST.writeSTRef lastVertexST (v + 1) return v let nodeToVertexDepthFirst :: node -> ST.ST st Vertex nodeToVertexDepthFirst src = nodeToVertexST & ST.readSTRef <&> Map.lookup src >>= \case -- DescriptionNote: `src` has already been seen, -- just return its unique `Vertex`. 
Just v -> return v -- DescriptionNote: `src` has never been seen yet, -- allocate a `newVertex` for it Nothing -> do dst <- newVertex ST.modifySTRef nodeToVertexST $ Map.insert src dst let (nodeLabel, edgeToDsts) = gen src edgeToDstsV :: Map edge IntSet <- edgeToDsts & Map.traverseWithKey \_edge dsts -> forM (dsts & Set.toList) nodeToVertexDepthFirst <&> IntSet.fromList let res = (dst, (nodeLabel, edgeToDstsV)) ST.modifySTRef vertexToEdgesST (res :) return dst forM_ roots nodeToVertexDepthFirst nodeToVertex <- ST.readSTRef nodeToVertexST vertexToEdges <- ST.readSTRef vertexToEdgesST lastId <- ST.readSTRef lastVertexST return UnfoldedGraph { unfoldedVertexToEdges = vertexToEdges & Array.array (firstVertex, lastId - 1) , unfoldedNodeToVertex = nodeToVertex } where firstVertex :: Vertex = 0 -} -- mostComp d = -- [ -- | d & reverseDecomp & Map.toList -- ] & Map.fromListWith (<>) data ChineseTone = ChineseTone1 | ChineseTone2 | ChineseTone3 | ChineseTone4 | ChineseTone5 deriving (Eq, Ord, Show, Enum) numberedPinyinToDiacriticPiniyn :: ShortText -> ShortText numberedPinyinToDiacriticPiniyn numPin = case pinTone of Nothing -> pinRoot Just tone -> pinRoot & ShortText.unpack & addUmlaut & setDiacritic & ShortText.pack where addUmlaut ('u' : ':' : cs) = 'ü' : addUmlaut cs addUmlaut ('v' : cs) = 'ü' : addUmlaut cs addUmlaut ('U' : ':' : cs) = 'Ü' : addUmlaut cs addUmlaut ('V' : cs) = 'Ü' : addUmlaut cs addUmlaut (c : cs) = c : addUmlaut cs addUmlaut [] = [] -- CorrectnessNote: a, e or the o in ou always get the marker setDiacritic (c@'a' : cs) = convert tone c : cs setDiacritic (c@'e' : cs) = convert tone c : cs setDiacritic (c@'o' : 'u' : cs) = convert tone c : 'u' : cs setDiacritic (c : cs) -- CorrectnessNote: if no a, e, or ou found, the tone mark goes on the last vowel | List.elem c vowels && all (`List.notElem` vowels) cs = convert tone c : cs | otherwise = c : setDiacritic cs setDiacritic [] = [] vowels = lowerVowels <> (Char.toUpper <$> lowerVowels) lowerVowels = ['a', 'e', 'i', 'o', 'u', 'ü'] & list where (pinRoot, pinTone) :: (ShortText, Maybe ChineseTone) = numPin & ShortText.spanEnd Char.isDigit & second \case "" -> Nothing ds -> ds & ShortText.unpack & read & (\x -> x - (1 :: Int)) & toEnum & Just convert :: ChineseTone -> Char -> Char convert t c = case t of ChineseTone1 -> case c of 'a' -> 'ā' 'e' -> 'ē' 'i' -> 'ī' 'o' -> 'ō' 'u' -> 'ū' 'ü' -> 'ǖ' 'A' -> 'Ā' 'E' -> 'Ē' 'I' -> 'Ī' 'O' -> 'Ō' 'U' -> 'Ū' 'Ü' -> 'Ǖ' _ -> undefined ChineseTone2 -> case c of 'a' -> 'á' 'e' -> 'é' 'i' -> 'í' 'o' -> 'ó' 'u' -> 'ú' 'ü' -> 'ǘ' 'A' -> 'Á' 'E' -> 'É' 'I' -> 'Í' 'O' -> 'Ó' 'U' -> 'Ú' 'Ü' -> 'Ǘ' _ -> undefined ChineseTone3 -> case c of 'a' -> 'ǎ' 'e' -> 'ě' 'i' -> 'ǐ' 'o' -> 'ǒ' 'u' -> 'ǔ' 'ü' -> 'ǚ' 'A' -> 'Ǎ' 'E' -> 'Ě' 'I' -> 'Ǐ' 'O' -> 'Ǒ' 'U' -> 'Ǔ' 'Ü' -> 'Ǚ' _ -> undefined ChineseTone4 -> case c of 'a' -> 'à' 'e' -> 'è' 'i' -> 'ì' 'o' -> 'ò' 'u' -> 'ù' 'ü' -> 'ǜ' 'A' -> 'À' 'E' -> 'È' 'I' -> 'Ì' 'O' -> 'Ò' 'U' -> 'Ù' 'Ü' -> 'Ǜ' _ -> undefined ChineseTone5 -> c data HskLevel = HskLevel301 | HskLevel302 | HskLevel303 | HskLevel304 | HskLevel305 | HskLevel306 deriving (Eq, Ord, Enum, Show) -- instance CSV.FromRecord ChineseDictEntries -- instance CSV.ToRecord ChineseDictEntries -- instance CSV.FromNamedRecord ChineseDictEntries -- instance CSV.ToNamedRecord ChineseDictEntries -- instance CSV.DefaultOrdered ChineseDictEntries feed :: (ByteString -> r) -> Sys.Handle -> Sys.IO r feed k csvFile = do Sys.hIsEOF csvFile >>= \case True -> return $ k "" False -> k <$> ByteString.hGetSome csvFile 4096 
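-- ExplanationNote: a minimal sketch (not used elsewhere in this module) that
-- applies the numbered-to-diacritic conversion to a whole space-separated
-- pinyin string; the helper name `diacriticizePinyins` is illustrative only,
-- e.g. diacriticizePinyins "ni3 hao3" == "nǐ hǎo".
diacriticizePinyins :: ShortText -> ShortText
diacriticizePinyins pinyins =
  pinyins
    & ShortText.split (== ' ')
    <&> numberedPinyinToDiacriticPiniyn
    & List.intersperse " "
    & mconcat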
readHSK :: HskLevel -> IO ChineseDict readHSK chineseHSK = do withDataFile ("data/langs/mandarin/HSK/hsk" <> show hskIndex Sys.<.> "csv") \fileHandle -> do loop fileHandle mempty $ CSV.Incremental.decodeWithP parser decodeOpts CSV.NoHeader where hskIndex = chineseHSK & fromEnum & (+ 1) decodeOpts = CSV.defaultDecodeOptions parser :: CSV.Record -> CSV.Parser (ShortText, ChineseDictEntries) parser v | length v == 3 = do chinese <- v .! 0 chinesePinyins <- v .! 1 <&> ShortText.split (== ' ') chineseEnglish <- v .! 2 <&> pure let chineseFrequency = Nothing let chineseComponents = Nothing let chineseStrokes = Nothing pure (chinese, ChineseDictEntries{..}) | otherwise = empty check = either (\x -> Sys.print x >> return mempty) \(chinese, e) -> return $ ChineseDict $ Map.singleton chinese e loop fileHandle !acc = \case CSV.Incremental.Fail _ errMsg -> do Sys.putStrLn errMsg Sys.exitFailure CSV.Incremental.Many rs k -> do ok <- rs & foldMapM check t <- feed k fileHandle loop fileHandle (acc <> ok) t CSV.Incremental.Done rs -> do ok <- rs & foldMapM check return (acc <> ok) -- | Sun, C.C., Hendrix, P., Ma, J. et al. -- Chinese lexical database (CLD). -- Behav Res 50, 2606–2629 (2018). -- https://doi.org/10.3758/s13428-018-1038-3 readChineseLexicalDatabase :: IO ChineseDict readChineseLexicalDatabase = do withDataFile "data/langs/mandarin/chineselexicaldatabase2.1.csv" \fileHandle -> do loop fileHandle mempty $ CSV.Incremental.decodeWithP parser decodeOpts CSV.HasHeader where decodeOpts = CSV.defaultDecodeOptions parser :: CSV.Record -> CSV.Parser (DictKey, ChineseDictEntries) parser v | length v == 269 = do key <- v .! 0 keyLength :: Int <- v .! 5 pinyin1 <- v .! 15 pinyin2 <- v .! 16 pinyin3 <- v .! 17 pinyin4 <- v .! 18 let chinesePinyins = [pinyin1, pinyin2, pinyin3, pinyin4] & List.take keyLength freqPerMillion :: Double <- v .! 
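-- ExplanationNote: column 39 of the CLD record is read as a per-million
-- frequency; dividing by 1000000 below rescales it into a Probability.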
39 let chineseFrequency = freqPerMillion & toRational & (/ 1000000) & probability & fromMaybe (error "readChineseLexicalDatabase: Frequency") & Just pure (key, mempty{chineseFrequency, chinesePinyins}) | otherwise = error $ "readChineseLexicalDatabase: wrong number of columns: " <> show v check = either (\x -> Sys.print x >> return mempty) \(key, val) -> return $ ChineseDict $ Map.singleton key val loop fileHandle !acc = \case CSV.Incremental.Fail _ errMsg -> do Sys.putStrLn errMsg Sys.exitFailure CSV.Incremental.Many rs k -> do ok <- rs & foldMapM check t <- feed k fileHandle loop fileHandle (acc <> ok) t CSV.Incremental.Done rs -> do ok <- rs & foldMapM check return (acc <> ok) readCEDICT :: IO ChineseDict readCEDICT = do withDataFile "data/langs/mandarin/CEDICT/cedict_ts.u8" \cedictHandle -> do let skipHeader = do isEOF <- cedictHandle & Sys.hIsEOF if isEOF then return () else do lineBS <- ByteString.hGetLine cedictHandle let lineST = lineBS & ShortText.fromByteString & fromMaybe (error "invalid UTF-8") let begin = lineST & ShortText.take 1 when (begin == "#") do skipHeader skipHeader let loop !acc = do isEOF <- cedictHandle & Sys.hIsEOF if isEOF then return acc else do line <- ByteString.hGetLine cedictHandle let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8") -- DescriptionNote: each line is formatted as: #(.+) (.+) \[(.+)] /(.*)/#' let skipChar c = void $ MT.state $ ByteString.span (== fromIntegral (fromEnum c)) let skipPrefix p = MT.modify' $ ByteString.stripPrefix p >>> fromMaybe (error $ "skipPrefix fail to match: " <> show p) let breakOnChar c = ByteString.break (== fromIntegral (fromEnum c)) let breakOnSpace = breakOnChar ' ' let skipSuffix p = MT.modify' $ ByteString.stripSuffix p >>> fromMaybe ( error $ "skipSuffix: mismatch: " <> show p <> "\n on line: " <> ShortText.unpack (decodeUtf8 line) <> "\n escaped: " <> show (ShortText.unpack (decodeUtf8 line)) ) let (dict, leftover) = (`MT.runState` line) do _chineseTrad <- MT.state $ breakOnSpace >>> first decodeUtf8 skipChar ' ' chineseSimpl <- MT.state $ breakOnSpace >>> first decodeUtf8 skipChar ' ' skipPrefix "[" chinesePinyins <- MT.state $ breakOnChar ']' >>> first ( \s -> s & ByteString.split (fromIntegral (fromEnum ' ')) -- CorrectnessWarning: convert pinyin to lowercase, and remove dashes <&> (decodeUtf8 >>> ShortText.unpack >>> (<&> Char.toLower) >>> ShortText.pack) & List.filter (/= "-") ) skipPrefix "] /" -- CorrectnessNote: some lines do not end with \r -- hence make it optional. 
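-- ExplanationNote: for an (illustrative) entry such as
--   "你好 你好 [ni3 hao3] /hello/hi/"
-- the remaining state here is "hello/hi/" (possibly followed by '\r');
-- the optional '\r' and the trailing '/' are stripped next, and the
-- glosses are split on '/' into ["hello", "hi"].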
MT.modify' \s -> s & ByteString.stripSuffix "\r" & fromMaybe s skipSuffix "/" chineseEnglish <- MT.gets \s -> s & ByteString.split (fromIntegral (fromEnum '/')) <&> decodeUtf8 MT.put mempty let chinese = chineseSimpl return $ ChineseDict $ Map.singleton chinese $ mempty { chinesePinyins , chineseEnglish } if not (ByteString.null leftover) then error $ "parserLine: leftover: " <> show leftover else loop (acc <> dict) loop mempty lookupPinyins :: ChineseDict -> ShortText -> [ShortText] lookupPinyins (ChineseDict dict) word = word & (`Map.lookup` dict) & fromMaybe (error $ "lookupPinyins: no entry for: {" <> wordString <> "}") & chinesePinyins & (\ps -> if null ps then error $ "lookupPinyins: empty entry for: " <> wordString else ps) where wordString = word & ShortText.unpack pronunciation :: ChineseDict -> [Pron.Lexeme] -> [[Either Char Pron.Pron]] pronunciation dict inp = inp & Pron.lexemesChars & Text.pack & Text.words <&> wordToPron where wordToPron :: Text -> [Either Char Pron.Pron] wordToPron wordText = [ pronun chars charCateg charBlock | (charCateg, uniBlockToChars) <- wordText & Text.unpack & List.map (\c -> (Char.generalCategory c, (Char.unicodeBlock c, c))) & Char.consecutiveGroups <&> (<&> Char.consecutiveGroups) , (charBlock, chars) <- uniBlockToChars ] & mconcat where pronun :: [Char] -> Char.GeneralCategory -> Maybe Char.UnicodeBlock -> [Either Char Pron.Pron] pronun chars Char.DecimalNumber _ = [ Right Pron.Pron { pronInput = [Pron.LexemeChar char] , pronRule = Pron.rule { Pron.rulePron = Pron.Pronunciations { Pron.unPronunciations = [ Pron.RuleLexemes [Pron.LexemeChar char] := Pron.Pronunciation { Pron.pronunciationIPABroad = [] , Pron.pronunciationText = txt char } ] } } } | char <- chars ] where txt c = c & ShortText.singleton & lookupPinyins dict <&> ( \p -> p & numberedPinyinToDiacriticPiniyn & ShortText.toText & Text.toLower ) & Text.intercalate " " pronun chars Char.OpenPunctuation _ = [ Left c | c <- chars ] pronun chars Char.ClosePunctuation _ = [ Left c | c <- chars ] pronun chars Char.OtherPunctuation _ = [ Left c | c <- chars ] pronun chars _ (Just Char.UnicodeBlockCJK{}) = [ Right Pron.Pron { pronInput = Pron.LexemeChar <$> chars , pronRule = Pron.rule { Pron.rulePron = Pron.Pronunciations { Pron.unPronunciations = [ Pron.RuleLexemes (Pron.LexemeChar <$> chars) := Pron.Pronunciation { Pron.pronunciationIPABroad = [] , Pron.pronunciationText = txt } ] } } } ] where txt = chars & ShortText.pack & lookupPinyins dict & List.map ( \pinyin -> pinyin & numberedPinyinToDiacriticPiniyn & ShortText.toText & Text.toLower ) & Text.intercalate " " -- if List.length wordPinyins == Text.length wordText = wordPinyins -- where -- wordPinyins = chars -- & ShortText.pack -- & lookupPinyins dict pronun chars categ block = errorShow ("pronunciation" :: Text, chars, categ, block) orderHTML dict@(ChineseDict d) = do dataPath <- Self.getDataDir <&> File.normalise return $ Blaze.renderMarkupBuilder do HTML.docTypeHtml do HTML.head do HTML.title $ ("Chinese Order" :: Text) & HTML.toHtml forM_ ( [ "styles/Paper.css" , "styles/Rosetta/Common.css" , "styles/Rosetta/Reading.css" ] & list ) \cssFile -> HTML.link ! HA.rel "stylesheet" ! HA.type_ "text/css" ! HA.href (dataPath cssFile & HTML.toValue) -- HTML.styleCSS $ pagesDifficulties & difficultyCSS HTML.body do let ChineseOrder o = dict & dictToWeights & dictOrder HTML.ol do forM_ ( o & Map.take 5000 & Map.elems & foldMap (weightsMap >>> Map.keys) ) \k -> HTML.li ! 
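-- ExplanationNote: "break-inside: avoid" keeps an entry from being split
-- across print columns or pages, and "list-style-position: inside" draws
-- the ordinal marker inside the item's own box.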
HTML.styles [ "break-inside" := "avoid" , "list-style-position" := "inside" ] $ do HTML.span $ k & HTML.toHtml forM_ (d & Map.lookup k) \v -> do HTML.ul ! HTML.styles [] $ do HTML.li do HTML.span $ v & chinesePinyins <&> ShortText.toText & Text.unwords & HTML.toHtml HTML.li do HTML.span $ v & chineseDecomp k & Text.pack & HTML.toHtml HTML.li do HTML.span $ v & chineseEnglish <&> ShortText.toText & Text.intercalate "; " & HTML.toHtml orderDOT :: ChineseDict -> IO BS.Builder orderDOT dict@(ChineseDict d) = do DOT.runDOT do -- dotComments [(BS.lazyByteString $ LazyText.encodeUtf8 $ pShow $ rangeToMstToGroupToClusters & Map.map Map.keys)] -- pTraceShow ("num of nodes", Map.size nodeToBranch, "num of branches", Map.size msf) $ DOT.dotLine "digraph g" {- dotBlock do dotLine "splines=\"ortho\"" indexFrom1M_ (rangeToMstToGroupToClusters & Map.toList) \(srcR, mstToGroupToClusters) srcRI -> do let srcRB = "r" <> BS.intDec srcRI dotLine $ "subgraph cluster_" <> srcRB dotBlock do dotComments ["Create a node for the range " <> srcRB] dotNode srcRB [ ("shape", "box") , ("label", builderQuotedString (showHuman srcR)) , ("color", "gray") , ("style", "filled") , ("fillcolor", "gray") ] dotLine "color=gray" dotBlock do dotLine "rank=same" dotComments ["Create the cluster nodes within the range " <> srcRB] forM_ (mstToGroupToClusters & Map.toList) \(mstI, groupToClusters) -> do forM_ (groupToClusters & Map.toList) \(srcGroup, srcClusters) -> do dotNodeCluster srcRI mstI srcGroup [ ( "label" , builderQuotedString $ (srcClusters & toList <&> showHuman & List.unlines) <> "\nT" <> printf "%03d" mstI <> "\nS" <> show scaleI -- <> {-maybe ""-} (("\n" <>) . showSimilarity) minSimil ) , ("style", "filled") , -- , minSimil & {-maybe ("", "")-} (\s -> ("fillcolor", 1 + ((floor (runProbability s * 10)) `mod` 10) & BS.intDec)) ("colorscheme", "ylorrd9") , ("shape", "box") ] dotComments ["Horizontally align the cluster nodes within the same range"] let row = [ (mstI, group) | (mstI, groupToClusters) <- mstToGroupToClusters & Map.toList , (group, _clusters) <- groupToClusters & Map.toList ] case row of [] -> return () c@(firstMst, firstGroup) : cs -> do dotEdges [srcRB, srcRB <> "t" <> BS.intDec firstMst <> "c" <> BS.intDec firstGroup] [ ("style", "invis") ] cs & (`foldM_` c) \(srcMst, srcGroup) dst@(dstMst, dstGroup) -> do dotEdgesCluster [(srcRI, srcMst, srcGroup), (srcRI, dstMst, dstGroup)] [ ("weight", "10") , ("style", "invis") ] return dst indexFrom1M_ sortedMSF \mst mstI -> do dotComments ["Create the edges of the MST " <> BS.intDec mstI] -- pTraceShowM (mstI, List.length (Tree.flatten mst)) let loop (Tree.Node MSTNode{mstNodeRangeCluster = src} dsts) = do forM_ dsts \dstNode@(Tree.Node MSTNode{mstNodeRangeCluster = dst, mstNodeSimilarity = simil} _) -> do -- let similB = BS.stringUtf8 $ showFFloat (Just 2) (simil & runProbability & fromRational @Double) "" let indexRangeCluster (r, c) = let clusterToGroup :: cluster :-> ClusterGroup = Map.fromList [ (cluster, group) | (group, clusters) <- rangeToMstToGroupToClusters & Map.lookup r & fromMaybe Map.empty & Map.lookup mstI & fromMaybe Map.empty & Map.toList , cluster <- clusters & Set.toList ] in ( 1 + Map.findIndex r rangeToMstToGroupToClusters , mstI , Map.lookup c clusterToGroup & fromMaybe (error (LazyText.unpack (pShow ("r", r, "c", c {-, "clusterToGroup", clusterToGroup, "rangeToMstToGroupToClusters", rangeToMstToGroupToClusters-})))) ) dotEdgesCluster [ indexRangeCluster src , indexRangeCluster dst ] [ ("constraint", "false") , ("color", (floor (runProbability 
simil * 10)) `mod` 10 & BS.intDec) , ("colorscheme", "ylorrd9") , -- , ("label", similB) ("fontcolor", "blue") , ("dir", "both") , ("arrowhead", "dot") , ("arrowtail", "dot") ] loop dstNode loop mst dotRanges rangeToMstToGroupToClusters -}
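-- ExplanationNote: a minimal end-to-end sketch (assuming the data files are
-- available at runtime): load the merged dictionary, look one word up, and
-- print its diacritic reading; the helper name `printReading` is illustrative
-- and not used elsewhere in this module.
printReading :: ShortText -> IO ()
printReading word = do
  dict <- readChineseDict
  let reading =
        lookupPinyins dict word
          <&> numberedPinyinToDiacriticPiniyn
          & List.intersperse " "
          & mconcat
  Sys.putStrLn (ShortText.unpack word <> "  " <> ShortText.unpack reading)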