src/Language.hs

   1 module Language where
   2
   3 import Data.List qualified as List
   4 import Data.Set qualified as Set
   5 import Data.Text qualified as Text
   6 import Data.Text.Short qualified as ShortText
   7 import Worksheets.Utils.Char qualified as Char
   8 import Worksheets.Utils.IPA qualified as IPA
   9 import Worksheets.Utils.Prelude
  10 import Prelude (error)
  11
  12 data Langue
  13   = LangueFrançais
  14   | LangueAnglais
  15   | LangueMandarin
  16   | LangueMandarinPinyin
  17   | LanguePhonetic
  18   deriving (Eq, Ord, Show)
  19 instance HasTypeDefault Langue where
  20   typeDefault = LangueAnglais
  21 langues :: Set Langue
  22 langues =
  23   [ LangueFrançais
  24   , LangueAnglais
  25   , LangueMandarin
  26   , LangueMandarinPinyin
  27   , LanguePhonetic
  28   ]
  29     & Set.fromList
  30
  31 data CharMeta = CharMeta
  32   { charMetaChar :: Char
  33   , charMetaUnicodeCategory :: Char.GeneralCategory
  34   , charMetaUnicodeBlock :: Maybe Char.UnicodeBlock
  35   }
  36   deriving (Eq, Ord, Show)
  37
  38 metanizer :: Text -> [CharMeta]
  39 metanizer t =
  40   t & Text.unpack <&> \c ->
  41     CharMeta
  42       { charMetaChar = c
  43       , charMetaUnicodeCategory = c & Char.generalCategory
  44       , charMetaUnicodeBlock = c & Char.unicodeBlock
  45       }
  46
  47 data Token = Token
  48   { tokenText :: ShortText -- Char
  49   , tokenMeta :: (Char.GeneralCategory, Maybe Char.UnicodeBlock)
  50   }
  51   deriving (Eq, Ord, Show)
  52
  53 tokenizer :: Text -> [Token]
  54 tokenizer t =
  55   t & Text.unpack <&> \c ->
  56     Token
  57       { tokenText = c & ShortText.singleton
  58       , tokenMeta =
  59           ( c & Char.generalCategory
  60           , c & Char.unicodeBlock
  61           )
  62       }
  63
  64 rosettaTokenizer :: ShortText -> [Token]
  65 rosettaTokenizer s = s & ShortText.unpack & group
  66   where
  67     group [] = []
  68     group (inpHead : inpTail) = tok : group rest
  69       where
  70         tok =
  71           Token
  72             { tokenText = inpHead : txt & ShortText.pack
  73             , tokenMeta
  74             }
  75         tokenMeta =
  76           ( inpHead & Char.generalCategory
  77           , inpHead & Char.unicodeBlock
  78           )
  79         (txt, rest) =
  80           inpTail & List.span \c ->
  81             (Char.generalCategory c, Char.unicodeBlock c) == tokenMeta
  82
  83 groupByHoriz :: [Token] -> [[Token]]
  84 groupByHoriz = group
  85   where
  86     group [] = []
  87     group (inpHead : inpTail) =
  88       case inpHead of
  89         Token{tokenMeta = (Char.Space, _)} -> group rest
  90           where
  91             (_skipSpaces, rest) = inpTail & List.span onSep
  92         tok -> (tok : nonSeps) : group rest
  93           where
  94             (nonSeps, rest) = inpTail & List.break onSep
  95       where
  96         onSep = \case
  97           Token{tokenText, tokenMeta = (Char.Space, _)}
  98             | tokenText & ShortText.unpack & all (== '\xA0') -> False
  99             | otherwise -> True
 100           _ -> False
 101 splitWords :: [Token] -> [[Token]]
 102 splitWords = group
 103   where
 104     group :: [Token] -> [[Token]]
 105     group [] = []
 106     group (inpHead : inpTail) =
 107       case inpHead of
 108         Token{tokenText = ShortText.unpack >>> all (== '\xA0') -> True, tokenMeta = (Char.Space, _)} -> group rest
 109           where
 110             (_skipSpaces, rest) = inpTail & List.span onSep
 111         tok -> (tok : nonSeps) : group rest
 112           where
 113             (nonSeps, rest) = inpTail & List.break onSep
 114       where
 115         onSep = \case
 116           Token{tokenText = ShortText.unpack >>> all (== '\xA0') -> True, tokenMeta = (Char.Space, _)} -> True
 117           _ -> False
 118
 119 {-
 120 -- | CorrectnessNote: beware than the tokenMeta is just preserved,
 121 -- it does not correspond to the pronunciation unicode code points.
 122 chinesePronunciation :: ChineseDict -> [Token] -> [Token]
 123 chinesePronunciation chineseDict toks =
 124   toks & List.concatMap \tok ->
 125     let tokText = tok & tokenText
 126     in let tokString = tokText & ShortText.unpack
 127        in case tok & tokenMeta of
 128             (_, Just Char.UnicodeBlockCJK{}) -> pinyins <&> \tokenText -> tok{tokenText}
 129               where
 130                 pinyins :: [ShortText]
 131                 pinyins
 132                   | tokString & all Char.isNumber =
 133                       tokString & List.concatMap \char ->
 134                         char & ShortText.singleton & lookupPinyins chineseDict
 135                   | List.length tokTextPins == ShortText.length tokText = tokTextPins
 136                   | otherwise = error "chinesePronunciation: pinyins length mismatch"
 137                 tokTextPins = tokText & lookupPinyins chineseDict
 138             (_, _) -> tokString <&> \_c -> tok{tokenText = ""}
 139 -}
 140 rosettaWordChars :: [Token] -> [Token]
 141 rosettaWordChars toks =
 142   toks & List.concatMap \tok ->
 143     let tokText = tok & tokenText
 144     in let tokString = tokText & ShortText.unpack
 145        in tokString <&> \char ->
 146             tok{tokenText = char & ShortText.singleton}