3 import Data.List qualified as List
4 import Data.Set qualified as Set
5 import Data.Text qualified as Text
6 import Data.Text.Short qualified as ShortText
7 import Worksheets.Utils.Char qualified as Char
8 import Worksheets.Utils.IPA qualified as IPA
9 import Worksheets.Utils.Prelude
10 import Prelude (error)
16 | LangueMandarinPinyin
18 deriving (Eq, Ord, Show)
19 instance HasTypeDefault Langue where
20 typeDefault = LangueAnglais
26 , LangueMandarinPinyin
31 data CharMeta = CharMeta
32 { charMetaChar :: Char
33 , charMetaUnicodeCategory :: Char.GeneralCategory
34 , charMetaUnicodeBlock :: Maybe Char.UnicodeBlock
36 deriving (Eq, Ord, Show)
38 metanizer :: Text -> [CharMeta]
40 t & Text.unpack <&> \c ->
43 , charMetaUnicodeCategory = c & Char.generalCategory
44 , charMetaUnicodeBlock = c & Char.unicodeBlock
48 { tokenText :: ShortText -- Char
49 , tokenMeta :: (Char.GeneralCategory, Maybe Char.UnicodeBlock)
51 deriving (Eq, Ord, Show)
53 tokenizer :: Text -> [Token]
55 t & Text.unpack <&> \c ->
57 { tokenText = c & ShortText.singleton
59 ( c & Char.generalCategory
60 , c & Char.unicodeBlock
64 rosettaTokenizer :: ShortText -> [Token]
65 rosettaTokenizer s = s & ShortText.unpack & group
68 group (inpHead : inpTail) = tok : group rest
72 { tokenText = inpHead : txt & ShortText.pack
76 ( inpHead & Char.generalCategory
77 , inpHead & Char.unicodeBlock
80 inpTail & List.span \c ->
81 (Char.generalCategory c, Char.unicodeBlock c) == tokenMeta
83 groupByHoriz :: [Token] -> [[Token]]
87 group (inpHead : inpTail) =
89 Token{tokenMeta = (Char.Space, _)} -> group rest
91 (_skipSpaces, rest) = inpTail & List.span onSep
92 tok -> (tok : nonSeps) : group rest
94 (nonSeps, rest) = inpTail & List.break onSep
97 Token{tokenText, tokenMeta = (Char.Space, _)}
98 | tokenText & ShortText.unpack & all (== '\xA0') -> False
101 splitWords :: [Token] -> [[Token]]
104 group :: [Token] -> [[Token]]
106 group (inpHead : inpTail) =
108 Token{tokenText = ShortText.unpack >>> all (== '\xA0') -> True, tokenMeta = (Char.Space, _)} -> group rest
110 (_skipSpaces, rest) = inpTail & List.span onSep
111 tok -> (tok : nonSeps) : group rest
113 (nonSeps, rest) = inpTail & List.break onSep
116 Token{tokenText = ShortText.unpack >>> all (== '\xA0') -> True, tokenMeta = (Char.Space, _)} -> True
-- | CorrectnessNote: beware that the 'tokenMeta' is merely preserved from the
-- input token: it does not correspond to the Unicode code points of the pronunciation.
122 chinesePronunciation :: ChineseDict -> [Token] -> [Token]
123 chinesePronunciation chineseDict toks =
124 toks & List.concatMap \tok ->
125 let tokText = tok & tokenText
126 in let tokString = tokText & ShortText.unpack
127 in case tok & tokenMeta of
128 (_, Just Char.UnicodeBlockCJK{}) -> pinyins <&> \tokenText -> tok{tokenText}
130 pinyins :: [ShortText]
132 | tokString & all Char.isNumber =
133 tokString & List.concatMap \char ->
134 char & ShortText.singleton & lookupPinyins chineseDict
135 | List.length tokTextPins == ShortText.length tokText = tokTextPins
136 | otherwise = error "chinesePronunciation: pinyins length mismatch"
137 tokTextPins = tokText & lookupPinyins chineseDict
138 (_, _) -> tokString <&> \_c -> tok{tokenText = ""}
-- | Explode every token into one token per character.
--
-- Each character of a token's 'tokenText' becomes its own 'Token' whose
-- 'tokenText' is that single character. Every other field is copied unchanged
-- from the token the character came from, so the output keeps the input order
-- and a token with empty text contributes nothing.
rosettaWordChars :: [Token] -> [Token]
rosettaWordChars toks =
  [ tok{tokenText = ShortText.singleton char}
  | tok <- toks
  , char <- ShortText.unpack (tokenText tok)
  ]