]> Git — Sourcephile - julm/worksheets.git/blob - src/Language.hs
update
[julm/worksheets.git] / src / Language.hs
1 module Language where
2
3 import Data.List qualified as List
4 import Data.Set qualified as Set
5 import Data.Text qualified as Text
6 import Data.Text.Short qualified as ShortText
7 import Worksheets.Utils.Char qualified as Char
8 import Worksheets.Utils.IPA qualified as IPA
9 import Worksheets.Utils.Prelude
10 import Prelude (error)
11
12 data Langue
13 = LangueFrançais
14 | LangueAnglais
15 | LangueMandarin
16 | LangueMandarinPinyin
17 | LanguePhonetic
18 deriving (Eq, Ord, Show)
19 instance HasTypeDefault Langue where
20 typeDefault = LangueAnglais
21 langues :: Set Langue
22 langues =
23 [ LangueFrançais
24 , LangueAnglais
25 , LangueMandarin
26 , LangueMandarinPinyin
27 , LanguePhonetic
28 ]
29 & Set.fromList
30
31 data CharMeta = CharMeta
32 { charMetaChar :: Char
33 , charMetaUnicodeCategory :: Char.GeneralCategory
34 , charMetaUnicodeBlock :: Maybe Char.UnicodeBlock
35 }
36 deriving (Eq, Ord, Show)
37
38 metanizer :: Text -> [CharMeta]
39 metanizer t =
40 t & Text.unpack <&> \c ->
41 CharMeta
42 { charMetaChar = c
43 , charMetaUnicodeCategory = c & Char.generalCategory
44 , charMetaUnicodeBlock = c & Char.unicodeBlock
45 }
46
47 data Token = Token
48 { tokenText :: ShortText -- Char
49 , tokenMeta :: (Char.GeneralCategory, Maybe Char.UnicodeBlock)
50 }
51 deriving (Eq, Ord, Show)
52
53 tokenizer :: Text -> [Token]
54 tokenizer t =
55 t & Text.unpack <&> \c ->
56 Token
57 { tokenText = c & ShortText.singleton
58 , tokenMeta =
59 ( c & Char.generalCategory
60 , c & Char.unicodeBlock
61 )
62 }
63
64 rosettaTokenizer :: ShortText -> [Token]
65 rosettaTokenizer s = s & ShortText.unpack & group
66 where
67 group [] = []
68 group (inpHead : inpTail) = tok : group rest
69 where
70 tok =
71 Token
72 { tokenText = inpHead : txt & ShortText.pack
73 , tokenMeta
74 }
75 tokenMeta =
76 ( inpHead & Char.generalCategory
77 , inpHead & Char.unicodeBlock
78 )
79 (txt, rest) =
80 inpTail & List.span \c ->
81 (Char.generalCategory c, Char.unicodeBlock c) == tokenMeta
82
83 groupByHoriz :: [Token] -> [[Token]]
84 groupByHoriz = group
85 where
86 group [] = []
87 group (inpHead : inpTail) =
88 case inpHead of
89 Token{tokenMeta = (Char.Space, _)} -> group rest
90 where
91 (_skipSpaces, rest) = inpTail & List.span onSep
92 tok -> (tok : nonSeps) : group rest
93 where
94 (nonSeps, rest) = inpTail & List.break onSep
95 where
96 onSep = \case
97 Token{tokenText, tokenMeta = (Char.Space, _)}
98 | tokenText & ShortText.unpack & all (== '\xA0') -> False
99 | otherwise -> True
100 _ -> False
101 splitWords :: [Token] -> [[Token]]
102 splitWords = group
103 where
104 group :: [Token] -> [[Token]]
105 group [] = []
106 group (inpHead : inpTail) =
107 case inpHead of
108 Token{tokenText = ShortText.unpack >>> all (== '\xA0') -> True, tokenMeta = (Char.Space, _)} -> group rest
109 where
110 (_skipSpaces, rest) = inpTail & List.span onSep
111 tok -> (tok : nonSeps) : group rest
112 where
113 (nonSeps, rest) = inpTail & List.break onSep
114 where
115 onSep = \case
116 Token{tokenText = ShortText.unpack >>> all (== '\xA0') -> True, tokenMeta = (Char.Space, _)} -> True
117 _ -> False
118
119 {-
120 -- | CorrectnessNote: beware than the tokenMeta is just preserved,
121 -- it does not correspond to the pronunciation unicode code points.
122 chinesePronunciation :: ChineseDict -> [Token] -> [Token]
123 chinesePronunciation chineseDict toks =
124 toks & List.concatMap \tok ->
125 let tokText = tok & tokenText
126 in let tokString = tokText & ShortText.unpack
127 in case tok & tokenMeta of
128 (_, Just Char.UnicodeBlockCJK{}) -> pinyins <&> \tokenText -> tok{tokenText}
129 where
130 pinyins :: [ShortText]
131 pinyins
132 | tokString & all Char.isNumber =
133 tokString & List.concatMap \char ->
134 char & ShortText.singleton & lookupPinyins chineseDict
135 | List.length tokTextPins == ShortText.length tokText = tokTextPins
136 | otherwise = error "chinesePronunciation: pinyins length mismatch"
137 tokTextPins = tokText & lookupPinyins chineseDict
138 (_, _) -> tokString <&> \_c -> tok{tokenText = ""}
139 -}
140 rosettaWordChars :: [Token] -> [Token]
141 rosettaWordChars toks =
142 toks & List.concatMap \tok ->
143 let tokText = tok & tokenText
144 in let tokString = tokText & ShortText.unpack
145 in tokString <&> \char ->
146 tok{tokenText = char & ShortText.singleton}