]> Git — Sourcephile - julm/worksheets.git/blob - src/Worksheets/Text.hs
wip
[julm/worksheets.git] / src / Worksheets / Text.hs
1 module Worksheets.Text where
2
3 import Control.Monad (when)
4 import Data.ByteString.Builder (Builder)
5 import Data.Char qualified as Char
6 import Data.Foldable (foldMap)
7 import Data.List qualified as List
8 import Data.Map.Strict qualified as Map
9 import Data.Maybe (Maybe (..))
10 import Data.Text (Text)
11 import Data.Text qualified as Text
12 import Paths_worksheets qualified as Self
13 import System.FilePath.Posix ((</>))
14 import System.FilePath.Posix qualified as File
15 import Text.Blaze
16 import Text.Blaze.Html5 qualified as H
17 import Text.Blaze.Html5.Attributes qualified as HA
18 import Text.Blaze.Renderer.Utf8 qualified as Blaze
19 import Text.Show (Show (..))
20 import Prelude
21
22 type Texte = [TexteAtome]
23 data TexteAtome = TexteAtome
24 { texteAtomeLangue :: Langue
25 , texteAtomeGlyphs :: Text
26 , texteAtomePhonetic :: Maybe Text
27 }
28 deriving (Eq, Ord, Show)
29
30 data Langue
31 = LangueFrançais
32 | LangueAnglais
33 | LangueMandarin
34 | LangueMandarinPinyin
35 | LanguePhonetic
36 deriving (Eq, Ord, Show)
37
38 splitOnWords :: Text -> [Text]
39 splitOnWords = splitOnChar splitterOnWords
40
41 splitOnChar :: (Char -> Bool) -> Text -> [Text]
42 splitOnChar f = go
43 where
44 go t =
45 case f `Text.span` t of
46 (l, r)
47 | Text.null l && Text.null r -> []
48 | Text.null l -> Text.singleton (Text.head t) : go (Text.tail t)
49 | Text.null r -> [l]
50 | otherwise -> l : go r
51
52 splitterOnWords :: Char -> Bool
53 splitterOnWords c = case Char.generalCategory c of
54 Char.UppercaseLetter -> False
55 Char.LowercaseLetter -> False
56 Char.TitlecaseLetter -> False
57 Char.ModifierLetter -> False
58 Char.OtherLetter -> False
59 Char.NonSpacingMark -> True
60 Char.SpacingCombiningMark -> True
61 Char.EnclosingMark -> True
62 Char.DecimalNumber -> False
63 Char.LetterNumber -> False
64 Char.OtherNumber -> False
65 Char.ConnectorPunctuation -> True
66 Char.DashPunctuation -> True
67 Char.OpenPunctuation -> True
68 Char.ClosePunctuation -> True
69 Char.InitialQuote -> True
70 Char.FinalQuote -> True
71 Char.OtherPunctuation -> True
72 Char.MathSymbol -> True
73 Char.CurrencySymbol -> True
74 Char.ModifierSymbol -> True
75 Char.OtherSymbol -> True
76 Char.Space -> True
77 Char.LineSeparator -> True
78 Char.ParagraphSeparator -> True
79 Char.Control -> True
80 Char.Format -> True
81 Char.Surrogate -> True
82 Char.PrivateUse -> True
83 Char.NotAssigned -> True
84
85 data CharClass
86 = CharClassLetter
87 | CharClassMark
88 | CharClassNumber
89 | CharClassPunctuation
90 | CharClassQuote
91 | CharClassSymbol
92 | CharClassSeparator
93 | CharClassOther
94 deriving (Eq, Ord, Show)
95
96 charClassifiers =
97 [ (CharClassLetter, isLetter)
98 , (CharClassMark, isMark)
99 , (CharClassNumber, isNumber)
100 , (CharClassPunctuation, isPunctuation)
101 , (CharClassQuote, isQuote)
102 , (CharClassSymbol, isSymbol)
103 , (CharClassSeparator, isSeparator)
104 , (CharClassOther, isOther)
105 ]
106 where
107 isLetter Char.UppercaseLetter = True
108 isLetter Char.LowercaseLetter = True
109 isLetter Char.TitlecaseLetter = True
110 isLetter Char.ModifierLetter = True
111 isLetter Char.OtherLetter = True
112 isLetter _ = False
113 isMark Char.NonSpacingMark = True
114 isMark Char.SpacingCombiningMark = True
115 isMark Char.EnclosingMark = True
116 isMark _ = False
117 isNumber Char.DecimalNumber = True
118 isNumber Char.LetterNumber = True
119 isNumber Char.OtherNumber = True
120 isNumber _ = False
121 isPunctuation Char.ConnectorPunctuation = True
122 isPunctuation Char.DashPunctuation = True
123 isPunctuation Char.OpenPunctuation = True
124 isPunctuation Char.ClosePunctuation = True
125 isPunctuation Char.OtherPunctuation = True
126 isPunctuation _ = False
127 isQuote Char.InitialQuote = True
128 isQuote Char.FinalQuote = True
129 isQuote _ = False
130 isSymbol Char.MathSymbol = True
131 isSymbol Char.CurrencySymbol = True
132 isSymbol Char.ModifierSymbol = True
133 isSymbol Char.OtherSymbol = True
134 isSymbol _ = False
135 isSeparator Char.Space = True
136 isSeparator Char.LineSeparator = True
137 isSeparator Char.ParagraphSeparator = True
138 isSeparator _ = False
139 isOther Char.Control = True
140 isOther Char.Format = True
141 isOther Char.Surrogate = True
142 isOther Char.PrivateUse = True
143 isOther Char.NotAssigned = True
144 isOther _ = False
145
146 classifyText = go charClassifiers
147 where
148 go _ t | Text.null t = []
149 go [] t = (CharClassOther, Text.singleton (Text.head t)) : go charClassifiers (Text.tail t)
150 go ((clsD, clsF) : fs) t =
151 case Text.span (clsF . Char.generalCategory) t of
152 (l, r)
153 | Text.null l -> go fs t
154 | otherwise -> (clsD, l) : go charClassifiers r
155
156 isClassUnderscore (CharClassPunctuation, Text.all (== '_') -> True) = True
157 isClassUnderscore _ = False
158 isClassSeparator = \case
159 (CharClassSeparator, _) -> True
160 _ -> False
161 splitOnUnderscore = splitCharClass isClassUnderscore
162 splitOnSeparator = splitCharClass isClassSeparator
163
164 splitCharClass :: ((CharClass, Text) -> Bool) -> [(CharClass, Text)] -> [[(CharClass, Text)]]
165 splitCharClass isSep = go
166 where
167 go [] = []
168 go ts =
169 case ts & List.span isSep of
170 ([], _r) ->
171 case ts & List.break isSep of
172 (l, r) -> l : go r
173 (_l, r) -> go r
174
175 zipTexte :: Texte -> Texte -> [(TexteAtome, TexteAtome)]
176 zipTexte = List.zip
177
178 -- m0 = "开心果 冰淇淋 很 好吃"
179 -- p0 = "kāi_xīn_guǒ bīng_qí_lín hěn hào_chī"
180
181 splitMandarin t =
182 t
183 & classifyText
184 & splitCharClass isClassSeparator
185 <&> foldMap \(_cls, c) -> c & Text.unpack
186 splitPinyin t =
187 t
188 & classifyText
189 & splitCharClass isClassSeparator
190 <&> foldMap \ct@(_cls, c) -> [c | not (isClassUnderscore ct)]
191
192 -- z0 = List.zipWith (List.zip) m0 p0