]> Git — Sourcephile - julm/worksheets.git/blob - src/Worksheets/Writing/Rosetta.hs
add: Rosetta
[julm/worksheets.git] / src / Worksheets / Writing / Rosetta.hs
1 {-# LANGUAGE BangPatterns #-}
2 {-# LANGUAGE DeriveGeneric #-}
3 {-# LANGUAGE PackageImports #-}
4 {-# OPTIONS_GHC -Wno-unused-imports #-}
5
6 module Worksheets.Writing.Rosetta where
7
8 import Control.Applicative (Alternative (..), Applicative (..))
9 import Control.Monad (when)
10 import Control.Monad.Trans.State qualified as MT
11 import Data.ByteString (ByteString)
12 import Data.ByteString qualified as ByteString
13 import Data.ByteString.Builder (Builder)
14 import Data.ByteString.Short qualified as ShortByteString
15 import Data.Char qualified as Char
16 import Data.Csv ((.!))
17 import Data.Csv qualified as CSV
18 import Data.Csv.Incremental qualified as CSV.Incremental
19 import Data.Foldable (Foldable (..))
20 import Data.Int (Int)
21 import Data.List qualified as List
22 import Data.Map.Strict qualified as Map
23 import Data.Maybe (Maybe (..))
24 import Data.Monoid (Ap (..), Monoid (..))
25 import Data.Text (Text)
26 import Data.Text qualified as Text
27 import Data.Text.Encoding qualified as Text
28 import Data.Text.Lazy qualified as Text.Lazy
29 import Data.Text.Short (ShortText)
30 import Data.Text.Short qualified as ShortText
31 import Debug.Pretty.Simple (pTraceShow, pTraceShowId, pTraceShowM)
32 import GHC.Generics (Generic)
33 import Paths_worksheets qualified as Self
34 import System.Exit qualified as Sys
35 import System.FilePath qualified as Sys
36 import System.FilePath.Posix ((</>))
37 import System.FilePath.Posix qualified as File
38 import System.IO qualified as Sys
39 import Text.Blaze
40 import Text.Blaze.Html5 qualified as H
41 import Text.Blaze.Html5.Attributes qualified as HA
42 import Text.Blaze.Renderer.Utf8 qualified as Blaze
43 import Text.Show (Show (..))
44 import Utils.Blaze
45 import Prelude
46 import "base" Prelude (error, fromIntegral)
47
48 newtype ChineseDict = ChineseDict (Map ShortText [ChineseDictEntry])
49 deriving (Show)
50 instance Semigroup ChineseDict where
51 ChineseDict x <> ChineseDict y =
52 ChineseDict (Map.unionWithKey merge x y)
53 where
54 merge _k !xV !yV = xV <> yV
55
56 -- xV
57 -- & traceString (List.unlines $
58 -- [ "Semigroup ChineseDict: key collision: " <> ShortText.unpack k <> " ("<>show k<>")"
59 -- , xV & pShowNoColor & Text.Lazy.unpack
60 -- , yV & pShowNoColor & Text.Lazy.unpack
61 -- ])
62 instance Monoid ChineseDict where
63 mempty = ChineseDict mempty
64 data ChineseDictEntry = ChineseDictEntry
65 { chinese :: !ShortText
66 , pinyins :: ![ShortText]
67 , english :: ![ShortText]
68 , hskLevel :: !(Maybe HskLevel)
69 }
70 deriving (Generic, Show)
71
72 data HskLevel
73 = HskLevel301
74 | HskLevel302
75 | HskLevel303
76 | HskLevel304
77 | HskLevel305
78 | HskLevel306
79 deriving (Eq, Ord, Enum, Show)
80
81 -- instance CSV.FromRecord ChineseDictEntry
82 -- instance CSV.ToRecord ChineseDictEntry
83 -- instance CSV.FromNamedRecord ChineseDictEntry
84 -- instance CSV.ToNamedRecord ChineseDictEntry
85 -- instance CSV.DefaultOrdered ChineseDictEntry
86
87 feed :: (ByteString -> r) -> Sys.Handle -> Sys.IO r
88 feed k csvFile = do
89 Sys.hIsEOF csvFile >>= \case
90 True -> return $ k ""
91 False -> k <$> ByteString.hGetSome csvFile 4096
92
93 readHSK :: HskLevel -> IO ChineseDict
94 readHSK hskLevel = do
95 Sys.withFile ("data/langs/mandarin/hsk_csv/hsk" <> show hskIndex Sys.<.> "csv") Sys.ReadMode \hskHandle -> do
96 loop hskHandle mempty $
97 CSV.Incremental.decodeWithP parser decodeOpts CSV.NoHeader
98 where
99 hskIndex = hskLevel & fromEnum & (+ 1)
100 decodeOpts = CSV.defaultDecodeOptions
101 parser :: CSV.Record -> CSV.Parser ChineseDictEntry
102 parser v
103 | length v == 3 = do
104 chinese <- v .! 0
105 pinyins <- v .! 1 <&> pure
106 english <- v .! 2 <&> pure
107 pure ChineseDictEntry{hskLevel = Just hskLevel, ..}
108 | otherwise = empty
109 check =
110 either (\x -> Sys.print x >> return mempty) \e@ChineseDictEntry{chinese} ->
111 return $ ChineseDict $ Map.singleton chinese [e]
112 loop hskHandle !acc = \case
113 CSV.Incremental.Fail _ errMsg -> do
114 Sys.putStrLn errMsg
115 Sys.exitFailure
116 CSV.Incremental.Many rs k -> do
117 ok <- rs & foldMapM check
118 t <- feed k hskHandle
119 loop hskHandle (acc <> ok) t
120 CSV.Incremental.Done rs -> do
121 ok <- rs & foldMapM check
122 return (acc <> ok)
123
124 readCEDICT :: IO ChineseDict
125 readCEDICT = do
126 Sys.withFile ("data/langs/mandarin/cedict/cedict_ts.u8") Sys.ReadMode \cedictHandle -> do
127 let skipHeader = do
128 isEOF <- cedictHandle & Sys.hIsEOF
129 if isEOF
130 then return ()
131 else do
132 lineBS <- ByteString.hGetLine cedictHandle
133 let lineST = lineBS & ShortText.fromByteString & fromMaybe (error "invalid UTF-8")
134 let begin = lineST & ShortText.take 1
135 when (begin == "#") do
136 skipHeader
137 skipHeader
138 let loop !acc = do
139 isEOF <- cedictHandle & Sys.hIsEOF
140 if isEOF
141 then return acc
142 else do
143 line <- ByteString.hGetLine cedictHandle
144 let decodeUtf8 = ShortText.fromByteString >>> fromMaybe (error "invalid UTF-8")
145 -- DescriptionNote: each line is formatted as: #(.+) (.+) \[(.+)] /(.*)/#'
146 let skipChar c = void $ MT.state $ ByteString.span (== fromIntegral (fromEnum c))
147 let skipPrefix p =
148 MT.modify' $
149 ByteString.stripPrefix p
150 >>> fromMaybe (error $ "skipPrefix fail to match: " <> show p)
151 let breakOnChar c = ByteString.break (== fromIntegral (fromEnum c))
152 let breakOnSpace = breakOnChar ' '
153 let skipSuffix p =
154 MT.modify' $
155 ByteString.stripSuffix p
156 >>> fromMaybe
157 ( error $
158 "skipSuffix: mismatch: "
159 <> show p
160 <> "\n on line: "
161 <> ShortText.unpack (decodeUtf8 line)
162 <> "\n escaped: "
163 <> show (ShortText.unpack (decodeUtf8 line))
164 )
165 let (dict, leftover) = (`MT.runState` line) do
166 chineseTrad <- MT.state $ breakOnSpace >>> first decodeUtf8
167 skipChar ' '
168 chineseSimpl <- MT.state $ breakOnSpace >>> first decodeUtf8
169 skipChar ' '
170 skipPrefix "["
171 pinyins <-
172 MT.state $
173 breakOnChar ']'
174 >>> first (\s -> s & ByteString.split (fromIntegral (fromEnum ' ')) <&> decodeUtf8)
175 skipPrefix "] /"
176 -- CorrectnessNote: some lines do not end with \r
177 -- hence make it optional.
178 MT.modify' \s -> s & ByteString.stripSuffix "\r" & fromMaybe s
179 skipSuffix "/"
180 english <- MT.gets \s -> s & ByteString.split (fromIntegral (fromEnum '/')) <&> decodeUtf8
181 MT.put mempty
182 let chinese = chineseSimpl
183 return $
184 ChineseDict $
185 Map.singleton chinese $
186 pure $
187 ChineseDictEntry
188 { chinese
189 , pinyins
190 , english
191 , hskLevel = Nothing
192 }
193 if not (ByteString.null leftover)
194 then error $ "parserLine: leftover: " <> show leftover
195 else loop (acc <> dict)
196 loop mempty
197
198 readChineseDict :: IO ChineseDict
199 readChineseDict = do
200 0 & toEnum & enumFrom & foldMapM readHSK
201
202 {-
203 where
204 manualDict = ChineseDict $ Map.fromListWith (<>) [ (chinese, e) | e@ChineseDictEntry{chinese} <- manualEntries ]
205 manualEntries =
206 [ ChineseDictEntry
207 { chinese = "心"
208 , pinyins = ["xīn"]
209 , english = ["heart"]
210 , hskLevel = Nothing
211 }
212 , ChineseDictEntry
213 { chinese = "果"
214 , pinyins = ["guǒ"]
215 , english = ["fruit"]
216 , hskLevel = Nothing
217 }
218 , ChineseDictEntry
219 { chinese = "冰"
220 , pinyins = ["bīng"]
221 , english = ["ice"]
222 , hskLevel = Nothing
223 }
224 , ChineseDictEntry
225 { chinese = "淇"
226 , pinyins = ["qí"]
227 , english = []
228 , hskLevel = Nothing
229 }
230 , ChineseDictEntry
231 { chinese = "淋"
232 , pinyins = ["lín"]
233 , english = []
234 , hskLevel = Nothing
235 }
236 , ChineseDictEntry
237 { chinese = "天"
238 , pinyins = ["tiān"]
239 , english = ["sky"]
240 , hskLevel = Nothing
241 }
242 , ChineseDictEntry
243 { chinese = "空"
244 , pinyins = ["kōng"]
245 , english = ["empty"]
246 , hskLevel = Nothing
247 }
248 , ChineseDictEntry
249 { chinese = "中"
250 , pinyins = ["zhōng"]
251 , english = ["middle"]
252 , hskLevel = Nothing
253 }
254 , ChineseDictEntry
255 { chinese = "架"
256 , pinyins = ["jià"]
257 , english = ["shelf"]
258 , hskLevel = Nothing
259 }
260 , ChineseDictEntry
261 { chinese = "飞"
262 , pinyins = ["fēi"]
263 , english = ["fly"]
264 , hskLevel = Nothing
265 }
266 , ChineseDictEntry
267 { chinese = "机"
268 , pinyins = ["jī"]
269 , english = ["machine"]
270 , hskLevel = Nothing
271 }
272 ]
273 -}
274
275 {-
276 readHSK :: Sys.IO [HSK]
277 readHSK = do
278 Sys.withFile "data/langs/mandarin/hsk.csv" Sys.ReadMode \csvFile -> do
279 let loopH !_ (CSV.Incremental.FailH _ errMsg) = do Sys.putStrLn errMsg; Sys.exitFailure
280 loopH acc (CSV.Incremental.PartialH k) = feed k csvFile >>= loopH acc
281 loopH _acc (CSV.Incremental.DoneH !h p) = p
282 p <- loopH [] CSV.Incremental.decodeByName
283 let loop !_ (CSV.Incremental.Fail _ errMsg) = do Sys.putStrLn errMsg; Sys.exitFailure
284 loop acc (CSV.Incremental.Many rs k) = feed k csvFile >>= loop (acc <> rs)
285 loop acc (CSV.Incremental.Done rs) = loop (acc <> rs)
286 loop [] p
287 -}
288
289 data Langue
290 = LangueFrançais
291 | LangueAnglais
292 | LangueMandarin
293 | LangueMandarinPinyin
294 | LanguePhonetic
295 deriving (Eq, Ord, Show)
296
297 data Rosetta = Rosetta
298 { rosettaPictures :: [File.FilePath]
299 , rosettaEntries :: [RosettaEntry]
300 }
301 deriving (Eq, Ord, Show)
302
303 data RosettaEntry = RosettaEntry
304 { rosettaEntryGlyphsText :: ShortText
305 , rosettaEntryGlyphsLangue :: Langue
306 }
307 deriving (Eq, Ord, Show)
308
309 rosetta :: ChineseDict -> Text -> Rosetta -> IO Builder
310 rosetta (ChineseDict chineseDict) title Rosetta{..} = do
311 -- FIXME: this absolute path is not reproducible out of my system
312 dataPath <- Self.getDataDir <&> File.normalise
313 return $ Blaze.renderMarkupBuilder do
314 H.docTypeHtml do
315 H.head do
316 H.title $ title & H.toHtml
317 H.link
318 ! HA.rel "stylesheet"
319 ! HA.type_ "text/css"
320 ! HA.href (toValue $ dataPath </> "styles/rosetta.css")
321 H.body do
322 "\n"
323 H.div ! classes ["rosetta"] $ do
324 forM_ (List.zip rosettaPictures rosettaEntries) \(rosettaPicture, RosettaEntry{..}) -> do
325 -- pTraceShowM (rosettaPicture, RosettaEntry{..})
326 "\n"
327 H.div ! classes ["rosetta-row", "rosetta-row-" <> show rosettaEntryGlyphsLangue] $ do
328 H.div ! classes ["rosetta-cell", "rosetta-cell-picture"] $ do
329 H.img ! HA.src ("file://" <> dataPath </> "images" </> "thumbnails" </> rosettaPicture & toValue)
330 H.div ! classes ["rosetta-cell", "rosetta-cell-words"] $ do
331 let atomWidth = case rosettaEntryGlyphsLangue of
332 LangueMandarin -> "1.5cm"
333 _ -> "1cm"
334 let cellSpace = \case
335 ' ' -> "writing-words-cell-space"
336 _ -> ""
337 H.div ! classes ["writing-words"] $ do
338 forM_ (rosettaEntryGlyphsText & ShortText.split (== ' ')) \writingWord -> do
339 let atomLength = writingWord & ShortText.length & show
340 H.div
341 ! classes
342 [ "writing-words-word"
343 ]
344 $ do
345 when (rosettaEntryGlyphsLangue == LangueMandarin) do
346 H.div
347 ! classes
348 [ "writing-words-row"
349 , "writing-words-" <> show LangueMandarinPinyin
350 ]
351 ! HA.style ("grid-template-columns: repeat(" <> atomLength <> ", " <> atomWidth <> ");" & toValue)
352 $ do
353 forM_ (writingWord & ShortText.unpack) \writingChar -> do
354 let ChineseDictEntry{pinyins} =
355 chineseDict
356 & Map.lookup (ShortText.singleton writingChar)
357 & fromMaybe (error $ "chineseDict missing: " <> [writingChar])
358 & List.head -- FIXME
359 H.div ! classes ["writing-words-cell", cellSpace writingChar] $ do
360 forM_ pinyins H.toHtml
361 H.div
362 ! classes
363 [ "writing-words-row"
364 , "writing-words-" <> show rosettaEntryGlyphsLangue
365 , "writing-words-row-model"
366 ]
367 ! HA.style ("grid-template-columns: repeat(" <> atomLength <> ", " <> atomWidth <> ");" & toValue)
368 $ do
369 forM_ (writingWord & ShortText.unpack) \writingChar -> do
370 H.div ! classes ["writing-words-cell", cellSpace writingChar] $ do
371 fromString [writingChar]
372 "\n"
373 H.div
374 ! classes
375 [ "writing-words-row"
376 , "writing-words-" <> show rosettaEntryGlyphsLangue
377 , "writing-words-row-input"
378 ]
379 ! HA.style ("grid-template-columns: repeat(" <> atomLength <> ", " <> atomWidth <> ");" & toValue)
380 $ do
381 forM_ (writingWord & ShortText.unpack) \writingChar -> do
382 H.div ! classes ["writing-words-cell", cellSpace writingChar] $ do
383 case rosettaEntryGlyphsLangue of
384 LangueMandarin -> fromString [writingChar]
385 _ -> " "
386 "\n"
387 "\n"