1 {-# LANGUAGE OverloadedLists #-}
3 module Utils.Pronunciation where
5 import Control.Monad.Trans.Class qualified as MT
6 import Control.Monad.Trans.State qualified as MT
7 import Data.ByteString.Builder qualified as ByteString.Builder
8 import Data.Char qualified as Char
9 import Data.GenValidity.Map ()
10 import Data.GenValidity.Sequence ()
11 import Data.GenValidity.Set ()
12 import Data.GenValidity.Text ()
13 import Data.List qualified as List
14 import Data.List.Zipper qualified as LZ
15 import Data.Map.Strict qualified as Map
16 import Data.Text qualified as Text
17 import Data.Text.Encoding qualified as Text
18 import Data.Text.Lazy qualified as Text.Lazy
19 import Data.Text.Lazy.Encoding qualified as Text.Lazy
20 import Data.Text.Short qualified as TextShort
21 import Data.Validity.Map ()
22 import Data.Validity.Set ()
23 import Data.Validity.Text ()
24 import Language.Pronunciation (Pronunciation (..), Pronunciations (..))
25 import Language.Pronunciation qualified as Pron
28 import Worksheets.Utils.HTML (Html)
29 import Worksheets.Utils.HTML qualified as HTML
30 import Worksheets.Utils.IPA qualified as IPA
31 import Worksheets.Utils.Prelude
33 cardsHtml :: String -> IO Html -> TestDefM (outers) () ()
34 cardsHtml title html = do
35 outPath <- goldenPath title "html"
36 builder <- html <&> HTML.renderMarkupBuilder & liftIO
38 goldenByteStringBuilderFile outPath (return builder)
40 lexerRender rules = do
42 outPath <- goldenPath title "html"
43 builder <- rules & Pron.tableHtml <&> HTML.renderMarkupBuilder & liftIO
45 goldenByteStringBuilderFile outPath (return builder)
47 pronunciationDecompositionTest :: Pron.Table -> String -> TestDefM (outers) () ()
48 pronunciationDecompositionTest rules sentence = do
49 outPath <- goldenPath sentence "txt"
51 pureGoldenByteStringBuilderFile outPath $
52 case sentence & Text.pack & Pron.run rules of
57 Left c -> [Text.singleton c, "\n"] & mconcat
58 Right Pron.Pron{pronInput, pronRule} ->
59 [ pronInput & Pron.lexemesChars & Text.pack
61 , pronRule & Pron.rulePron & Pron.unPronunciations & foldMap (snd >>> Pron.pronunciationText)
67 & ByteString.Builder.byteString
68 err@Left{} -> pShowNoColor err & Text.Lazy.encodeUtf8 & ByteString.Builder.lazyByteString
71 pronunciationResultTest :: Pron.Table -> String -> TestDefM (outers) () ()
72 pronunciationResultTest rules sentence = do
73 outPath <- goldenPath sentence "txt"
75 pureGoldenByteStringBuilderFile outPath $
76 case sentence & Text.pack & Pron.run rules of
78 x & foldMap \Pron.Pron{pronInput, pronRule} ->
81 & ByteString.Builder.byteString
82 err@Left{} -> pShowNoColor err & Text.Lazy.encodeUtf8 & ByteString.Builder.lazyByteString
85 pronunciationParserTest ::
87 Map Pron.InputLexemes Pron.Pronunciation ->
88 TestDefM (outers) () ()
89 pronunciationParserTest rules sentences = do
90 forM_ (sentences & Map.toList) \(inp, exp :: Pron.Pronunciation) -> do
91 it (inp & Pron.unInputLexemes & Pron.lexemesChars & show) do
92 let got :: Pron.Pronunciation =
95 & Pron.runParser rules
101 then errorShow ("missing phonetic for" :: Text, c, "in" :: Text, inp)
106 { pronunciationIPABroad = []
107 , pronunciationText = Text.singleton c
111 (Pron.pronRule >>> Pron.rulePron)
113 & Pron.joinPronunciations
114 & Pron.unPronunciations
117 Pron.pronunciationText got
118 `shouldBe` Pron.pronunciationText exp
123 & foldMap \Pron.ExampleLiteral{..} ->
124 [ [TextShort.unpack exampleLiteralText]
125 , if TextShort.null exampleLiteralMeaning
127 else ["{", exampleLiteralMeaning & TextShort.unpack, "}"]
131 outPath <- goldenPath baseName "txt"
133 pureGoldenByteStringBuilderFile outPath $
134 case sentence & Pron.parseLiterals rules of
139 Left c -> [Text.singleton c, "\n"] & mconcat
140 Right Pron.Pron{pronInput, pronRule} ->
141 -- Chunk{chunkInterval = trans, chunkMachine = Pron.Machine{machinePron = pron}}
142 [ pronRule & Pron.rulePron & Pron.unPronunciations & foldMap (snd >>> Pron.pronunciationText)
144 , show pronInput & Text.pack
150 & ByteString.Builder.byteString
151 err@Left{} -> pShowNoColor err & Text.Lazy.encodeUtf8 & ByteString.Builder.lazyByteString
154 data Lex = LexChar Char | LexBorder
158 { statePrev :: [(Lex, [Char])]
163 readFrench :: [Char] -> [(Lex, [Char])]
164 readFrench inp = go State{statePrev = [], stateNext = (LexChar <$> inp) <> [LexBorder]}
166 goMany fts State{statePrev, stateNext = n : next}
167 | let stack = n : (statePrev <&> fst)
168 , stack & List.isPrefixOf (fts <&> fst & List.reverse) =
169 -- \|| traceShow (stack, fts) False
170 Just State{statePrev = List.reverse fts <> List.drop (List.length fts - 1) statePrev, stateNext = next}
173 go State{statePrev, stateNext = []} = statePrev & List.reverse
174 go (goMany [LexChar 'C' := "s", LexChar 'E' := "ə"] -> Just st) = go st
175 go (goMany [LexChar 'C' := "", LexChar 'H' := "ʃ"] -> Just st) = go st
176 go (goMany [LexChar 'S' := "", LexChar 'S' := "s"] -> Just st) = go st
177 go (goMany [LexChar 'A' := "", LexChar 'N' := "ɑ̃"] -> Just st) = go st
178 go (goMany [LexChar 'A' := "", LexChar 'U' := "o"] -> Just st) = go st
179 go (goMany [LexChar 'E' := "", LexChar 'I' := "ɛ"] -> Just st) = go st
191 go (goMany [LexChar 'T' := "", LexChar 'T' := "t"] -> Just st) = go st
192 go (goMany [LexChar 'E' := "", LexChar 'T' := "t"] -> Just st) = go st
193 go (goMany [LexChar 'E' := "", LexBorder := ""] -> Just st) = go st
194 go (goMany [LexBorder := ""] -> Just st) = go st
195 go (goMany [LexChar 'A' := "a"] -> Just st) = go st
196 go (goMany [LexChar 'B' := "b"] -> Just st) = go st
197 go (goMany [LexChar 'C' := "k"] -> Just st) = go st
198 go (goMany [LexChar 'H' := ""] -> Just st) = go st
199 go (goMany [LexChar 'I' := "i"] -> Just st) = go st
200 go (goMany [LexChar 'L' := "l"] -> Just st) = go st
201 go (goMany [LexChar 'O' := "ɔ"] -> Just st) = go st
202 go (goMany [LexChar 'T' := "t"] -> Just st) = go st
203 go (goMany [LexChar 'N' := "n"] -> Just st) = go st
204 go (goMany [LexChar 'R' := "ʁ"] -> Just st) = go st
205 go (goMany [LexChar 'S' := "s"] -> Just st) = go st
206 go (goMany [LexChar 'E' := "ə"] -> Just st) = go st
207 go (goMany [LexChar 'É' := "e"] -> Just st) = go st
208 go (goMany [LexChar 'U' := "y"] -> Just st) = go st
209 go (goMany [LexChar 'V' := "v"] -> Just st) = go st
213 go (('C' := _):prev) (n@'E':next) = go ((n := "ə"):('C' := "s"):prev) next
214 go (('S' := _):prev) ('S':next) = go (('S' := ""):('S' := "s"):prev) next
215 go (((isVowel->True) := _):prev) ('S':next@((isVowel->True):_)) = go (('S' := "z"):prev) next
216 go prev next = errorShow (prev, next)
219 isConsonant = isVowel >>> not
232 describe "Pronunciation" do
236 `shouldBe` [ LexChar 'A' := "a"
246 readFrench "ASSURANCE"
247 `shouldBe` [ LexChar 'A' := "a"
253 , LexChar 'N' := "ɑ̃"
259 readFrench "CHEVAUCHE"
260 `shouldBe` [ LexChar 'C' := ""
272 readFrench "NÉCESSAIRE"
273 `shouldBe` [ LexChar 'N' := "n"
287 -- buildAna "a" `shouldBe` Ana [ "a" := ["a"] ]
288 -- buildAna "as" `shouldBe` Ana [ "as" := ["az"] ]
289 -- buildAna "ass" `shouldBe` Ana [ "ass" := ["as"] ]
292 `shouldBe` [ LexChar 'C' := "k"
303 -- buildAna "a" `shouldBe` Ana [ "a" := ["a"] ]
304 -- buildAna "as" `shouldBe` Ana [ "as" := ["az"] ]
305 -- buildAna "ass" `shouldBe` Ana [ "ass" := ["as"] ]
307 -- ana_assurance `shouldBe` ana_assurance
308 -- outPath <- goldenPath "1" "txt"
310 -- pureGoldenByteStringBuilderFile outPath $
313 -- & Text.Lazy.encodeUtf8
314 -- & ByteString.Builder.lazyByteString
316 describe "sentence" do
317 forM_ sentences \sentence -> do
318 outPath <- goldenPath sentence "txt"
320 pureGoldenByteStringBuilderFile outPath $
321 case sentence & Text.pack & parse patterns of
322 err@Left{} -> pShowNoColor err & Text.Lazy.encodeUtf8 & ByteString.Builder.lazyByteString
326 & Text.Lazy.encodeUtf8
327 & ByteString.Builder.lazyByteString
332 Left c -> [Text.singleton c, "\n"] & mconcat
333 Right Pron.Pron{pronInput, pronRule} ->
334 [ pronInput & Pron.lexemesChars & Text.pack
336 , pronRule & Pron.rulePron & Pron.unPronunciations & foldMap (snd >>> Pron.pronunciationText)
342 & ByteString.Builder.byteString
346 sentences :: [String]
353 type (:=) a b = (a, b)
354 data Inp = InpChar Char | InpBorder
355 ana1 :: [Inp] -> [Inp := [[Inp] := [IPA.Syllable []]]]
358 go :: [Inp] -> [Inp := [[Inp] := [IPA.Syllable []]]]
360 go (i@(InpChar 'a') : next) = [i := [[i] := ["a"]]] <> go next
363 data Dap k a = Dap { dapDefault :: a, dap :: Map k a }
364 type Phon = [IPA.Syllable []]
365 type CaseNextString = Dap String CasePrevPhon
366 type CasePrevPhon = Dap CasePhon CaseNextPhon
367 type CaseNextPhon = Dap CasePhon Phon
368 cases :: CaseNextString
370 [ "a" := Dap ["a"] []
371 , "au" := Dap ["o"] []
372 , "ay" := Dap ["ɛ"] []
373 , "ai" := Dap ["ɛ"] []
381 type Phon = IPA.Syllable []
382 type AnaMap = ([Char], [Phon])
384 { anaDecomp :: [AnaMap]
385 } deriving (Eq, Show)
389 -- as | az | vowel + s = z
390 -- ass | as | s + s = s
408 , "ssaire" := ["sɛʁ"]
411 buildAna :: String -> Ana
412 buildAna = List.foldl' (\acc c -> appendAna acc (InpChar c)) (Ana [])
414 appendAna :: Ana -> Inp -> Ana
415 appendAna (Ana []) (InpChar c) = case c of
416 'a' -> Ana [ "a" := ["a"] ]
417 'b' -> Ana [ "b" := ["be"] ]
418 'c' -> Ana [ "c" := ["se"] ]
419 'd' -> Ana [ "d" := ["de"] ]
420 'e' -> Ana [ "e" := ["ə"] ]
421 'f' -> Ana [ "f" := ["ɛf"] ]
422 'g' -> Ana [ "g" := ["g"] ]
423 'h' -> Ana [ "h" := ["aʃ"] ]
424 'i' -> Ana [ "i" := ["i"] ]
425 'j' -> Ana [ "j" := ["ʒi"] ]
426 'k' -> Ana [ "k" := ["ka"] ]
427 'l' -> Ana [ "l" := ["ɛl"] ]
428 'm' -> Ana [ "m" := ["ɛm"] ]
429 'n' -> Ana [ "n" := ["ɛn"] ]
430 'o' -> Ana [ "o" := ["o"] ]
431 'p' -> Ana [ "p" := ["pe"] ]
432 'q' -> Ana [ "q" := ["ky"] ]
433 'r' -> Ana [ "r" := ["ɛʁ"] ]
434 's' -> Ana [ "s" := ["ɛs"] ]
435 't' -> Ana [ "t" := ["te"] ]
436 'u' -> Ana [ "u" := ["y"] ]
437 'v' -> Ana [ "v" := ["ve"] ]
438 'w' -> Ana [ "w" := ["dubl", "ve"] ]
439 'x' -> Ana [ "x" := ["ɛks"] ]
440 'y' -> Ana [ "y" := ["igʁɛk"] ]
441 'z' -> Ana [ "z" := ["zɛd"] ]
444 (anaPrevPhonSeg -> Just IPA.Vowel{})
445 (InpChar 's') = (Ana ["as":=["az"]])
446 appendAna ana (InpChar 's')
447 | ana & anaPrevSuffix "s"
448 = ana & anaPrevMap \case
449 (List.isSuffixOf "s" -> True) := _ -> "ss" := ["s"]
450 appendAna x y = errorShow (x, y)
452 anaPrev (Ana xs) = xs & lastMaybe
453 anaPrevMap :: (AnaMap -> AnaMap) -> Ana -> Ana
454 anaPrevMap f (Ana l) =
455 case l & List.reverse of
456 x : xs -> Ana $ (f x : xs) & List.reverse
459 anaPrevSuffix :: [Char] -> Ana -> Bool
460 anaPrevSuffix suf ana = ana & anaPrev & \case
461 Just (cs := _) -> cs & List.isSuffixOf suf
463 anaPrevPhonSeg x = x & anaPrev >>= \case
465 | Just lastPhon <- phon & lastMaybe
466 , segs <- lastPhon & IPA.syllableToSegments
467 , Just lastSeg <- segs & lastMaybe <&> IPA.dropSegmentalFeatures
472 , inputPhonetic :: [IPA.Syllable []]
473 , inputMeaning :: Maybe ShortText
475 data Pos = PosPrev | PosNext
476 deriving (Eq, Ord, Show)
482 deriving (Eq, Ord, Show)
485 = PatTreeBranches (Map Step PatTree)
486 | PatTreeLeaf Pronunciations
488 instance IsList PatTree where
489 type Item PatTree = (Step, PatTree)
490 fromList = PatTreeBranches . Map.fromListWith (errorShow)
500 = ParseReadChar (Map Char Parse)
501 | ParsePhon [IPA.Syllable []]
507 { molLangues :: Set String
509 , molReading :: String
510 , molPhoneticPrepend :: Map MolPat [IPA.Syllable []]
518 deriving (Eq, Ord, Show)
521 { chunkLangue :: [String]
522 , chunkLetters :: String
523 , chunkPhonetic :: [IPA.Syllable []]
524 } deriving (Eq, Ord, Show)
531 { molLangues = ["french"]
534 , molPhoneticPrepend =
535 [ MolPatChars "" := ["ɛs"]
536 , MolPatChars "s" := ["s"]
540 -- assurances -> a.ssu.rances
545 mergeMol :: Mol -> Mol -> Mol
549 { chunkLangue = ["french"]
554 -- exposeChars :: [Char]
555 -- exposePhons :: [IPA.Syllables []]
557 data Merge = Merge Chunk Chunk Chunk
558 deriving (Eq, Ord, Show)
560 runMerge :: [Merge] -> Chunk -> Chunk -> Chunk
563 mergeFrench :: [Merge]
565 [ merge french_a french_n ["ɑ̃"]
568 merge x y chunkPhonetic =
571 { chunkLangue = chunkLangue x
572 , chunkLetters = chunkLetters x <> chunkLetters y
579 -- Consonant + Char =
580 -- Char + Consonant =
582 french_a = french { chunkLetters = "a", chunkPhonetic = ["a"] }
583 french_b = french { chunkLetters = "b", chunkPhonetic = ["be"] }
584 french_c = french { chunkLetters = "c", chunkPhonetic = ["se"] }
585 french_d = french { chunkLetters = "d", chunkPhonetic = ["de"] }
586 french_e = french { chunkLetters = "e", chunkPhonetic = ["ə"] }
587 french_f = french { chunkLetters = "f", chunkPhonetic = ["ɛf"] }
588 french_g = french { chunkLetters = "g", chunkPhonetic = ["g"] }
589 french_h = french { chunkLetters = "h", chunkPhonetic = ["aʃ"] }
590 french_i = french { chunkLetters = "i", chunkPhonetic = ["i"] }
591 french_j = french { chunkLetters = "j", chunkPhonetic = ["ʒi"] }
592 french_k = french { chunkLetters = "k", chunkPhonetic = ["ka"] }
593 french_l = french { chunkLetters = "l", chunkPhonetic = ["ɛl"] }
594 french_m = french { chunkLetters = "m", chunkPhonetic = ["ɛm"] }
595 french_n = french { chunkLetters = "n", chunkPhonetic = ["ɛn"] }
596 french_o = french { chunkLetters = "o", chunkPhonetic = ["o"] }
597 french_p = french { chunkLetters = "p", chunkPhonetic = ["pe"] }
598 french_q = french { chunkLetters = "q", chunkPhonetic = ["ky"] }
599 french_r = french { chunkLetters = "r", chunkPhonetic = ["ɛʁ"] }
600 french_s = french { chunkLetters = "s", chunkPhonetic = ["ɛs"] }
601 french_t = french { chunkLetters = "t", chunkPhonetic = ["te"] }
602 french_u = french { chunkLetters = "u", chunkPhonetic = ["y"] }
603 french_v = french { chunkLetters = "v", chunkPhonetic = ["ve"] }
604 french_w = french { chunkLetters = "w", chunkPhonetic = ["dubl", "ve"] }
605 french_x = french { chunkLetters = "x", chunkPhonetic = ["ɛks"] }
606 french_y = french { chunkLetters = "y", chunkPhonetic = ["igʁɛk"] }
607 french_z = french { chunkLetters = "z", chunkPhonetic = ["zɛd"] }
609 type Patterns = Map Step PatTree
611 patterns :: Map Step PatTree
613 [ Step PosNext (PatChar 'a') :=
614 [ Step PosNext PatLexicalBorder :=
615 PatTreeLeaf ["a" := "ə"]
617 , Step PosNext (PatChar 't') :=
618 [ Step PosNext (PatChar 'h') :=
619 [ Step PosNext (PatChar 'e') :=
620 [ Step PosNext (PatLexicalCategory Char.Space) :=
621 PatTreeLeaf ["the" := "zi"]
628 { stateInput :: LZ.Zipper Inp
629 , stateBuffer :: [Step]
630 , statePats :: Patterns
631 , statePatReset :: Bool
636 | PatLexicalCategory Char.GeneralCategory
639 | PatPhoneticSemiVowel
640 | PatPhoneticConsonant
641 deriving (Eq, Ord, Show)
645 , inpPronunciations :: Pronunciations
649 parse :: Patterns -> Text -> Either String [Inp]
650 parse initPats input =
653 { stateInput = input & Text.unpack & fmap charToInp & LZ.fromList
655 , statePats = initPats
656 , statePatReset = True
662 charToInp :: Char -> Inp
665 { inpSteps = [Step { stepPos = PosNext, stepPat = PatChar c }]
666 , inpPronunciations = []
668 loop :: State -> State
671 | (key, val) <- st & statePats & Map.toList
675 & fromMaybe (loop st{statePats = initPats})
676 look :: (Step, PatTree) -> State -> Maybe State
677 look kv@(key, val) st =
679 Step{stepPos = PosPrev, stepPat = patPrev} -> errorShow ("prev" :: Text)
680 Step{stepPos = PosNext, stepPat = patNext} ->
683 | stateInput st & LZ.endp -> match kv st
685 -- | Just inpNext <- stateInput st & LZ.safeCursor ->
686 -- inpNext & inpSteps &
690 match kv@(key, val) st =
695 { statePats = initPats
701 { inpSteps = stateBuffer st & (key :) & List.reverse
702 , inpPronunciations = pron
705 PatTreeBranches pats ->
709 , stateBuffer = key : stateBuffer st