1 {-# LANGUAGE OverloadedLists #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE StandaloneDeriving #-}
5 module Language.Pronunciation where
7 import Control.Applicative (asum)
8 import Data.List qualified as List
9 import Data.Map.Strict qualified as Map
10 import Data.Set qualified as Set
11 import Data.Text qualified as Text
12 import Data.Text.Lazy qualified as TextLazy
14 import Paths_worksheets qualified as Self
15 import System.FilePath.Posix ((</>))
16 import System.FilePath.Posix qualified as File
17 import Text.Blaze.Html5.Attributes qualified as HA
18 import Worksheets.Utils.Char qualified as Char
19 import Worksheets.Utils.HTML (className, classes, cm, styles, (!))
20 import Worksheets.Utils.HTML qualified as HTML
21 import Worksheets.Utils.IPA qualified as IPA
22 import Worksheets.Utils.Paper qualified as Paper
23 import Worksheets.Utils.Prelude
24 import Prelude (error)
26 -- import Data.Radix1Tree.Word8.Key.Unsafe qualified as RT
27 -- import Data.Radix1Tree.Word8.Strict qualified as RT
29 -- import Data.List.Split qualified as Split
31 -- radixFromList :: [(Text, a)] -> RT.Radix1Tree a
32 -- radixFromList = foldr (\(k, a) p -> RT.insert (RT.unsafeFeedText k) a p) RT.empty
34 data Pronunciation = PronunciationIPABroad
35 { pronunciationText :: Text
36 , pronunciationIPA :: IPA.Syllable []
38 deriving (Eq, Ord, Show)
39 instance IsString Pronunciation where
41 "" -> PronunciationIPABroad "" $ IPA.Syllable []
42 s -> PronunciationIPABroad (s & Text.pack) $ fromString s
46 , -- , lexemeNext :: Text
47 lexemePron :: Pronunciation
49 deriving (Eq, Show, Generic)
51 data LexemePron = LexemePron
52 { lexemePronunciation :: Pronunciation
53 , lexemeExample :: [Literal]
57 data Literal = Literal
58 { literalText :: ShortText
59 , literalTags :: Set LiteralTag
61 deriving (Eq, Ord, Show)
62 instance IsString Literal where
65 { literalText = s & fromString
66 , literalTags = Set.empty
72 deriving (Eq, Ord, Show)
77 , literalTags = [LiteralTagMeta] & Set.fromList
79 occurence lit = lit{literalTags = lit & literalTags & Set.insert LiteralTagOccurence}
80 silent lit = lit{literalTags = lit & literalTags & Set.insert LiteralTagSilent}
82 type SyllableText = ShortText
83 type SyllableBroad = IPA.Syllable []
84 type SyllablesTable = Map SyllableText (Map SyllableText [([Literal], SyllableBroad)])
86 type ContextToLexemePron = (LexemePron, Map (IPA.Syllable []) LexemePron)
87 data Machine = Machine
88 { machinePron :: Maybe Pronunciation
89 , machineExample :: Set [Literal]
90 , machineAlts :: Map Trans Machine
91 -- ^ Those 'Machine's must not be recursive.
96 { machinePron = Nothing
97 , machineExample = Set.empty
98 , machineAlts = Map.empty
102 instance Monoid Machine where
105 , machineExample = []
106 , machineAlts = Map.empty
108 instance Semigroup Machine where
110 { machinePron = if machinePron x == machinePron y
112 else errorShow (machinePron x, machinePron y)
113 , machineExample = machineExample x <> machineExample y
114 , machineAlts = Map.unionWith (<>) (machineAlts x) (machineAlts y)
117 machineExamples :: Machine -> Set [Literal]
118 machineExamples mach =
120 & foldMap machineExamples
121 & Set.union (machineExample mach)
129 { chunkInterval :: [Trans]
130 , chunkInputNext :: [Char]
131 , chunkMachine :: Machine
133 deriving (Eq, Show, Generic)
135 chunkText :: Chunk -> Text
136 chunkText Chunk{chunkInterval} =
140 TransConsume (ObservChar c) -> c : acc
146 chunkPronunciation :: Chunk -> Text
147 chunkPronunciation Chunk{chunkMachine} =
150 & maybe "" pronunciationText
152 chunksWords :: [Chunk] -> [[Chunk]]
154 chunksWords chks = word0 : chunksWords next
156 (word0, rest) = chks & List.span (isSep >>> not)
157 (_sep, next) = rest & List.span isSep
158 isSep chk = chk & chunkInterval & List.elem (TransConsume ObservSpace)
160 inputToObservs :: Text -> [Observ]
161 inputToObservs t = ObservBorder : Text.foldr (\c acc -> ObservChar c : acc) [ObservBorder] t
167 transChunk :: Trans -> Chunk -> Maybe Chunk
168 transChunk obs chunk = do
169 nextMachine <- chunk & chunkMachine & machineAlts & Map.lookup trans
171 { chunkMachine = nextMachine
172 , chunkInterval = trans : chunkInterval chunk
173 , chunkInputNext = chunkInputNext chunk
178 & transChunk (TransLookAhead ObservBorder)
180 readChunk :: Bool -> Chunk -> Maybe Chunk
181 readChunk isBorder chunk =
182 case chunk & chunkInputNext of
183 [] | not isBorder -> chunk & borderChunk >>= readChunk False
184 | otherwise -> Just chunk
186 | not isBorder && or [Char.isSpace i, ]
189 consumeChar :: State -> Chunk -> Maybe Chunk
190 consumeChar st chunk = Nothing
191 | st == StateBorder = (chunk, [])
192 , otherwise = chunk & advanceChunk ObservBorder
193 consumeChar (c:cs) chunk
194 | st == StateBorder =
198 | Just nextMach <- jumps & Map.lookup (TransLookAhead ObservBorder) ->
200 { chunkInputNext = obs
201 , chunkMachine = nextMach
202 , chunkInterval = TransLookAhead ObservBorder : chunkInterval chunk
205 | Just nextMach <- jumps & Map.lookup (ObservChar c) ->
207 { chunkInputNext = obs
208 , chunkMachine = nextMach
209 , chunkInterval = TransConsume ob : chunkInterval chunk
212 , Just nextMach <- jumps & Map.lookup ObservSpace ->
214 { chunkInputNext = obs
215 , chunkMachine = nextMach
216 , chunkInterval = TransConsume ob : chunkInterval chunk
218 | Char.isPunctuation c
219 , Just nextMach <- jumps & Map.lookup ObservPunctuation ->
221 { chunkInputNext = obs
222 , chunkMachine = nextMach
223 , chunkInterval = TransConsume ob : chunkInterval chunk
226 jumps = chunk & chunkMachine & machineAlts
227 piorities :: [Observ]
233 [ TransConsume o | o <- obsClasses ]
234 [ TransLookAhead o | o <- obsClasses ]
238 [ [ObservSpace | Char.isSpace c]
239 , [ObservPunctuation | Char.isPunctuation c]
240 , [ObservMark | Char.isMark c]
241 , [ObservSymbol | Char.isSymbol c]
242 , [ObservSeparator | Char.isSeparator c]
246 runMachine :: Machine -> Text -> Either ([Trans], Text) [Chunk]
247 runMachine machInit = inputToObservs >>> loop True [] initChunk
252 , chunkInputNext = inp
253 , chunkMachine = machInit
255 loop :: Bool -> [Chunk] -> Chunk -> [Observ] -> Either ([Trans], Text) [Chunk]
256 loop isInit prevChunks chunk currentInput =
257 -- traceShow ("runMachine.loop" :: Text, isInit, chunk{chunkMachine = Machine "" [] []}, currentInput) $
259 then Right [chunk & chunkUpdateBehind prevChunks & chunkUpdateAhead []]
261 case chunk & updateChunkWithInput currentInput of
262 Just (newChunk, inputNext) -> loop False prevChunks newChunk inputNext
264 | isInit -> Left (chunk & chunkInterval, currentInput & Text.take 10)
266 -- Reset the machine to process the next chunk
267 nextChunks <- loop True (chunk : prevChunks) (initChunk currentInput) currentInput
268 Right $ (chunk & chunkUpdateBehind prevChunks & chunkUpdateAhead nextChunks) : nextChunks
270 updateChunkWithInput :: Text -> Chunk -> Maybe (Chunk, Text)
271 updateChunkWithInput input chunk =
272 -- traceShow ("updateChunkWithInput" :: Text, chunk{chunkMachine = Machine "" [] []}) $
273 [ (matchingLength, (newChunk, inputRest))
274 | (trans, transMach) <- chunk & chunkMachine & machineAlts & Map.toList
275 , let matchingLength = transMatchingLength input trans
276 , 0 < matchingLength || not (isTransConsume trans)
277 , -- , transMach & machinePron & isJust
278 let (inputRead, inputRest) = input & Text.splitAt matchingLength
281 { -- chunkText = chunkText chunk <> inputRead
282 chunkMachine = transMach
283 , chunkInterval = trans : chunkInterval chunk
286 & Map.fromListWith (\new old -> old)
290 transMatchingLength :: Text -> Trans -> Int
291 transMatchingLength inp = \case
295 | Text.isPrefixOf prefix inp -> Text.length prefix
296 | Text.isPrefixOf prefix (inp & Text.take (Text.length prefix) & Text.toLower) -> Text.length prefix
299 | Just (c0, _) <- Text.uncons inp
304 | Just (c0, _) <- Text.uncons inp
305 , Char.isPunctuation c0 ->
309 | Just (c0, _) <- Text.uncons inp
314 | Just (c0, _) <- Text.uncons inp
315 , Char.isNumber c0 ->
319 | Just (c0, _) <- Text.uncons inp
320 , Char.isSeparator c0 ->
324 | Just (c0, _) <- Text.uncons inp
325 , Char.isSymbol c0 ->
331 -- transNextOrId :: Trans -> Machine -> Machine
332 -- transNextOrId trans mach = mach & machineAlts & Map.findWithDefault mach trans
333 filterTransConsume = List.filter (\case TransConsume{} -> True; _ -> False)
334 isTransConsume = \case
335 TransConsume{} -> True
337 applyTrans :: Trans -> Chunk -> Chunk
338 applyTrans trans chunk =
339 case chunk & chunkMachine & machineAlts & Map.lookup trans of
342 -- \| Nothing <- nextMachine & machinePron -> chunk
345 { chunkMachine = nextMachine
346 , chunkInterval = trans : chunkInterval chunk
347 -- , chunkText = chunkText chunk
349 applyTransSeq :: [Trans] -> Chunk -> Chunk
350 applyTransSeq [] chunk = chunk
351 applyTransSeq (t : ts) chunk = chunk & applyTrans t & applyTransSeq ts
353 chunkUpdateAhead :: [Chunk] -> Chunk -> Chunk
354 chunkUpdateAhead aheadChunks chunk =
355 -- traceShow ("chunkUpdateAhead"::Text, chunk & chunkText, aheadObservs) $
356 chunk & applyTransSeq [TransLookAhead x | x <- mconcat aheadObservs]
358 aheadObservs :: [[Observ]]
360 [ -- specific observation
362 Chunk{chunkInterval = List.reverse . filterTransConsume -> TransConsume obs : _} : _ -> [obs]
364 , -- border observation
366 Chunk{chunkInterval = List.reverse . filterTransConsume -> TransConsume obs : _} : _ ->
368 ObservSpace -> [ObservBorder]
369 ObservNumber -> [ObservBorder]
370 ObservMark -> [ObservBorder]
371 ObservPunctuation -> [ObservBorder]
375 , -- vowel/semivowel/consonant observation
377 Chunk{chunkMachine = Machine{machinePron = Just PronunciationIPABroad{pronunciationIPA = IPA.syllableToSegments -> seg : _}}} : _ ->
379 IPA.Vowel{} -> [ObservVowel]
380 IPA.Consonant (IPA.Pulmonic _phonation _place IPA.Approximant) -> [ObservSemiVowel]
381 IPA.Consonant (IPA.Ejective _place IPA.Approximant) -> [ObservSemiVowel]
382 _ -> [ObservConsonant]
386 chunkUpdateBehind :: [Chunk] -> Chunk -> Chunk
387 chunkUpdateBehind behindChunks chunk =
388 -- traceShow ("chunkUpdateBehind"::Text, chunk & chunkText, behindObservs, behindChunks <&> chunkText) $
389 chunk & applyTransSeq [TransLookBehind x | x <- mconcat behindObservs]
391 behindObservs :: [[Observ]]
393 [ -- specific observation
395 Chunk{chunkInterval = filterTransConsume -> TransConsume obs : _} : _ -> [obs]
397 , -- border observation
399 Chunk{chunkInterval = filterTransConsume -> TransConsume obs : _} : _ ->
401 ObservSpace -> [ObservBorder]
402 ObservNumber -> [ObservBorder]
403 ObservMark -> [ObservBorder]
404 ObservPunctuation -> [ObservBorder]
408 , -- vowel/semivowel/consonant observation
410 Chunk{chunkMachine = Machine{machinePron = Just PronunciationIPABroad{pronunciationIPA = IPA.syllableToSegments >>> List.reverse -> seg : _}}} : _ ->
412 IPA.Vowel{} -> [ObservVowel]
413 IPA.Consonant (IPA.Pulmonic _phonation _place IPA.Approximant) -> [ObservSemiVowel]
414 IPA.Consonant (IPA.Ejective _place IPA.Approximant) -> [ObservSemiVowel]
415 _ -> [ObservConsonant]
421 = TransConsume Observ
422 | TransLookAhead Observ
423 | TransLookBehind Observ
424 | TransDefinition Text
425 deriving (Eq, Ord, Show)
439 deriving (Eq, Ord, Show)
440 instance IsString [Trans] where
441 fromString = fmap (TransConsume . ObservChar)
443 data LexerDict = LexerDict
444 { lexerDictMap :: Map Text (Map PronContext LexemePron)
445 , lexerDictMaxKeyLength :: Int
449 contextToLexemePron :: [(Text, ContextToLexemePron)] -> [(Text, LexemePron)]
450 contextToLexemePron =
453 \(key, (defLex, nextTolex))
454 acc@((nextKey,nextLex):as) ->
455 Map.findWithDefault defLex nextLex nextToLex:acc
460 borderRightChar = '⌞'
461 borderLeftText = borderLeftChar & Text.singleton
462 borderRightText = borderRightChar & Text.singleton
463 borderInnerText = borderRightText <> borderLeftText
466 mapLookupLonguest :: Text -> LexerDict -> Maybe Lexeme
467 mapLookupLonguest inp LexerDict{..} =
468 [ lexerDictMap & Map.lookup (key & Text.toLower)
470 & foldMap \nextToVariant ->
472 nextToVariant & Map.keys
476 & fromMaybe (error "empty map")
478 [ nextToVariant & Map.lookup (next & Text.toLower)
479 <&> (\variant -> Lexeme{lexemeKey=key, lexemeNext=next, lexemePron=variant & lexemePronunciation})
481 | next <- inp & Text.drop (key & Text.length)
486 | key <- inp & Text.take lexerDictMaxKeyLength
489 , not $ Text.null key
495 lexerInit :: Text -> Text
497 borderLeftText <> innerInput <> borderRightText
499 innerInput = input & Text.replace " " borderInnerText
501 type LexerError = (Text, [Text])
504 = PronContextStressed
505 | PronContextUnstressed
506 | PronContextBeforeBorder
507 | PronContextBeforeAnyVowel
508 | -- | Aka. semi-vowels
509 PronContextBeforeAnySemiVowel
510 | PronContextBeforeAnyConsonant
511 | PronContextBeforeAny
512 | PronContextBeforeSegment IPA.Segment
513 deriving (Eq, Ord, Show)
514 deriving instance Ord (IPA.Syllable [])
515 deriving instance Ord IPA.SuprasegmentalFeature
516 deriving instance Ord IPA.SegmentalFeature
517 deriving instance Ord IPA.Sibilance
518 deriving instance Ord IPA.Manner
519 deriving instance Ord IPA.Phonation
520 deriving instance Ord IPA.Roundedness
521 deriving instance Ord IPA.Height
522 deriving instance Ord IPA.Vowel
523 deriving instance Ord IPA.Consonant
524 deriving instance Ord IPA.Segment
526 lexer :: LexerDict -> Text -> Either LexerError [Lexeme]
527 lexer dict input = input & lexerInit & splitChunks <&> foldChunks
529 nextChunk :: Text -> Maybe (Text, Map PronContext LexemePron)
531 [ dict & lexerDictMap & Map.lookup (key & Text.toLower) <&> (key,) & maybeToList
534 & Text.take (dict & lexerDictMaxKeyLength)
537 , not $ Text.null key
541 splitChunks :: Text -> Either LexerError [(Text, Map PronContext LexemePron)]
546 Nothing -> Left (inp, keys)
550 & Text.take (dict & lexerDictMaxKeyLength)
553 Just kv@(key, _contextToVariant)
557 else case splitChunks inpNext of
558 Left (rest, keys) -> Left (rest, keys)
559 Right res' -> Right (kv : res')
561 inpNext = inp & Text.drop (Text.length key)
562 dropZeros :: [Lexeme] -> [Lexeme]
563 dropZeros = List.dropWhile \case
564 Lexeme{lexemePron = PronunciationIPABroad{pronunciationIPA = IPA.Syllable [IPA.Zero{}]}} -> True
566 foldChunks :: [(Text, Map PronContext LexemePron)] -> [Lexeme]
569 ( \(chunk, contextToVariant) afters ->
571 fromMaybe (Lexeme chunk $ PronunciationIPABroad "" (IPA.Syllable [IPA.Zero])) $
576 Lexeme{lexemeKey} : _ ->
577 case lexemeKey & Text.unpack of
579 | c & Char.isAlphaNum & not ->
581 & Map.lookup PronContextBeforeBorder
582 <&> \LexemePron{lexemePronunciation} -> Lexeme chunk lexemePronunciation
584 , case afters & dropZeros of
586 Lexeme{lexemePron = PronunciationIPABroad{pronunciationIPA = pron}} : _ ->
587 case pron & IPA.syllableToSegments of
589 | Just LexemePron{lexemePronunciation} <-
590 contextToVariant & Map.lookup (PronContextBeforeSegment seg) ->
591 Just $ Lexeme chunk lexemePronunciation
594 & Map.lookup PronContextBeforeAnyVowel
595 <&> \LexemePron{lexemePronunciation} -> Lexeme chunk lexemePronunciation
596 IPA.Consonant consonant : _ ->
598 IPA.Pulmonic _phonation _place IPA.Approximant ->
600 & Map.lookup PronContextBeforeAnySemiVowel
601 <&> \LexemePron{lexemePronunciation} -> Lexeme chunk lexemePronunciation
602 IPA.Ejective _place IPA.Approximant ->
604 & Map.lookup PronContextBeforeAnySemiVowel
605 <&> \LexemePron{lexemePronunciation} -> Lexeme chunk lexemePronunciation
608 & Map.lookup PronContextBeforeAnyConsonant
609 <&> \LexemePron{lexemePronunciation} -> Lexeme chunk lexemePronunciation
612 & Map.lookup PronContextBeforeAny
613 <&> \LexemePron{lexemePronunciation} -> Lexeme chunk lexemePronunciation
618 lexerDict lexerDictMap =
621 , lexerDictMaxKeyLength =
627 & fromMaybe (error "empty map")
630 lexerPron :: LexerDict -> Text -> Either LexerError [(Text, Text)]
637 PronunciationIPABroad txt (IPA.Syllable [IPA.Zero]) -> [(lexemeKey, txt)]
638 PronunciationIPABroad txt _ipa -> [(lexemeKey, txt)]
641 data FrenchToken = FrenchToken
646 lexerChunks :: LexerDict -> Text -> Either LexerError [FrenchToken]
647 lexerChunks dict inp =
653 PronunciationIPABroad{pronunciationIPA = IPA.Syllable [IPA.Zero]} -> [FrenchToken lexemeKey ""]
654 PronunciationIPABroad{pronunciationText} -> [FrenchToken lexemeKey pronunciationText]
657 groupLexemes :: [Lexeme] -> [[Lexeme]]
659 ls -- & Split.split ( Split.whenElt (\(t, _) -> t == "\n") & Split.dropDelims)
662 group :: [Lexeme] -> [[Lexeme]]
664 -- group (inpHead@("\n", lex) : inpTail) =
665 group (inpHead : inpTail) =
666 if inpHead & lexemeKey & Text.isPrefixOf borderLeftText
668 ( let (seps, rest) = inpTail & List.span \l -> not $ l & lexemeKey & Text.isPrefixOf borderLeftText
669 in (inpHead : seps) : group rest
671 else error "groupLexemes"
673 pronunciation :: LexerDict -> [Either Lexeme Text] -> [[Lexeme]]
674 pronunciation dict ts =
680 & either (error . TextLazy.unpack . pShow) id
681 -- & List.intercalate [("\n", LexemePron{lexemePronunciation=PronunciationSilent, lexemeExample=[]})]
684 | -- lines = txt & Text.split (== '\n')
691 [ (word, word & lexerPron dict)
699 syllablesTable :: SyllablesTable
702 [ "a" := ["ba" := "ba"]
703 , "e" := ["ber" := "bɛʁ", "be" := "bə", "be" := "bɛ"]
704 , "é" := ["bé" := "be"]
709 -- 2|ʁa,li,a,ti,ʁɛ,zə,ɑ̃
710 -- 3|ɑ̃,ni,ʁi,ka,e,ze,ʁɔ̃,ta,te,si,za
711 -- 4|za,to,ma,na,di,mi,kɔ̃,kɔ,o,sjɔ̃,tə,la,pa,ne
712 -- 5|ne,i,fi,se,bi,də,ɔ,ɛ̃,sje,ʁje,me,nə,ʁjɔ̃,mɑ̃,le,tɔ,lɔ,pi,ba,vi,zɑ̃
714 syllablesTableToHTML :: SyllablesTable -> IO HTML.Html
715 syllablesTableToHTML sylInitToSylFinalToPhon = do
716 -- FIXME: this absolute path is not portable out of my system
717 dataPath <- Self.getDataDir <&> File.normalise
719 -- let pageOrientation = Paper.PageOrientationLandscape
720 -- let pageSize = Paper.PageSizeA4
723 HTML.title $ "Syllabes"
726 -- , "SyllableTable.css"
732 ! HA.rel "stylesheet"
733 ! HA.type_ "text/css"
734 ! HA.href (dataPath </> "styles" </> cssFile & HTML.toValue)
735 -- HTML.styleCSS $ HTML.cssPrintPage pageOrientation pageSize
736 let sylInits = sylInitToSylFinalToPhon & Map.keysSet
737 let sylFinals = sylInitToSylFinalToPhon & Map.elems & foldMap Map.keysSet
738 HTML.body ! classes ["A4"] $ do
740 forM_ ([1 :: Int .. 3] & list) \page ->
745 "test" <> show page & HTML.toHtml
749 ! classes ["main-page"]
755 , "page-" <> className pageSize <> "-" <> className pageOrientation
758 [ "grid-template-columns" := (1 & fr & HTML.toCSS) & List.replicate (1 + Set.size sylFinals) & List.unwords
761 forM_ sylFinals \sylFinal -> do
763 sylFinal & HTML.toHtml
764 forM_ sylFinals \sylFinal -> do
765 forM_ (sylInitToSylFinalToPhon & Map.toList) \(sylInit, sylFinalToPhon) -> do
767 sylInit & HTML.toHtml
768 forM_ sylFinals \sylFinal -> do
770 forM_ (sylFinalToPhon & Map.lookup sylFinal & fromMaybe []) \phons -> do
771 phons & show & HTML.toHtml
774 data PronDictKey = PronDictKey Text
776 instance Ord PronDictKey where
777 PronDictKey x `compare` PronDictKey y =
778 compare (Down $ Text.length x) (Down $ Text.length y)
782 ful, pre, suf, inf :: Text -> [Text]
784 [ borderLeftText <> t <> borderRightText
785 , borderLeftText <> t <> "'"
786 , "'" <> t <> borderRightText
787 , borderLeftText <> t <> "-"
788 , "-" <> t <> borderRightText
790 pre t = [borderLeftText <> t, "-" <> t, "'" <> t]
791 suf t = [t <> borderRightText, t <> "-", t <> "'", t <> ",", t <> "."]