{-# LANGUAGE StrictData #-}
{-# LANGUAGE TemplateHaskell #-}

-- | Record types for Wiktionary dump entries, together with their JSON
-- and SQL row instances, plus small n-gram helpers over 'Text'.
--
-- NOTE(review): how record field names map to JSON keys is decided by
-- @JSON.options@, which is defined in "Worksheets.Utils.JSON" and is not
-- visible from this module.
module Wiktionary where

import Data.Aeson.TH
import Data.List qualified as List
import Data.Text qualified as Text
import Worksheets.Utils.IPA (IPAPhons)
import Worksheets.Utils.JSON qualified as JSON
import Worksheets.Utils.Prelude
import Worksheets.Utils.SQL qualified as SQL
import Prelude (quot)

-- | Element type of 'wiktionary_abbreviation'.
--
-- Every field is optional; fields typed 'JSON.Value' are kept as
-- undecoded JSON.
data Abbreviation = Abbreviation
  { abbreviation_raw_tags :: Maybe JSON.Value
  , abbreviation_roman :: Maybe ShortText
  , abbreviation_sense :: Maybe JSON.Value
  , abbreviation_sense_index :: Maybe JSON.Value
  , abbreviation_tags :: Maybe JSON.Value
  , abbreviation_topics :: Maybe JSON.Value
  , abbreviation_translation :: Maybe JSON.Value
  , abbreviation_word :: Maybe ShortText
  }
  deriving (Eq, Show)

$(deriveJSON JSON.options ''Abbreviation)

-- | Element type of 'wiktionary_anagrams'.
data Anagram = Anagram
  { anagram_word :: Maybe ShortText
  }
  deriving (Eq, Show)

$(deriveJSON JSON.options ''Anagram)

-- | Element type of 'wiktionary_forms'.
--
-- Every field is optional; fields typed 'JSON.Value' are kept as
-- undecoded JSON.
data Form = Form
  { form_form :: Maybe ShortText
  , form_ipas :: Maybe [IPAPhons]
  , form_raw_tags :: Maybe JSON.Value
  , form_sense :: Maybe JSON.Value
  , form_sense_index :: Maybe JSON.Value
  , form_source :: Maybe ShortText
  , form_tags :: Maybe [ShortText]
  , form_hiragana :: Maybe ShortText
  , form_roman :: Maybe ShortText
  }
  deriving (Eq, Show)

$(deriveJSON JSON.options ''Form)

-- | Element type of 'wiktionary_senses'.
--
-- Every field is optional; fields typed 'JSON.Value' are kept as
-- undecoded JSON.
data Sense = Sense
  { sense_alt_of :: Maybe JSON.Value
  , sense_categories :: Maybe [ShortText]
  , sense_examples :: Maybe JSON.Value
  , sense_form_of :: Maybe JSON.Value
  , sense_glosses :: Maybe [ShortText]
  , sense_note :: Maybe JSON.Value
  , sense_raw_tags :: Maybe JSON.Value
  , sense_tags :: Maybe JSON.Value
  , sense_topics :: Maybe JSON.Value
  }
  deriving (Eq, Show)

$(deriveJSON JSON.options ''Sense)

-- | Element type of 'wiktionary_sounds'.
--
-- Every field is optional; fields typed 'JSON.Value' are kept as
-- undecoded JSON.
data Sound = Sound
  { sound_audio :: Maybe ShortText
  , sound_enpr :: Maybe IPAPhons
  , sound_flac_url :: Maybe ShortText
  , sound_homophone :: Maybe JSON.Value
  , sound_ipa :: Maybe IPAPhons
  , sound_mp3_url :: Maybe ShortText
  , sound_note :: Maybe JSON.Value
  , sound_oga_url :: Maybe ShortText
  , sound_ogg_url :: Maybe ShortText
  , sound_opus_url :: Maybe ShortText
  , sound_raw_tags :: Maybe [ShortText]
  , sound_rhymes :: Maybe JSON.Value
  , sound_roman :: Maybe JSON.Value
  , sound_other :: Maybe JSON.Value
  , sound_text :: Maybe JSON.Value
  , sound_tags :: Maybe JSON.Value
  , sound_topics :: Maybe JSON.Value
  , sound_wav_url :: Maybe ShortText
  , sound_zh_pron :: Maybe JSON.Value -- zh-pron
  -- NOTE(review): "zh-pron" is presumably the key used in the source
  -- JSON; confirm against JSON.options.
  }
  deriving (Eq, Show)

$(deriveJSON JSON.options ''Sound)

-- | Element type of 'wiktionary_synonyms'.
--
-- Every field is optional; fields typed 'JSON.Value' are kept as
-- undecoded JSON.
data Synonym = Synonym
  { synonym_alt :: Maybe JSON.Value
  , synonym_raw_tags :: Maybe JSON.Value
  , synonym_roman :: Maybe JSON.Value
  , synonym_sense :: Maybe JSON.Value
  , synonym_sense_index :: Maybe JSON.Value
  , synonym_tags :: Maybe JSON.Value -- [ShortText]
  , synonym_topics :: Maybe JSON.Value
  , synonym_translation :: Maybe JSON.Value
  , synonym_word :: Maybe ShortText
  }
  deriving (Eq, Show)

$(deriveJSON JSON.options ''Synonym)

-- | Tries to follow the schema at:
-- https://kaikki.org/dictionary/errors/mapping/index.html
--
-- The field order below is significant: the 'SQL.ToRow' and
-- 'SQL.FromRow' instances in this module write and read the columns
-- positionally in exactly this order.
data Wiktionary = Wiktionary
  { wiktionary_id :: Int -- PRIMARY KEY
  , wiktionary_word :: Maybe ShortText
  -- ^ Yes, the word can be missing,
  -- eg. when `wiktionary_pos` is `"hard-redirect"`.
  , wiktionary_lang_code :: Maybe ShortText
  , wiktionary_lang :: Maybe ShortText
  , wiktionary_pos :: Maybe ShortText
  , wiktionary_pos_title :: Maybe ShortText
  , wiktionary_etymology_texts :: Maybe [ShortText]
  , wiktionary_senses :: Maybe [Sense]
  , wiktionary_forms :: Maybe [Form]
  , wiktionary_sounds :: Maybe [Sound]
  , wiktionary_translations :: Maybe JSON.Value
  , wiktionary_synonyms :: Maybe [Synonym]
  , wiktionary_derived :: Maybe JSON.Value
  , wiktionary_related :: Maybe JSON.Value
  , wiktionary_anagrams :: Maybe [Anagram]
  , wiktionary_categories :: Maybe [ShortText]
  , wiktionary_tags :: Maybe [ShortText]
  , wiktionary_raw_tags :: Maybe JSON.Value
  , wiktionary_meronyms :: Maybe JSON.Value
  , wiktionary_hyponyms :: Maybe JSON.Value
  , wiktionary_hypernyms :: Maybe JSON.Value
  , wiktionary_notes :: Maybe JSON.Value
  , wiktionary_proverbs :: Maybe JSON.Value
  , wiktionary_paronyms :: Maybe JSON.Value
  , wiktionary_antonyms :: Maybe JSON.Value
  , wiktionary_abbreviation :: Maybe [Abbreviation]
  , wiktionary_holonyms :: Maybe JSON.Value
  , wiktionary_etymology_examples :: Maybe JSON.Value
  , wiktionary_title :: Maybe ShortText
  , wiktionary_redirect :: Maybe ShortText
  , wiktionary_troponyms :: Maybe JSON.Value
  }
  deriving (Eq, Show, Generic)

$(deriveJSON JSON.options ''Wiktionary)

-- | Emits one SQL field per record field, in declaration order.
-- Keep this list in sync with the record and with 'SQL.FromRow' below.
instance SQL.ToRow Wiktionary where
  toRow Wiktionary{..} =
    [ SQL.toField wiktionary_id
    , SQL.toField wiktionary_word
    , SQL.toField wiktionary_lang_code
    , SQL.toField wiktionary_lang
    , SQL.toField wiktionary_pos
    , SQL.toField wiktionary_pos_title
    , SQL.toField wiktionary_etymology_texts
    , SQL.toField wiktionary_senses
    , SQL.toField wiktionary_forms
    , SQL.toField wiktionary_sounds
    , SQL.toField wiktionary_translations
    , SQL.toField wiktionary_synonyms
    , SQL.toField wiktionary_derived
    , SQL.toField wiktionary_related
    , SQL.toField wiktionary_anagrams
    , SQL.toField wiktionary_categories
    , SQL.toField wiktionary_tags
    , SQL.toField wiktionary_raw_tags
    , SQL.toField wiktionary_meronyms
    , SQL.toField wiktionary_hyponyms
    , SQL.toField wiktionary_hypernyms
    , SQL.toField wiktionary_notes
    , SQL.toField wiktionary_proverbs
    , SQL.toField wiktionary_paronyms
    , SQL.toField wiktionary_antonyms
    , SQL.toField wiktionary_abbreviation
    , SQL.toField wiktionary_holonyms
    , SQL.toField wiktionary_etymology_examples
    , SQL.toField wiktionary_title
    , SQL.toField wiktionary_redirect
    , SQL.toField wiktionary_troponyms
    ]

-- | Rebuilds a 'Wiktionary' by applying the constructor positionally:
-- the k-th parser below fills the k-th record field. The trailing
-- comments name the field each parser feeds so that any drift from the
-- record declaration or from 'SQL.ToRow' is easy to spot.
instance SQL.FromRow Wiktionary where
  fromRow =
    Wiktionary
      <$> SQL.fromFieldWithErrorContext -- wiktionary_id
      <*> SQL.fromFieldWithErrorContext -- wiktionary_word
      <*> SQL.fromFieldWithErrorContext -- wiktionary_lang_code
      <*> SQL.fromFieldWithErrorContext -- wiktionary_lang
      <*> SQL.fromFieldWithErrorContext -- wiktionary_pos
      <*> SQL.fromFieldWithErrorContext -- wiktionary_pos_title
      <*> SQL.fromFieldWithErrorContext -- wiktionary_etymology_texts
      <*> SQL.fromFieldWithErrorContext -- wiktionary_senses
      <*> SQL.fromFieldWithErrorContext -- wiktionary_forms
      <*> SQL.fromFieldWithErrorContext -- wiktionary_sounds
      <*> SQL.fromFieldWithErrorContext -- wiktionary_translations
      <*> SQL.fromFieldWithErrorContext -- wiktionary_synonyms
      <*> SQL.fromFieldWithErrorContext -- wiktionary_derived
      <*> SQL.fromFieldWithErrorContext -- wiktionary_related
      <*> SQL.fromFieldWithErrorContext -- wiktionary_anagrams
      <*> SQL.fromFieldWithErrorContext -- wiktionary_categories
      <*> SQL.fromFieldWithErrorContext -- wiktionary_tags
      <*> SQL.fromFieldWithErrorContext -- wiktionary_raw_tags
      <*> SQL.fromFieldWithErrorContext -- wiktionary_meronyms
      <*> SQL.fromFieldWithErrorContext -- wiktionary_hyponyms
      <*> SQL.fromFieldWithErrorContext -- wiktionary_hypernyms
      <*> SQL.fromFieldWithErrorContext -- wiktionary_notes
      <*> SQL.fromFieldWithErrorContext -- wiktionary_proverbs
      <*> SQL.fromFieldWithErrorContext -- wiktionary_paronyms
      <*> SQL.fromFieldWithErrorContext -- wiktionary_antonyms
      <*> SQL.fromFieldWithErrorContext -- wiktionary_abbreviation
      <*> SQL.fromFieldWithErrorContext -- wiktionary_holonyms
      <*> SQL.fromFieldWithErrorContext -- wiktionary_etymology_examples
      <*> SQL.fromFieldWithErrorContext -- wiktionary_title
      <*> SQL.fromFieldWithErrorContext -- wiktionary_redirect
      <*> SQL.fromFieldWithErrorContext -- wiktionary_troponyms

-- | Alias for a language code stored as 'ShortText'.
type LangCode = ShortText

-- | Every contiguous run of exactly @n@ 'Char's of @txt@.
--
-- The result is empty when @txt@ is shorter than @n@, and also when
-- @n <= 0@ (no start offset is enumerated in that case, so the division
-- by @n@ is never evaluated).
--
-- Results are grouped by start offset modulo @n@: first the n-grams
-- starting at offsets @0, n, 2n, …@, then those starting at
-- @1, n+1, …@, and so on. For example, with @n = 2@ the text @abcd@
-- yields @ab@, @cd@, @bc@ in that order.
ngramsOfLength :: Int -> Text -> [Text]
ngramsOfLength n txt
  | len < n = []
  | otherwise =
      mconcat
        [ -- 'Text.chunksOf' may end with a short chunk; keeping only the
          -- first @fullChunks@ pieces drops it.
          List.take fullChunks (Text.chunksOf n suffix)
        | (offset, suffix) <- List.zip [0 :: Int ..] (List.take n (Text.tails txt))
        , let fullChunks = quot (len - offset) n
        ]
  where
    len = Text.length txt

-- | Concatenation of @'ngramsOfLength' n t@ for every @n@ from @low@ to
-- @high@ inclusive, shortest length first. Empty when @low > high@.
ngramsWithinLengths :: Int -> Int -> Text -> [Text]
ngramsWithinLengths low high t =
  mconcat [ngramsOfLength n t | n <- [low .. high]]