1 {-# LANGUAGE StrictData #-}
2 {-# LANGUAGE TemplateHaskell #-}
4 module Wiktionary where
7 import Data.List qualified as List
8 import Data.Text qualified as Text
9 import Worksheets.Utils.IPA (IPAPhons)
10 import Worksheets.Utils.JSON qualified as JSON
11 import Worksheets.Utils.Prelude
12 import Worksheets.Utils.SQL qualified as SQL
15 data Abbreviation = Abbreviation
16 { abbreviation_raw_tags :: Maybe JSON.Value
17 , abbreviation_roman :: Maybe ShortText
18 , abbreviation_sense :: Maybe JSON.Value
19 , abbreviation_sense_index :: Maybe JSON.Value
20 , abbreviation_tags :: Maybe JSON.Value
21 , abbreviation_topics :: Maybe JSON.Value
22 , abbreviation_translation :: Maybe JSON.Value
23 , abbreviation_word :: Maybe ShortText
26 $(deriveJSON JSON.options ''Abbreviation)
28 data Anagram = Anagram
29 { anagram_word :: Maybe ShortText
32 $(deriveJSON JSON.options ''Anagram)
35 { form_form :: Maybe ShortText
36 , form_ipas :: Maybe [IPAPhons]
37 , form_raw_tags :: Maybe JSON.Value
38 , form_sense :: Maybe JSON.Value
39 , form_sense_index :: Maybe JSON.Value
40 , form_source :: Maybe ShortText
41 , form_tags :: Maybe [ShortText]
42 , form_hiragana :: Maybe ShortText
43 , form_roman :: Maybe ShortText
46 $(deriveJSON JSON.options ''Form)
49 { sense_alt_of :: Maybe JSON.Value
50 , sense_categories :: Maybe [ShortText]
51 , sense_examples :: Maybe JSON.Value
52 , sense_form_of :: Maybe JSON.Value
53 , sense_glosses :: Maybe [ShortText]
54 , sense_note :: Maybe JSON.Value
55 , sense_raw_tags :: Maybe JSON.Value
56 , sense_tags :: Maybe JSON.Value
57 , sense_topics :: Maybe JSON.Value
60 $(deriveJSON JSON.options ''Sense)
63 { sound_audio :: Maybe ShortText
64 , sound_enpr :: Maybe IPAPhons
65 , sound_flac_url :: Maybe ShortText
66 , sound_homophone :: Maybe JSON.Value
67 , sound_ipa :: Maybe IPAPhons
68 , sound_mp3_url :: Maybe ShortText
69 , sound_note :: Maybe JSON.Value
70 , sound_oga_url :: Maybe ShortText
71 , sound_ogg_url :: Maybe ShortText
72 , sound_opus_url :: Maybe ShortText
73 , sound_raw_tags :: Maybe [ShortText]
74 , sound_rhymes :: Maybe JSON.Value
75 , sound_roman :: Maybe JSON.Value
76 , sound_other :: Maybe JSON.Value
77 , sound_text :: Maybe JSON.Value
78 , sound_tags :: Maybe JSON.Value
79 , sound_topics :: Maybe JSON.Value
80 , sound_wav_url :: Maybe ShortText
81 , sound_zh_pron :: Maybe JSON.Value -- zh-pron
84 $(deriveJSON JSON.options ''Sound)
86 data Synonym = Synonym
87 { synonym_alt :: Maybe JSON.Value
88 , synonym_raw_tags :: Maybe JSON.Value
89 , synonym_roman :: Maybe JSON.Value
90 , synonym_sense :: Maybe JSON.Value
91 , synonym_sense_index :: Maybe JSON.Value
92 , synonym_tags :: Maybe JSON.Value -- [ShortText]
93 , synonym_topics :: Maybe JSON.Value
94 , synonym_translation :: Maybe JSON.Value
95 , synonym_word :: Maybe ShortText
98 $(deriveJSON JSON.options ''Synonym)
100 -- | Tries to follow the schema at:
101 -- https://kaikki.org/dictionary/errors/mapping/index.html
102 data Wiktionary = Wiktionary
103 { wiktionary_id :: Int -- PRIMARY KEY
104 , wiktionary_word :: Maybe ShortText
105 , wiktionary_lang_code :: Maybe ShortText
106 , wiktionary_lang :: Maybe ShortText
107 , wiktionary_pos :: Maybe ShortText
108 , wiktionary_pos_title :: Maybe ShortText
109 , wiktionary_etymology_texts :: Maybe [ShortText]
110 , wiktionary_senses :: Maybe [Sense]
111 , wiktionary_forms :: Maybe [Form]
112 , wiktionary_sounds :: Maybe [Sound]
113 , wiktionary_translations :: Maybe JSON.Value
114 , wiktionary_synonyms :: Maybe [Synonym]
115 , wiktionary_derived :: Maybe JSON.Value
116 , wiktionary_related :: Maybe JSON.Value
117 , wiktionary_anagrams :: Maybe [Anagram]
118 , wiktionary_categories :: Maybe [ShortText]
119 , wiktionary_tags :: Maybe [ShortText]
120 , wiktionary_raw_tags :: Maybe JSON.Value
121 , wiktionary_meronyms :: Maybe JSON.Value
122 , wiktionary_hyponyms :: Maybe JSON.Value
123 , wiktionary_hypernyms :: Maybe JSON.Value
124 , wiktionary_notes :: Maybe JSON.Value
125 , wiktionary_proverbs :: Maybe JSON.Value
126 , wiktionary_paronyms :: Maybe JSON.Value
127 , wiktionary_antonyms :: Maybe JSON.Value
128 , wiktionary_abbreviation :: Maybe [Abbreviation]
129 , wiktionary_holonyms :: Maybe JSON.Value
130 , wiktionary_etymology_examples :: Maybe JSON.Value
131 , wiktionary_title :: Maybe ShortText
132 , wiktionary_redirect :: Maybe ShortText
133 , wiktionary_troponyms :: Maybe JSON.Value
134 -- ^ Yes, the word can be missing,
135 -- eg. when `wiktionary_pos` is `"hard-redirect"`.
137 deriving (Eq, Show, Generic)
139 $(deriveJSON JSON.options ''Wiktionary)
141 instance SQL.ToRow Wiktionary where
142 toRow Wiktionary{..} =
143 [ SQL.toField wiktionary_id
144 , SQL.toField wiktionary_word
145 , SQL.toField wiktionary_lang_code
146 , SQL.toField wiktionary_lang
147 , SQL.toField wiktionary_pos
148 , SQL.toField wiktionary_pos_title
149 , SQL.toField wiktionary_etymology_texts
150 , SQL.toField wiktionary_senses
151 , SQL.toField wiktionary_forms
152 , SQL.toField wiktionary_sounds
153 , SQL.toField wiktionary_translations
154 , SQL.toField wiktionary_synonyms
155 , SQL.toField wiktionary_derived
156 , SQL.toField wiktionary_related
157 , SQL.toField wiktionary_anagrams
158 , SQL.toField wiktionary_categories
159 , SQL.toField wiktionary_tags
160 , SQL.toField wiktionary_raw_tags
161 , SQL.toField wiktionary_meronyms
162 , SQL.toField wiktionary_hyponyms
163 , SQL.toField wiktionary_hypernyms
164 , SQL.toField wiktionary_notes
165 , SQL.toField wiktionary_proverbs
166 , SQL.toField wiktionary_paronyms
167 , SQL.toField wiktionary_antonyms
168 , SQL.toField wiktionary_abbreviation
169 , SQL.toField wiktionary_holonyms
170 , SQL.toField wiktionary_etymology_examples
171 , SQL.toField wiktionary_title
172 , SQL.toField wiktionary_redirect
173 , SQL.toField wiktionary_troponyms
175 instance SQL.FromRow Wiktionary where
178 <$> SQL.fromFieldWithErrorContext
179 <*> SQL.fromFieldWithErrorContext
180 <*> SQL.fromFieldWithErrorContext
181 <*> SQL.fromFieldWithErrorContext
182 <*> SQL.fromFieldWithErrorContext
183 <*> SQL.fromFieldWithErrorContext
184 <*> SQL.fromFieldWithErrorContext
185 <*> SQL.fromFieldWithErrorContext
186 <*> SQL.fromFieldWithErrorContext
187 <*> SQL.fromFieldWithErrorContext
188 <*> SQL.fromFieldWithErrorContext
189 <*> SQL.fromFieldWithErrorContext
190 <*> SQL.fromFieldWithErrorContext
191 <*> SQL.fromFieldWithErrorContext
192 <*> SQL.fromFieldWithErrorContext
193 <*> SQL.fromFieldWithErrorContext
194 <*> SQL.fromFieldWithErrorContext
195 <*> SQL.fromFieldWithErrorContext
196 <*> SQL.fromFieldWithErrorContext
197 <*> SQL.fromFieldWithErrorContext
198 <*> SQL.fromFieldWithErrorContext
199 <*> SQL.fromFieldWithErrorContext
200 <*> SQL.fromFieldWithErrorContext
201 <*> SQL.fromFieldWithErrorContext
202 <*> SQL.fromFieldWithErrorContext
203 <*> SQL.fromFieldWithErrorContext
204 <*> SQL.fromFieldWithErrorContext
205 <*> SQL.fromFieldWithErrorContext
206 <*> SQL.fromFieldWithErrorContext
207 <*> SQL.fromFieldWithErrorContext
208 <*> SQL.fromFieldWithErrorContext
209 type LangCode = ShortText
211 ngramsOfLength :: Int -> Text -> [Text]
212 ngramsOfLength n txt | Text.length txt < n = []
213 ngramsOfLength n txt =
215 | (start, t) <- txt & Text.tails & List.take (n) & List.zip [0 :: Int ..]
216 , let q = quot (len - start) n
220 len = Text.length txt
221 go q t = case Text.splitAt n t of
224 | otherwise -> a : go (q - 1) b
226 ngramsWithinLengths :: Int -> Int -> Text -> [Text]
227 ngramsWithinLengths low high t =