]> Git — Sourcephile - julm/worksheets.git/blob - src/Wiktionary.hs
update
[julm/worksheets.git] / src / Wiktionary.hs
1 {-# LANGUAGE StrictData #-}
2 {-# LANGUAGE TemplateHaskell #-}
3
4 module Wiktionary where
5
6 import Data.Aeson.TH
7 import Data.List qualified as List
8 import Data.Text qualified as Text
9 import Worksheets.Utils.IPA (IPAPhons)
10 import Worksheets.Utils.JSON qualified as JSON
11 import Worksheets.Utils.Prelude
12 import Worksheets.Utils.SQL qualified as SQL
13 import Prelude (quot)
14
15 data Abbreviation = Abbreviation
16 { abbreviation_raw_tags :: Maybe JSON.Value
17 , abbreviation_roman :: Maybe ShortText
18 , abbreviation_sense :: Maybe JSON.Value
19 , abbreviation_sense_index :: Maybe JSON.Value
20 , abbreviation_tags :: Maybe JSON.Value
21 , abbreviation_topics :: Maybe JSON.Value
22 , abbreviation_translation :: Maybe JSON.Value
23 , abbreviation_word :: Maybe ShortText
24 }
25 deriving (Eq, Show)
26 $(deriveJSON JSON.options ''Abbreviation)
27
28 data Anagram = Anagram
29 { anagram_word :: Maybe ShortText
30 }
31 deriving (Eq, Show)
32 $(deriveJSON JSON.options ''Anagram)
33
34 data Form = Form
35 { form_form :: Maybe ShortText
36 , form_ipas :: Maybe [IPAPhons]
37 , form_raw_tags :: Maybe JSON.Value
38 , form_sense :: Maybe JSON.Value
39 , form_sense_index :: Maybe JSON.Value
40 , form_source :: Maybe ShortText
41 , form_tags :: Maybe [ShortText]
42 , form_hiragana :: Maybe ShortText
43 , form_roman :: Maybe ShortText
44 }
45 deriving (Eq, Show)
46 $(deriveJSON JSON.options ''Form)
47
48 data Sense = Sense
49 { sense_alt_of :: Maybe JSON.Value
50 , sense_categories :: Maybe [ShortText]
51 , sense_examples :: Maybe JSON.Value
52 , sense_form_of :: Maybe JSON.Value
53 , sense_glosses :: Maybe [ShortText]
54 , sense_note :: Maybe JSON.Value
55 , sense_raw_tags :: Maybe JSON.Value
56 , sense_tags :: Maybe JSON.Value
57 , sense_topics :: Maybe JSON.Value
58 }
59 deriving (Eq, Show)
60 $(deriveJSON JSON.options ''Sense)
61
62 data Sound = Sound
63 { sound_audio :: Maybe ShortText
64 , sound_enpr :: Maybe IPAPhons
65 , sound_flac_url :: Maybe ShortText
66 , sound_homophone :: Maybe JSON.Value
67 , sound_ipa :: Maybe IPAPhons
68 , sound_mp3_url :: Maybe ShortText
69 , sound_note :: Maybe JSON.Value
70 , sound_oga_url :: Maybe ShortText
71 , sound_ogg_url :: Maybe ShortText
72 , sound_opus_url :: Maybe ShortText
73 , sound_raw_tags :: Maybe [ShortText]
74 , sound_rhymes :: Maybe JSON.Value
75 , sound_roman :: Maybe JSON.Value
76 , sound_other :: Maybe JSON.Value
77 , sound_text :: Maybe JSON.Value
78 , sound_tags :: Maybe JSON.Value
79 , sound_topics :: Maybe JSON.Value
80 , sound_wav_url :: Maybe ShortText
81 , sound_zh_pron :: Maybe JSON.Value -- zh-pron
82 }
83 deriving (Eq, Show)
84 $(deriveJSON JSON.options ''Sound)
85
86 data Synonym = Synonym
87 { synonym_alt :: Maybe JSON.Value
88 , synonym_raw_tags :: Maybe JSON.Value
89 , synonym_roman :: Maybe JSON.Value
90 , synonym_sense :: Maybe JSON.Value
91 , synonym_sense_index :: Maybe JSON.Value
92 , synonym_tags :: Maybe JSON.Value -- [ShortText]
93 , synonym_topics :: Maybe JSON.Value
94 , synonym_translation :: Maybe JSON.Value
95 , synonym_word :: Maybe ShortText
96 }
97 deriving (Eq, Show)
98 $(deriveJSON JSON.options ''Synonym)
99
100 -- | Tries to follow the schema at:
101 -- https://kaikki.org/dictionary/errors/mapping/index.html
102 data Wiktionary = Wiktionary
103 { wiktionary_id :: Int -- PRIMARY KEY
104 , wiktionary_word :: Maybe ShortText
105 , wiktionary_lang_code :: Maybe ShortText
106 , wiktionary_lang :: Maybe ShortText
107 , wiktionary_pos :: Maybe ShortText
108 , wiktionary_pos_title :: Maybe ShortText
109 , wiktionary_etymology_texts :: Maybe [ShortText]
110 , wiktionary_senses :: Maybe [Sense]
111 , wiktionary_forms :: Maybe [Form]
112 , wiktionary_sounds :: Maybe [Sound]
113 , wiktionary_translations :: Maybe JSON.Value
114 , wiktionary_synonyms :: Maybe [Synonym]
115 , wiktionary_derived :: Maybe JSON.Value
116 , wiktionary_related :: Maybe JSON.Value
117 , wiktionary_anagrams :: Maybe [Anagram]
118 , wiktionary_categories :: Maybe [ShortText]
119 , wiktionary_tags :: Maybe [ShortText]
120 , wiktionary_raw_tags :: Maybe JSON.Value
121 , wiktionary_meronyms :: Maybe JSON.Value
122 , wiktionary_hyponyms :: Maybe JSON.Value
123 , wiktionary_hypernyms :: Maybe JSON.Value
124 , wiktionary_notes :: Maybe JSON.Value
125 , wiktionary_proverbs :: Maybe JSON.Value
126 , wiktionary_paronyms :: Maybe JSON.Value
127 , wiktionary_antonyms :: Maybe JSON.Value
128 , wiktionary_abbreviation :: Maybe [Abbreviation]
129 , wiktionary_holonyms :: Maybe JSON.Value
130 , wiktionary_etymology_examples :: Maybe JSON.Value
131 , wiktionary_title :: Maybe ShortText
132 , wiktionary_redirect :: Maybe ShortText
133 , wiktionary_troponyms :: Maybe JSON.Value
134 -- ^ Yes, the word can be missing,
135 -- eg. when `wiktionary_pos` is `"hard-redirect"`.
136 }
137 deriving (Eq, Show, Generic)
138
139 $(deriveJSON JSON.options ''Wiktionary)
140
141 instance SQL.ToRow Wiktionary where
142 toRow Wiktionary{..} =
143 [ SQL.toField wiktionary_id
144 , SQL.toField wiktionary_word
145 , SQL.toField wiktionary_lang_code
146 , SQL.toField wiktionary_lang
147 , SQL.toField wiktionary_pos
148 , SQL.toField wiktionary_pos_title
149 , SQL.toField wiktionary_etymology_texts
150 , SQL.toField wiktionary_senses
151 , SQL.toField wiktionary_forms
152 , SQL.toField wiktionary_sounds
153 , SQL.toField wiktionary_translations
154 , SQL.toField wiktionary_synonyms
155 , SQL.toField wiktionary_derived
156 , SQL.toField wiktionary_related
157 , SQL.toField wiktionary_anagrams
158 , SQL.toField wiktionary_categories
159 , SQL.toField wiktionary_tags
160 , SQL.toField wiktionary_raw_tags
161 , SQL.toField wiktionary_meronyms
162 , SQL.toField wiktionary_hyponyms
163 , SQL.toField wiktionary_hypernyms
164 , SQL.toField wiktionary_notes
165 , SQL.toField wiktionary_proverbs
166 , SQL.toField wiktionary_paronyms
167 , SQL.toField wiktionary_antonyms
168 , SQL.toField wiktionary_abbreviation
169 , SQL.toField wiktionary_holonyms
170 , SQL.toField wiktionary_etymology_examples
171 , SQL.toField wiktionary_title
172 , SQL.toField wiktionary_redirect
173 , SQL.toField wiktionary_troponyms
174 ]
175 instance SQL.FromRow Wiktionary where
176 fromRow =
177 Wiktionary
178 <$> SQL.fromFieldWithErrorContext
179 <*> SQL.fromFieldWithErrorContext
180 <*> SQL.fromFieldWithErrorContext
181 <*> SQL.fromFieldWithErrorContext
182 <*> SQL.fromFieldWithErrorContext
183 <*> SQL.fromFieldWithErrorContext
184 <*> SQL.fromFieldWithErrorContext
185 <*> SQL.fromFieldWithErrorContext
186 <*> SQL.fromFieldWithErrorContext
187 <*> SQL.fromFieldWithErrorContext
188 <*> SQL.fromFieldWithErrorContext
189 <*> SQL.fromFieldWithErrorContext
190 <*> SQL.fromFieldWithErrorContext
191 <*> SQL.fromFieldWithErrorContext
192 <*> SQL.fromFieldWithErrorContext
193 <*> SQL.fromFieldWithErrorContext
194 <*> SQL.fromFieldWithErrorContext
195 <*> SQL.fromFieldWithErrorContext
196 <*> SQL.fromFieldWithErrorContext
197 <*> SQL.fromFieldWithErrorContext
198 <*> SQL.fromFieldWithErrorContext
199 <*> SQL.fromFieldWithErrorContext
200 <*> SQL.fromFieldWithErrorContext
201 <*> SQL.fromFieldWithErrorContext
202 <*> SQL.fromFieldWithErrorContext
203 <*> SQL.fromFieldWithErrorContext
204 <*> SQL.fromFieldWithErrorContext
205 <*> SQL.fromFieldWithErrorContext
206 <*> SQL.fromFieldWithErrorContext
207 <*> SQL.fromFieldWithErrorContext
208 <*> SQL.fromFieldWithErrorContext
209 type LangCode = ShortText
210
211 ngramsOfLength :: Int -> Text -> [Text]
212 ngramsOfLength n txt | Text.length txt < n = []
213 ngramsOfLength n txt =
214 [ go q t
215 | (start, t) <- txt & Text.tails & List.take (n) & List.zip [0 :: Int ..]
216 , let q = quot (len - start) n
217 ]
218 & mconcat
219 where
220 len = Text.length txt
221 go q t = case Text.splitAt n t of
222 (a, b)
223 | q <= 0 -> []
224 | otherwise -> a : go (q - 1) b
225
226 ngramsWithinLengths :: Int -> Int -> Text -> [Text]
227 ngramsWithinLengths low high t =
228 [ ngramsOfLength n t
229 | n <- [low .. high]
230 ]
231 & mconcat