1 {-# LANGUAGE AllowAmbiguousTypes #-}
2 {-# LANGUAGE OverloadedLists #-}
3 {-# LANGUAGE ParallelListComp #-}
5 module WiktionarySpec where
7 import Data.GenValidity.Map ()
8 import Data.GenValidity.Sequence ()
9 import Data.GenValidity.Set ()
10 import Data.GenValidity.Text ()
11 import Data.List qualified as List
12 import Data.Map.Strict qualified as Map
13 import Data.Set qualified as Set
14 import Data.Text qualified as Text
15 import Data.Text.Short qualified as ShortText
16 import Data.Validity.Map ()
17 import Data.Validity.Set ()
18 import Data.Validity.Text ()
19 import Language.IPA qualified as IPA
20 import System.Directory qualified as IO
21 import System.FilePath (joinPath, pathSeparator, (<.>), (</>))
22 import System.IO qualified as IO
25 import Data.RadixTree.Word8.Strict qualified as RadixTree
27 import Wiktionary qualified
28 import Worksheets.Utils.IPA qualified as IPA
29 import Worksheets.Utils.JSON qualified as JSON
30 import Worksheets.Utils.Prelude
31 import Worksheets.Utils.SQL qualified as SQL
35 wiktionaryQueryWord :: LangCode -> ShortText -> IO [Wiktionary]
36 wiktionaryQueryWord lang_code word =
37 SQL.withConnection "data/langs/français/kaikki/wiktionary=fr.lang=fr.sqlite3" \conn ->
38 SQL.query conn ("SELECT " <> SQL.selectors @Wiktionary <> " FROM wiktionary WHERE word = ? AND lang_code = ?") (word, lang_code)
40 wiktionaryQueryAll :: IO [Wiktionary]
45 testAll = wiktionaryQueryAll <&> List.length
47 testError0 :: IO [Wiktionary]
49 SQL.withConnection "data/langs/français/kaikki/wiktionary=fr.lang=fr.sqlite3" \conn ->
50 SQL.query conn ("SELECT " <> SQL.selectors @Wiktionary <> " FROM wiktionary WHERE redirect = ?") (SQL.Only ("quelqu\8217un"::Text))
53 -- | Join all `IPA.Segment` containing a `IPA.Linking` feature
54 -- with any following `IPA.Segment`.
55 -- And drop all other `IPA.SuprasegmentalFeature`.
56 mangleSupraSegmentalFeatures :: [IPA.Syllable []] -> [[IPA.Segment]]
57 mangleSupraSegmentalFeatures = \case
59 IPA.Syllable syl : ts -> syl : mangleSupraSegmentalFeatures ts
60 IPA.WithSuprasegmentalFeature IPA.Linking syl : ts ->
61 case mangleSupraSegmentalFeatures ts of
62 [] -> [dropSupraSegmentalFeatures syl]
63 x : xs -> (dropSupraSegmentalFeatures syl <> x) : xs
64 IPA.WithSuprasegmentalFeature _feat syl : xs ->
65 mangleSupraSegmentalFeatures (syl : xs)
69 -- A frequent itemset is closed,
70 -- when no (immediate) superset has the same support.
72 supports :: Set Text -> Map Text (Sum Int)
74 [ (prefix, Set.size words & Sum)
75 | (prefix, words) <- prefixToWords & Map.toList
77 & Map.fromListWith (<>)
79 len = corpus & toList <&> Text.length & maximum
81 [ (prefix, Set.singleton word)
82 | word <- corpus & toList
83 , prefixLen <- [1 .. len]
84 , let prefix = word & Text.take prefixLen
86 & Map.fromListWith (<>)
89 closedForms (prefix, "") corpus = [(prefix, corpus & Set.size)]
90 closedForms (prefix, suffix) corpus =
91 let newPrefix = prefix <> ShortText.take 1 suffix in
92 let newSuffix = suffix & ShortText.drop 1 in
93 let corpusSize = corpus & Set.size in
94 let corpusWithLongerPrefix =
96 & Set.filter (\t -> t & ShortText.isPrefixOf newPrefix) in
97 if corpusSize == Set.size corpusWithLongerPrefix
98 then closedForms corpus (newPrefix, newSuffix)
101 : closedForms corpusWithLongerPrefix (newPrefix, newSuffix)
103 dropSupraSegmentalFeatures = \case
104 IPA.Syllable syl -> syl
105 IPA.WithSuprasegmentalFeature _feat syl -> dropSupraSegmentalFeatures syl
107 lcp :: [Text] -> Text
109 -- use foldr1 to apply to the non-empty strings list given
110 lcp lines = List.foldr1 mapCommonPrefix lines
112 mapCommonPrefix {-new-} :: Text {-acc-} -> Text -> Text
113 mapCommonPrefix len1 len2 =
116 List.takeWhile (uncurry (==)) $
122 describe "Wiktionary" do
126 around (\k -> SQL.withConnection "data/langs/français/kaikki/wiktionary=fr.lang=fr.sqlite3" k) do
127 -- runWiktionaryQuery @(SQL.Only [Wiktionary.Sound])
129 -- "SELECT sounds FROM wiktionary WHERE word=?"
130 -- (SQL.Only ("exercice" :: Text))
133 -- forM (["e", "ex", "exercice"] & list) \word -> do
134 -- outPath <- goldenPath $ word
135 -- it word \conn -> do
136 -- res <- testIPAs conn (word & ShortText.pack)
137 -- return $ goldenPrettyShowInstance outPath res
138 let ipasDB = "data/langs/français/kaikki/wiktionary=fr.lang=fr.ipas.sqlite3"
139 ipasDBExists <- IO.doesFileExist ipasDB & liftIO
142 SQL.withConnection ipasDB \ipasConn ->
143 k (wiktConn, ipasConn)
147 unless (ipasDBExists) do
148 it "can-create-ipas" \(wiktConn, ipasConn) -> do
149 SQL.execute_ ipasConn $ "PRAGMA journal_mode = OFF"
150 SQL.execute_ ipasConn $ "PRAGMA synchronous = OFF"
151 -- SQL.execute_ ipasConn $ "PRAGMA mmap_size = " <> fromString (show (256 * 1024 * 1024 :: Int))
152 -- SQL.execute_ ipasConn $ "PRAGMA page_size = " <> fromString (show (512 * 8 * 2 :: Int))
153 -- SQL.execute_ ipasConn $ "VACUUM"
154 SQL.execute_ ipasConn $ "CREATE TABLE IF NOT EXISTS broad_to_words (broad TEXT NON NULL, broads TEXT NON NULL, word TEXT NON NULL, wordID INTEGER NON NULL, UNIQUE (broad, word))"
155 -- SQL.execute_ ipasConn $ "CREATE TABLE IF NOT EXISTS ipas_broad (wordID INTEGER NON NULL, broad1 TEXT NON NULL, word TEXT NON NULL, broad TEXT NON NULL, UNIQUE(broad, word), UNIQUE(broad1, word))"
156 -- SQL.execute_ ipasConn $ "CREATE TABLE IF NOT EXISTS ipas_narrow (wordID INTEGER NON NULL, narrow1 TEXT NON NULL, word TEXT NON NULL, narrow TEXT NON NULL, UNIQUE(narrow, word), UNIQUE(narrow1, word))"
157 -- SQL.execute_ ipasConn $ "CREATE TABLE IF NOT EXISTS ipas_error (wordID INTEGER NON NULL, error TEXT NON NULL, word TEXT NON NULL)"
158 SQL.execute_ ipasConn "CREATE INDEX broad_to_words__broad ON broad_to_words (broad);"
159 SQL.execute_ ipasConn "CREATE INDEX broad_to_words__broad_word ON broad_to_words (broad, word);"
160 -- SQL.execute_ ipasConn "CREATE INDEX ipas_broad1 ON ipas_broad (broad1);"
161 -- SQL.execute_ ipasConn "CREATE INDEX ipas_narrow1 ON ipas_narrow (narrow1);"
162 -- SQL.execute_ ipasConn "DELETE FROM ipas"
165 "SELECT id,word,sounds FROM wiktionary WHERE lang_code='fr'"
171 , (sounds :: [Wiktionary.Sound])
174 forM_ sounds \Wiktionary.Sound{..} -> do
175 -- IO.hPrint IO.stderr (i, ident::Int, maybeWord)
176 case (maybeWord, sound_ipa) of
177 (Just word, Just (IPA.IPAPhons ipas)) -> do
179 IPA.IPAPhonemic ipaSyllables -> do
180 let dropTonicAccent = ShortText.toText >>> Text.replace "'" ""
181 when (i `mod` 5000 == 0) do
186 , show (List.length ipaSyllables)
187 , ipaSyllables & IPA.ipaWordsToText (Just IPA.Phonemic)
192 "INSERT OR IGNORE INTO broad_to_words (broad, broads, word, wordID) VALUES (?,?,?,?)"
193 [ ( ipaSyl & dropSupraSegmentalFeatures & IPA.Syllable & IPA.toIPA_ & IPA.unIPA
194 , ipaSyllables & IPA.ipaWordsToText Nothing
195 , (word :: ShortText)
198 | ipaWord :: NonEmpty (IPA.Syllable []) <- ipaSyllables & toList -- & foldMap (fmap IPA.toIPA_ >>> toList)
199 , ipaSyl :: IPA.Syllable [] <- ipaWord & toList
203 -- "INSERT OR IGNORE INTO ipas_broad (wordID, broad1, word, broad) VALUES (?,?,?,?)"
205 -- , ipaSyllables & nonEmptyHead <&> IPA.toIPA_ & nonEmptyHead & IPA.unIPA
206 -- , (word :: ShortText)
207 -- , ipaSyllables & Wiktionary.ipaWordsToText IPA.Phonemic
209 IPA.IPAPhonError (t, _err) -> do
213 -- "INSERT INTO ipas_error (word, error) VALUES (?,?)"
214 -- ( (word :: ShortText)
221 let ngramsDB = "data/langs/français/kaikki/wiktionary=fr.lang=fr.ngrams.sqlite3"
222 ngramsDBExists <- IO.doesFileExist ngramsDB & liftIO
225 SQL.withConnection ngramsDB \ngramsConn ->
226 k (wiktConn, ngramsConn)
230 unless (ngramsDBExists) do
231 it "can-compute-ngrams" \(wiktConn, ngramsConn) -> do
232 SQL.execute_ ngramsConn $ "PRAGMA journal_mode = OFF"
233 SQL.execute_ ngramsConn $ "PRAGMA synchronous = OFF"
234 SQL.execute_ ngramsConn $ "CREATE TABLE IF NOT EXISTS broad_to_ngrams (broad TEXT NON NULL, ngram TEXT NON NULL, count INT DEFAULT 1, UNIQUE (broad, ngram))"
235 SQL.execute_ ngramsConn "CREATE INDEX broad_to_ngrams__broad ON broad_to_ngrams (broad);"
236 SQL.execute_ ngramsConn "CREATE INDEX broad_to_ngrams__ngram ON broad_to_ngrams (ngram);"
237 -- SQL.execute_ ngramsConn "CREATE INDEX broad_to_ngrams__broad_ngram ON broad_to_ngrams (broad, ngram);"
238 SQL.execute_ ngramsConn $ "CREATE TABLE IF NOT EXISTS errors (wordID INTEGER NON NULL, word TEXT NON NULL, error TEXT NON NULL)"
241 "SELECT id,word,sounds FROM wiktionary WHERE lang_code='fr'"
247 , (sounds :: [Wiktionary.Sound])
249 forM_ sounds \Wiktionary.Sound{..} -> do
250 -- IO.hPrint IO.stderr (i, ident::Int, maybeWord)
251 case (maybeWord, sound_ipa) of
252 (Just word, Just (IPA.IPAPhons ipas)) -> do
254 IPA.IPAPhonemic ipaSyllables -> do
255 let ngrams = word & Wiktionary.ngramsWithinLengths 1 6
256 when (i `mod` 5000 == 0) do
257 IO.hPrint IO.stderr (i, word, ngrams, ipaSyllables)
260 "INSERT INTO broad_to_ngrams(broad,ngram) VALUES(?,?) ON CONFLICT(broad,ngram) DO UPDATE SET count=broad_to_ngrams.count+1;"
262 , ngram & Text.toLower
264 | wordIPA :: NonEmpty (IPA.Syllable []) <- ipaSyllables & toList
265 , syllableIPA :: IPA.Syllable [] <- wordIPA & toList
266 , let broad = syllableIPA & dropSupraSegmentalFeatures & IPA.Syllable & IPA.toIPA_ & IPA.unIPA
269 IPA.IPAPhonError (errMsg, _err) -> do
272 "INSERT INTO errors (wordID, word, error) VALUES (?,?,?)"
281 let b2eDB = "data/langs/français/kaikki/wiktionary=fr.lang=fr.broads_to_expr.sqlite3"
282 b2eDBExists <- IO.doesFileExist b2eDB & liftIO
283 unless b2eDBExists do
284 aroundWith (\k wiktConn -> SQL.withConnection b2eDB \b2eConn -> k (wiktConn, b2eConn)) do
285 it "can-compute-broads_to_expr" do
290 around (\k -> SQL.withConnection "data/langs/english/kaikki/wiktionary=en.lang=en.sqlite3" k) do
292 let b2eDB = "data/langs/english/kaikki/wiktionary=en.lang=en.broads_to_expr.sqlite3"
293 -- IO.removeFile b2eDB & liftIO
294 b2eDBExists <- IO.doesFileExist b2eDB & liftIO
295 unless b2eDBExists do
296 aroundWith (\k wiktConn -> SQL.withConnection b2eDB \b2eConn -> k (wiktConn, b2eConn)) do
297 it "can-compute-broads_to_expr" $
300 broadsToExprSQL :: Text -> (SQL.Connection, SQL.Connection) -> _
301 broadsToExprSQL lang_code (wiktConn, b2eConn) = do
302 SQL.execute_ b2eConn $ "PRAGMA journal_mode = OFF"
303 SQL.execute_ b2eConn $ "PRAGMA synchronous = OFF"
304 SQL.execute_ b2eConn $ "CREATE TABLE IF NOT EXISTS broads (exprLit TEXT NON NULL, exprBroad TEXT NON NULL, sylBroad TEXT NON NULL, wordPos INTEGER NON NULL, sylPos INTEGER NON NULL, wordEnd INTEGER NON NULL, sylEnd INTEGER NON NULL)"
305 SQL.execute_ b2eConn $ "CREATE INDEX broads__broad ON broads (sylBroad);"
306 SQL.execute_ b2eConn $ "CREATE INDEX broads__sylPos ON broads (sylPos);"
307 SQL.execute_ b2eConn $ "CREATE INDEX broads__exprLit ON broads (exprLit);"
308 SQL.execute_ b2eConn $ "CREATE INDEX broads__broad_and_exprLit ON broads (sylBroad, exprLit);"
309 SQL.execute_ b2eConn $ "CREATE TABLE IF NOT EXISTS errors (exprId INTEGER NON NULL, exprLit TEXT NON NULL, error TEXT NON NULL)"
312 "SELECT id,word,sounds FROM wiktionary WHERE lang_code=?"
317 , maybeWord :: Maybe Text
318 , sounds :: [Wiktionary.Sound]
320 forM_ (sounds) \snd@Wiktionary.Sound{..} -> do
321 -- pHPrint IO.stderr (i, exprId::Int, maybeWord, snd)
323 | lang_code == "en" = sound_ipa <> sound_enpr
324 | otherwise = sound_ipa
325 case (maybeWord, ipas) of
326 (Just exprLit, Just (IPA.IPAPhons exprIPAs)) -> do
327 when (i `mod` 5000 == 0) do
328 IO.hPrint IO.stderr (i, exprLit, exprIPAs)
330 IPA.IPAPhonemic exprBroad -> do
333 "INSERT INTO broads(exprLit, exprBroad, sylBroad,wordPos,sylPos,wordEnd,sylEnd) VALUES(?,?,?,?,?,?,?);"
335 , exprBroad & IPA.ipaWordsToText Nothing
342 | let wordEnd = exprBroad & List.length & \x -> x - 1
343 , (wordPos, wordIPA :: NonEmpty (IPA.Syllable [])) <-
346 & List.zip [0 :: Int ..]
347 , let sylEnd = wordIPA & List.length & \x -> x - 1
348 , (sylPos, syllableIPA :: [IPA.Segment]) <-
351 & mangleSupraSegmentalFeatures
352 & List.zip [0 :: Int ..]
359 IPA.IPAPhonError (errMsg, _err) -> do
362 "INSERT INTO errors (exprId, exprLit, error) VALUES (?,?,?)"
373 let dbName = "broads_to_prefix"
374 let b2pDB = "data/langs/français/kaikki/wiktionary=fr.lang=fr."<>dbName<>".sqlite3"
375 b2pDBExists <- IO.doesFileExist b2pDB & liftIO
378 SQL.withConnection b2pDB \b2pConn ->
379 k (wiktConn, b2pConn)
383 unless (b2pDBExists) do
384 it ("can-compute-"<>dbName) \(wiktConn, b2pConn) -> do
385 SQL.execute_ b2pConn $ "PRAGMA journal_mode = OFF"
386 SQL.execute_ b2pConn $ "PRAGMA synchronous = OFF"
387 SQL.execute_ b2pConn $ "CREATE TABLE IF NOT EXISTS broads (sylBroad TEXT NON NULL, litPrefix1 TEXT NON NULL, litPrefix2 TEXT, litPrefix3 TEXT, litPrefix4 TEXT, litPrefix5 TEXT)"
388 SQL.execute_ b2pConn $ "CREATE INDEX broads__sylBroad ON broads (sylBroad);"
389 SQL.execute_ b2pConn $ "CREATE INDEX broads__litPrefix1 ON broads (litPrefix1);"
390 SQL.execute_ b2pConn $ "CREATE INDEX broads__litPrefix2 ON broads (litPrefix2);"
391 SQL.execute_ b2pConn $ "CREATE INDEX broads__litPrefix3 ON broads (litPrefix3);"
392 SQL.execute_ b2pConn $ "CREATE INDEX broads__litPrefix4 ON broads (litPrefix4);"
393 SQL.execute_ b2pConn $ "CREATE INDEX broads__litPrefix5 ON broads (litPrefix5);"
394 SQL.execute_ b2pConn $ "CREATE TABLE IF NOT EXISTS errors (exprId INTEGER NON NULL, exprLit TEXT NON NULL, error TEXT NON NULL)"
397 "SELECT id,word,sounds FROM wiktionary WHERE lang_code='fr'"
402 , maybeWord :: Maybe Text
403 , sounds :: [Wiktionary.Sound]
405 forM_ (sounds & List.take 1) \Wiktionary.Sound{..} -> do
406 -- IO.hPrint IO.stderr (i, ident::Int, maybeWord)
407 case (maybeWord, sound_ipa) of
408 (Just exprLit, Just (IPA.IPAPhons exprIPAs)) -> do
409 let exprLen = exprLit & Text.length
411 IPA.IPAPhonemic exprBroads -> do
412 when (i `mod` 5000 == 0) do
413 IO.hPrint IO.stderr (i, exprLit, exprBroads)
416 "INSERT INTO broads(sylBroad, litPrefix1, litPrefix2, litPrefix3, litPrefix4, litPrefix5) VALUES(?,?,?,?,?,?);"
431 , let sylBroad = syllableIPA
434 , let litPrefix1 = exprLit & Text.take 1
435 , let litPrefix2 | 2 <= exprLen = Just $ exprLit & Text.take 2
436 | otherwise = Nothing
437 , let litPrefix3 | 3 <= exprLen = Just $ exprLit & Text.take 3
438 | otherwise = Nothing
439 , let litPrefix4 | 4 <= exprLen = Just $ exprLit & Text.take 4
440 | otherwise = Nothing
441 , let litPrefix5 | 5 <= exprLen = Just $ exprLit & Text.take 5
442 | otherwise = Nothing
444 IPA.IPAPhonError (errMsg, _err) -> do
447 "INSERT INTO errors (exprId, exprLit, error) VALUES (?,?,?)"
458 -- entrevue|\ɑ̃.tʁə.vy\
459 -- exercice|\ɛg.zɛʁ.sis\
460 -- exercice|\œ̃.n‿e.gzɛʁ.sis\
461 -- forM_ (["ɛg."] & list) \syllable -> do
462 -- outPath <- goldenPath $ syllable
463 -- it syllable \(wiktConn, ipasConn) -> do
464 -- res :: [SQL.Only Text] <- SQL.query
466 -- "SELECT word FROM ipas_broad WHERE broad1=?"
467 -- (SQL.Only ("ɛg."::Text))
468 -- let corpus = res <&> SQL.fromOnly & Set.fromList
469 -- return $ goldenPrettyShowInstance outPath $
474 let testWord = "exercice"
475 let title = "lcp-" <> ShortText.unpack testWord
476 let db = "data/langs/français/kaikki/wiktionary=fr.lang=fr.ipas.sqlite3"
477 outPath <- goldenPath title
478 aroundWith (\k _ -> SQL.withConnection db k) do
479 it title \ipasConn -> do
480 let ipas :: [ShortText] = ["ɛɡ", "zɛʁ", "sis"]
481 -- foldlM :: (Foldable t, Monad m) => (b -> a -> m b) -> b -> t a -> m b
485 corpus :: [SQL.Only ShortText] <-
488 "SELECT LOWER(word) FROM ipas WHERE broad LIKE (?||'.%') OR broad=?"
490 -- return $ [ (ipa, word) | SQL.Only word <- corpus]<>acc
491 return $ (ipa, corpus) : acc
496 $ goldenPrettyShowInstance outPath
497 $ [ (ipa, corpus & List.length)
498 | (ipa, corpus) <- res
502 -- closedForms (corpus & Set.fromList) ("", word)
503 -- res & List.filter (\t -> t & Text.isPrefixOf "ex" & not)
504 -- aroundWith (\k wiktDB -> k (wiktDB, "data/langs/français/kaikki/wiktionary=fr.lang=fr.ipas.sqlite3" :: FilePath)) do
507 testIPAs :: SQL.Connection -> ShortText -> IO [IPA.IPA]
510 SQL.fold @(SQL.Only [Wiktionary.Sound])
512 "SELECT sounds FROM wiktionary WHERE word=?"
515 \ !acc (SQL.Only sounds) ->
516 return (foldMap (\Wiktionary.Sound{sound_ipa} -> sound_ipa & maybeToList & foldMap Wiktionary.unIPAs) sounds List.++ acc)
521 ("SELECT " <> SQL.selectors @Wiktionary.Wiktionary <> " FROM wiktionary")
524 (\ !(acc :: Int) (_row :: Wiktionary.Wiktionary) -> return (acc + 1))
528 forall row params outers acc.
536 (acc -> row -> IO acc) ->
537 TestDefM outers FilePath ()
538 runWiktionaryFold title qry params init merge = do
539 outPath <- goldenPath title "txt"
540 aroundWith (\k db -> SQL.withConnection db k) do
544 res <- SQL.fold conn qry params init merge
545 return $ goldenPrettyShowInstance outPath res
547 runWiktionaryQuery ::
548 forall row params outers.
555 TestDefM outers FilePath ()
556 runWiktionaryQuery title qry params = do
557 outPath <- goldenPath title "txt"
558 aroundWith (\k db -> SQL.withConnection db k) do
561 res :: [row] <- SQL.query conn qry params
562 return $ goldenPrettyShowInstance outPath res
566 forall wiktRow wiktParams outers.
567 SQL.FromRow wiktRow =>
569 SQL.ToRow wiktParams =>
573 (SQL.Connection -> IO ()) ->
574 (SQL.Connection -> wiktRow -> IO ()) ->
575 TestDefM outers (FilePath, FilePath) ()
576 wiktionaryToIPAs title wiktQry wiktParams init merge = do
577 -- outPath <- goldenPath title
579 (\k (wiktDB, ipasDB) ->
580 SQL.withConnection wiktDB \wiktConn ->
581 SQL.withConnection ipasDB \ipasConn -> do
583 k (wiktConn, ipasConn)) do
586 it title \(wiktConn, ipasConn) -> do -- () do
587 SQL.fold wiktConn wiktQry wiktParams () \() row ->