# Spaces: Sleeping
"""Build the ``sonajaht.db`` DuckDB database from the ``adorkin/sonajaht`` dataset.

Steps performed at import/run time:

1. Load the ``definitions`` and ``words`` configurations of the dataset.
2. Strip markup tags from every definition's ``value`` and collapse runs of
   whitespace into single spaces.
3. Keep only definitions with ``lang = 'est'`` whose cleaned text is longer
   than 5 characters, and expose the row position as an ``entry_id`` column.
4. Persist both frames as the tables ``definitions`` and ``words`` in
   ``sonajaht.db``.

The script can be re-run safely: existing tables are replaced rather than
causing a "table already exists" failure.
"""

import re

import duckdb
from datasets import load_dataset

# Matches anything that looks like a markup tag, e.g. "<b>" or "</i>".
_TAG_RE = re.compile(r"<[^>]*>")

definitions_ds = load_dataset("adorkin/sonajaht", "definitions")
words_ds = load_dataset("adorkin/sonajaht", "words")

definitions = definitions_ds["definitions"].to_pandas()
# Vectorized clean-up: drop tags, then split on whitespace and re-join with
# single spaces. Unlike a Python-level ``apply`` calling ``el.split()``, the
# ``.str`` accessors propagate missing values instead of raising.
definitions["value"] = (
    definitions["value"]
    .str.replace(_TAG_RE, "", regex=True)
    .str.split()
    .str.join(" ")
)

# DuckDB resolves ``definitions`` in the query to the DataFrame of that name
# (replacement scan). Rows with a NULL ``value`` are dropped by the LENGTH test.
definitions = duckdb.query(
    "SELECT * FROM definitions WHERE lang = 'est' AND LENGTH(value) > 5"
).df()
# Turn the fresh 0..n-1 index of the filtered frame into an ``entry_id`` column.
definitions = definitions.reset_index(names="entry_id")

words = words_ds["words"].to_pandas()

# NOTE(review): the connection is left open under ``conn`` in case code that
# follows this section uses it — close it there once it is no longer needed.
conn = duckdb.connect("sonajaht.db")

# The frames are registered under names that differ from the target tables:
# once ``sonajaht.db`` already contains a ``definitions`` table, a bare
# ``FROM definitions`` would read that table instead of the DataFrame.
conn.register("definitions_df", definitions)
conn.execute("CREATE OR REPLACE TABLE definitions AS SELECT * FROM definitions_df")
conn.unregister("definitions_df")

conn.register("words_df", words)
conn.execute("CREATE OR REPLACE TABLE words AS SELECT * FROM words_df")
conn.unregister("words_df")