Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from huggingface_hub import hf_hub_download | |
| import pandas as pd | |
| import os | |
| from unicodedata import normalize | |
| import tempfile | |
# Optional Hugging Face access token read from the environment; required to
# download gated reference datasets (e.g. FLORES+). None means anonymous access.
HF_TOKEN = os.getenv("HF_TOKEN", None)
def process_file(dataset_type, user_file):
    """Validate an uploaded line-per-sentence text file against a reference
    split and convert it to an OLDI-style Parquet file.

    Args:
        dataset_type: One of "FLORES+ dev", "FLORES+ devtest" or "OLDI-Seed";
            selects which English reference split to compare against.
        user_file: Uploaded file object (as provided by ``gr.File``); expected
            to contain one UTF-8 text line per reference row.

    Returns:
        Path to a temporary Parquet file using placeholder language metadata
        ("xxx" / "Xxxx" / "xxxx1234") for the user to download.

    Raises:
        gr.Error: If no file was uploaded, the dataset type is unknown, or the
            uploaded line count does not match the reference split.
    """
    if user_file is None:
        # BUG FIX: gr.Error must be *raised*, not returned — a returned Error
        # object would be passed to the gr.File output as though it were a
        # file path, crashing the app instead of showing the error message.
        raise gr.Error("Please upload your data.")

    # Map each dataset type to its reference file on the Hugging Face Hub
    # (replaces three near-identical if/elif download branches).
    references = {
        "FLORES+ dev": ("openlanguagedata/flores_plus", "dev/eng_Latn.parquet"),
        "FLORES+ devtest": ("openlanguagedata/flores_plus", "devtest/eng_Latn.parquet"),
        "OLDI-Seed": ("openlanguagedata/oldi_seed", "seed/eng_Latn.parquet"),
    }
    try:
        repo_id, filename = references[dataset_type]
    except KeyError:
        raise gr.Error(f'Invalid dataset type "{dataset_type}".') from None

    reference_file = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type="dataset",
        # `use_auth_token` is deprecated in huggingface_hub; `token` is the
        # supported parameter.
        token=HF_TOKEN,
    )
    reference_df = pd.read_parquet(reference_file)
    reference_size = len(reference_df)

    with open(user_file.name, "rt", encoding="utf-8") as f:
        user_lines = f.readlines()
    user_size = len(user_lines)
    if reference_size != user_size:
        # Raised (not returned) for the same reason as above.
        raise gr.Error(
            f"Line count mismatch: reference has {reference_size} rows, "
            f"the file you uploaded has {user_size} lines."
        )

    def normalise(raw):
        # NFC-normalise and trim surrounding whitespace (incl. the newline).
        return normalize("NFC", raw).strip()

    # One row per uploaded line; the "xxx"/"Xxxx"/"xxxx1234" metadata are
    # placeholders the submitter is expected to replace with real codes.
    user_data = [
        {
            "id": i,
            "iso_639_3": "xxx",
            "iso_15924": "Xxxx",
            "glottocode": "xxxx1234",
            "text": normalise(line),
            "last_updated": "2.1",
        }
        for i, line in enumerate(user_lines)
    ]

    # Fresh temp directory per call so concurrent requests cannot collide.
    target_path = os.path.join(tempfile.mkdtemp(), "xxx_Xxxx.parquet")
    pd.DataFrame(user_data).to_parquet(target_path, index=False)
    return target_path
# ---------------------------------------------------------------------------
# Gradio UI: dropdown + file upload feed process_file; result is offered as a
# downloadable Parquet file.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Dataset checker")

    dataset_choice = gr.Dropdown(
        ["FLORES+ dev", "FLORES+ devtest", "OLDI-Seed"],
        label="Dataset type",
    )
    upload_box = gr.File(label="Dataset file")
    download_box = gr.File(label="Download Parquet file")
    check_button = gr.Button("Check")

    check_button.click(
        fn=process_file,
        inputs=[dataset_choice, upload_box],
        outputs=download_box,
    )

demo.launch()