Update app.py (+ other target languages)

app.py CHANGED
@@ -550,7 +550,7 @@ logger = logging.getLogger(__name__)
 
 # Main function to handle the translation workflow
 # Main function to handle the translation workflow
-def main(dataset_url, model_type, output_dataset_name, range_specification, token: gr.OAuthToken | None, profile: gr.OAuthProfile | None):
+def main(dataset_url, model_type, output_dataset_name, range_specification, target_language, token: gr.OAuthToken | None, profile: gr.OAuthProfile | None):
     try:
         # Login to Hugging Face
         if token is None or profile is None or token.token is None or profile.username is None:
@@ -574,6 +574,7 @@ def main(dataset_url, model_type, output_dataset_name, range_specification, toke
         # Load the tokenizer
         tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
         tokenizer.src_lang = "en"
+        tokenizer.tgt_lang = target_language  # Set target language
         logger.info("Tokenizer loaded successfully.")
 
         # Define the task based on user input
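Note (not part of this diff): with the M2M100-style tokenizer that facebook/wmt21-dense-24-wide-en-x uses, setting `tokenizer.tgt_lang` on its own does not steer `generate()`; the target language is normally enforced at generation time by forcing its language token as the first generated token, as shown on the model card. A minimal sketch of that pattern:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "facebook/wmt21-dense-24-wide-en-x"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer.src_lang = "en"
inputs = tokenizer("The weather is nice today.", return_tensors="pt")

# For M2M100-style checkpoints, the target language is selected by forcing
# its language token to be the first token of the generated sequence.
generated = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id("de"))
print(tokenizer.batch_decode(generated, skip_special_tokens=True))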
@@ -581,10 +582,11 @@ def main(dataset_url, model_type, output_dataset_name, range_specification, toke
                 "url": dataset_url,
                 "local_path": "train.parquet",
                 "input_file": f"{model_type}_en.jsonl",
-                "output_file": f"{model_type}
-                "raw_file": f"{model_type}
+                "output_file": f"{model_type}_{target_language}.jsonl",  # Include target language in the filename
+                "raw_file": f"{model_type}_{target_language}_raw.jsonl",
                 "range_spec": range_specification,
-                "model_type": model_type
+                "model_type": model_type,
+                "target_language": target_language  # Include target language in the task
             }
 
             # Call the translate_dataset function with the provided parameters
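The parser behind `range_spec` is not shown in this diff; the UI placeholder "e.g., 1-100" suggests 1-based inclusive ranges. A hypothetical helper consistent with that placeholder (the name, the comma-separated grammar, and the zero-based output are all assumptions, not the Space's actual code):

def parse_range_specification(spec: str) -> list[int]:
    """Hypothetical parser for inputs like "1-100" or "1-10,15,20-25".

    Returns zero-based row indices; the real app.py may differ.
    """
    indices: list[int] = []
    for part in spec.split(","):
        part = part.strip()
        if "-" in part:
            start, end = part.split("-", 1)
            indices.extend(range(int(start) - 1, int(end)))  # inclusive range
        else:
            indices.append(int(part) - 1)
    return indices

assert parse_range_specification("1-3,5") == [0, 1, 2, 4]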
@@ -601,6 +603,7 @@ def main(dataset_url, model_type, output_dataset_name, range_specification, toke
                 model_type=task["model_type"],
                 translator=translator,
                 tokenizer=tokenizer,
+                target_language=task["target_language"]  # Pass the target language
             )
             logger.info("Dataset translation completed!")
             return "Dataset translation completed!\n\n### Logs:\n" + log_stream.getvalue()
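`translate_dataset` itself is defined elsewhere in app.py and does not appear in this diff. A hypothetical stub, reconstructed from the call site above and the `task` dict purely for orientation (every parameter name here is inferred, not confirmed):

# Hypothetical stub only: the real translate_dataset is not shown in this diff.
def translate_dataset(
    url: str,              # dataset URL from the task dict
    local_path: str,       # e.g. "train.parquet"
    input_file: str,       # e.g. "mix_en.jsonl"
    output_file: str,      # e.g. "mix_de.jsonl"
    raw_file: str,         # e.g. "mix_de_raw.jsonl"
    range_spec: str,       # e.g. "1-100"
    model_type: str,
    translator=None,
    tokenizer=None,
    target_language: str = "de",
) -> None:
    """Download the dataset, translate the selected rows, and write JSONL output."""
    raise NotImplementedError("See app.py for the real implementation.")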
@@ -608,15 +611,17 @@ def main(dataset_url, model_type, output_dataset_name, range_specification, toke
             return "Login failed. Please try again."
     except Exception as e:
         logger.error(f"An error occurred in the main function: {e}")
-        # Ensure logs are flushed and captured
         return f"An error occurred: {e}\n\n### Logs:\n{log_stream.getvalue()}"
 
+
 # Gradio interface setup
 gradio_title = "🧠 WMT21 Dataset Translation"
-gradio_desc = """This tool translates datasets using the WMT21 translation model.
+gradio_desc = """This tool translates English datasets using the WMT21 translation model.
 ## What Does This Tool Do:
-- Translates datasets based on the selected model type.
--
+- Translates datasets with structures based on the selected model type.
+- The translation model (facebook/wmt21-dense-24-wide-en-x) supports the following target languages: Hausa (ha), Icelandic (is), Japanese (ja), Czech (cs), Russian (ru), Chinese (zh), German (de).
+- Uploads the translated dataset to Hugging Face.
+- At the moment, this runs on CPU only and is therefore very slow (>1 minute per item, depending on string lengths)."""
 datasets_desc = """## Dataset Types:
 - **mix**:
   - `prompt`: List of dictionaries with 'content' and 'role' fields (multi-turn conversation).
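The last added bullet notes the Space is CPU-only. If a GPU were available in the deployment (an assumption; nothing in this diff provides one), the standard guard below, continuing the earlier sketch and reusing its `model` and `inputs`, would pick it up:

import torch

# Prefer a GPU when one is present; on the CPU-only Space this is a no-op,
# which is why the UI text warns about >1 minute per item.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
inputs = inputs.to(device)  # BatchEncoding supports .to(device)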
@@ -650,12 +655,14 @@ with gr.Blocks(theme=theme) as demo:
             model_type = gr.Dropdown(choices=["mix", "ufb_cached", "ufb"], label="Dataset Type")
             output_dataset_name = gr.Textbox(label="Output Dataset Name", lines=1, placeholder="cstr/translated_datasets")
             range_specification = gr.Textbox(label="Range Specification", lines=1, placeholder="e.g., 1-100")
-
+            target_language = gr.Dropdown(choices=["ha", "is", "ja", "cs", "ru", "zh", "de"], label="Target Language")  # New dropdown for target language
+
         with gr.Column():
             output = gr.Markdown(label="Output")
 
     submit_btn = gr.Button("Translate Dataset", variant="primary")
-    submit_btn.click(main, inputs=[dataset_url, model_type, output_dataset_name, range_specification], outputs=output)
+    submit_btn.click(main, inputs=[dataset_url, model_type, output_dataset_name, range_specification, target_language], outputs=output)
+
 
     gr.Markdown(datasets_desc)
 
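One wiring detail worth noting: `main` takes `token: gr.OAuthToken | None` and `profile: gr.OAuthProfile | None`, yet `submit_btn.click` passes only the five visible components. That is standard Gradio OAuth behavior on Spaces: parameters typed as `gr.OAuthToken` / `gr.OAuthProfile` are injected automatically and must not be listed in `inputs=`. A minimal sketch of the pattern, assuming a Space with `hf_oauth: true` in its README metadata:

import gradio as gr

def greet(name: str, profile: gr.OAuthProfile | None, token: gr.OAuthToken | None) -> str:
    # Gradio fills `profile` and `token` itself when the user is logged in;
    # they are None otherwise, mirroring the check at the top of main().
    if profile is None or token is None:
        return "Please log in with the Hugging Face button first."
    return f"Logged in as {profile.username}."

with gr.Blocks() as demo:
    gr.LoginButton()
    name = gr.Textbox(label="Name")
    out = gr.Markdown()
    gr.Button("Go").click(greet, inputs=[name], outputs=out)  # OAuth args not listed

if __name__ == "__main__":
    demo.launch()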