Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -89,7 +89,7 @@ def parse_range_specification(range_specification, file_length):
|
|
| 89 |
line_indices.append(single_line)
|
| 90 |
return line_indices
|
| 91 |
|
| 92 |
-
def translate_text(text, translator, tokenizer):
|
| 93 |
"""
|
| 94 |
Translates the given text from English to German using CTranslate2 and the WMT21 model,
|
| 95 |
with special handling for newlines and segmenting text longer than 500 characters.
|
|
@@ -131,7 +131,7 @@ def translate_text(text, translator, tokenizer):
|
|
| 131 |
translated_segments = []
|
| 132 |
for segment in segments:
|
| 133 |
source = tokenizer.convert_ids_to_tokens(tokenizer.encode(segment))
|
| 134 |
-
target_prefix = [tokenizer.lang_code_to_token["de"]]
|
| 135 |
results = translator.translate_batch([source], target_prefix=[target_prefix])
|
| 136 |
target = results[0].hypotheses[0][1:]
|
| 137 |
translated_segment = tokenizer.decode(tokenizer.convert_tokens_to_ids(target))
|
|
@@ -150,7 +150,7 @@ def translate_text(text, translator, tokenizer):
|
|
| 150 |
logging.error(f"An error occurred during translation: {e}")
|
| 151 |
return None
|
| 152 |
|
| 153 |
-
def translate_item_ufb(item, raw_file_path, translator, tokenizer):
|
| 154 |
try:
|
| 155 |
# Translate the prompt directly since it's a string
|
| 156 |
translated_prompt = translate_text(item['prompt'], translator, tokenizer)
|
|
@@ -158,12 +158,12 @@ def translate_item_ufb(item, raw_file_path, translator, tokenizer):
|
|
| 158 |
# Translate the chosen and rejected contents
|
| 159 |
translated_chosen = []
|
| 160 |
for choice in item['chosen']:
|
| 161 |
-
translated_content = translate_text(choice['content'], translator, tokenizer)
|
| 162 |
translated_chosen.append({'content': translated_content, 'role': choice['role']})
|
| 163 |
|
| 164 |
translated_rejected = []
|
| 165 |
for choice in item['rejected']:
|
| 166 |
-
translated_content = translate_text(choice['content'], translator, tokenizer)
|
| 167 |
translated_rejected.append({'content': translated_content, 'role': choice['role']})
|
| 168 |
|
| 169 |
# Write the raw response to a backup file
|
|
@@ -211,7 +211,7 @@ def validate_item_ufb(item):
|
|
| 211 |
|
| 212 |
|
| 213 |
|
| 214 |
-
def translate_item_mix(item, raw_file_path, translator, tokenizer):
|
| 215 |
"""
|
| 216 |
Translates the relevant fields in the given item from English to German using CTranslate2 and the WMT21 model,
|
| 217 |
and saves the raw response to a backup file.
|
|
@@ -221,12 +221,12 @@ def translate_item_mix(item, raw_file_path, translator, tokenizer):
|
|
| 221 |
# Translate each part of the prompt separately and preserve the order
|
| 222 |
translated_prompts = []
|
| 223 |
for message in item['prompt']:
|
| 224 |
-
translated_content = translate_text(message['content'], translator, tokenizer)
|
| 225 |
translated_prompts.append({'content': translated_content, 'role': message['role']})
|
| 226 |
|
| 227 |
# Translate the chosen and rejected contents
|
| 228 |
-
translated_chosen_content = translate_text(item['chosen'][0]['content'], translator, tokenizer)
|
| 229 |
-
translated_rejected_content = translate_text(item['rejected'][0]['content'], translator, tokenizer)
|
| 230 |
|
| 231 |
# Write the raw response to a backup file
|
| 232 |
with open(raw_file_path, 'a', encoding='utf-8') as raw_file:
|
|
@@ -276,13 +276,13 @@ def validate_item_mix(item):
|
|
| 276 |
|
| 277 |
return True
|
| 278 |
|
| 279 |
-
def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer):
|
| 280 |
try:
|
| 281 |
translated_texts = {} # Cache to store translated texts
|
| 282 |
|
| 283 |
# Translate the prompt if necessary (which is a user input and can appear again)
|
| 284 |
if item['prompt'] not in translated_texts:
|
| 285 |
-
translated_prompt = translate_text(item['prompt'], translator, tokenizer)
|
| 286 |
translated_texts[item['prompt']] = translated_prompt
|
| 287 |
else:
|
| 288 |
translated_prompt = translated_texts[item['prompt']]
|
|
@@ -290,7 +290,7 @@ def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer):
|
|
| 290 |
# Helper function to handle content translation with caching
|
| 291 |
def get_translated_content(content):
|
| 292 |
if content not in translated_texts:
|
| 293 |
-
translated_texts[content] = translate_text(content, translator, tokenizer)
|
| 294 |
return translated_texts[content]
|
| 295 |
|
| 296 |
# Process translations for chosen and rejected sections
|
|
@@ -349,7 +349,7 @@ def validate_item_ufb_cached(item):
|
|
| 349 |
|
| 350 |
return True
|
| 351 |
|
| 352 |
-
def process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type):
|
| 353 |
try:
|
| 354 |
# Assigning validation and translation functions based on model_type
|
| 355 |
if model_type == "mix":
|
|
@@ -387,7 +387,7 @@ def process_file(input_file_path, output_file_path, raw_file_path, line_indices,
|
|
| 387 |
retry_count = 0
|
| 388 |
while translated_item is None and retry_count < 3:
|
| 389 |
print ("going to translate the item...")
|
| 390 |
-
translated_item = translate_item(item, raw_file_path, translator, tokenizer)
|
| 391 |
retry_count += 1
|
| 392 |
if translated_item is None:
|
| 393 |
logging.warning(f"Translation failed for item. Retry attempt: {retry_count}")
|
|
@@ -485,7 +485,7 @@ def upload_output_to_huggingface(output_file_path, repo_name, token):
|
|
| 485 |
print(f"Failed to upload {output_file_path} to Hugging Face: {e}")
|
| 486 |
raise
|
| 487 |
|
| 488 |
-
def translate_dataset(train_url, local_parquet_path, input_file_path, output_file_path, raw_file_path, range_specification, model_type, output_dir, output_repo_name, token, translator, tokenizer):
|
| 489 |
try:
|
| 490 |
# Download the Parquet file
|
| 491 |
download_parquet(train_url, local_parquet_path)
|
|
@@ -527,7 +527,7 @@ def translate_dataset(train_url, local_parquet_path, input_file_path, output_fil
|
|
| 527 |
|
| 528 |
try:
|
| 529 |
# Process the file with specified model type and line indices
|
| 530 |
-
process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type)
|
| 531 |
except Exception as e:
|
| 532 |
logging.error(f"Failed to process the file {input_file_path}: {e}")
|
| 533 |
return
|
|
|
|
| 89 |
line_indices.append(single_line)
|
| 90 |
return line_indices
|
| 91 |
|
| 92 |
+
def translate_text(text, translator, tokenizer, target_language):
|
| 93 |
"""
|
| 94 |
Translates the given text from English to the language identified by target_language using CTranslate2 and the WMT21 model,
|
| 95 |
with special handling for newlines and segmenting text longer than 500 characters.
|
|
|
|
| 131 |
translated_segments = []
|
| 132 |
for segment in segments:
|
| 133 |
source = tokenizer.convert_ids_to_tokens(tokenizer.encode(segment))
|
| 134 |
+
target_prefix = [tokenizer.lang_code_to_token[target_language]]
|
| 135 |
results = translator.translate_batch([source], target_prefix=[target_prefix])
|
| 136 |
target = results[0].hypotheses[0][1:]
|
| 137 |
translated_segment = tokenizer.decode(tokenizer.convert_tokens_to_ids(target))
|
|
|
|
| 150 |
logging.error(f"An error occurred during translation: {e}")
|
| 151 |
return None
|
| 152 |
|
| 153 |
+
def translate_item_ufb(item, raw_file_path, translator, tokenizer, target_language):
|
| 154 |
try:
|
| 155 |
# Translate the prompt directly since it's a string
|
| 156 |
translated_prompt = translate_text(item['prompt'], translator, tokenizer)
|
|
|
|
| 158 |
# Translate the chosen and rejected contents
|
| 159 |
translated_chosen = []
|
| 160 |
for choice in item['chosen']:
|
| 161 |
+
translated_content = translate_text(choice['content'], translator, tokenizer, target_language)
|
| 162 |
translated_chosen.append({'content': translated_content, 'role': choice['role']})
|
| 163 |
|
| 164 |
translated_rejected = []
|
| 165 |
for choice in item['rejected']:
|
| 166 |
+
translated_content = translate_text(choice['content'], translator, tokenizer, target_language)
|
| 167 |
translated_rejected.append({'content': translated_content, 'role': choice['role']})
|
| 168 |
|
| 169 |
# Write the raw response to a backup file
|
|
|
|
| 211 |
|
| 212 |
|
| 213 |
|
| 214 |
+
def translate_item_mix(item, raw_file_path, translator, tokenizer, target_language):
|
| 215 |
"""
|
| 216 |
Translates the relevant fields in the given item from English to the language identified by target_language using CTranslate2 and the WMT21 model,
|
| 217 |
and saves the raw response to a backup file.
|
|
|
|
| 221 |
# Translate each part of the prompt separately and preserve the order
|
| 222 |
translated_prompts = []
|
| 223 |
for message in item['prompt']:
|
| 224 |
+
translated_content = translate_text(message['content'], translator, tokenizer, target_language)
|
| 225 |
translated_prompts.append({'content': translated_content, 'role': message['role']})
|
| 226 |
|
| 227 |
# Translate the chosen and rejected contents
|
| 228 |
+
translated_chosen_content = translate_text(item['chosen'][0]['content'], translator, tokenizer, target_language)
|
| 229 |
+
translated_rejected_content = translate_text(item['rejected'][0]['content'], translator, tokenizer, target_language)
|
| 230 |
|
| 231 |
# Write the raw response to a backup file
|
| 232 |
with open(raw_file_path, 'a', encoding='utf-8') as raw_file:
|
|
|
|
| 276 |
|
| 277 |
return True
|
| 278 |
|
| 279 |
+
def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer, target_language):
|
| 280 |
try:
|
| 281 |
translated_texts = {} # Cache to store translated texts
|
| 282 |
|
| 283 |
# Translate the prompt if necessary (which is a user input and can appear again)
|
| 284 |
if item['prompt'] not in translated_texts:
|
| 285 |
+
translated_prompt = translate_text(item['prompt'], translator, tokenizer, target_language)
|
| 286 |
translated_texts[item['prompt']] = translated_prompt
|
| 287 |
else:
|
| 288 |
translated_prompt = translated_texts[item['prompt']]
|
|
|
|
| 290 |
# Helper function to handle content translation with caching
|
| 291 |
def get_translated_content(content):
|
| 292 |
if content not in translated_texts:
|
| 293 |
+
translated_texts[content] = translate_text(content, translator, tokenizer, target_language)
|
| 294 |
return translated_texts[content]
|
| 295 |
|
| 296 |
# Process translations for chosen and rejected sections
|
|
|
|
| 349 |
|
| 350 |
return True
|
| 351 |
|
| 352 |
+
def process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type, target_language):
|
| 353 |
try:
|
| 354 |
# Assigning validation and translation functions based on model_type
|
| 355 |
if model_type == "mix":
|
|
|
|
| 387 |
retry_count = 0
|
| 388 |
while translated_item is None and retry_count < 3:
|
| 389 |
print ("going to translate the item...")
|
| 390 |
+
translated_item = translate_item(item, raw_file_path, translator, tokenizer, target_language)
|
| 391 |
retry_count += 1
|
| 392 |
if translated_item is None:
|
| 393 |
logging.warning(f"Translation failed for item. Retry attempt: {retry_count}")
|
|
|
|
| 485 |
print(f"Failed to upload {output_file_path} to Hugging Face: {e}")
|
| 486 |
raise
|
| 487 |
|
| 488 |
+
def translate_dataset(train_url, local_parquet_path, input_file_path, output_file_path, raw_file_path, range_specification, model_type, output_dir, output_repo_name, token, translator, tokenizer, target_language):
|
| 489 |
try:
|
| 490 |
# Download the Parquet file
|
| 491 |
download_parquet(train_url, local_parquet_path)
|
|
|
|
| 527 |
|
| 528 |
try:
|
| 529 |
# Process the file with specified model type and line indices
|
| 530 |
+
process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type, target_language)
|
| 531 |
except Exception as e:
|
| 532 |
logging.error(f"Failed to process the file {input_file_path}: {e}")
|
| 533 |
return
|