Spaces:
Running
Running
fixed a couple of bugs
Browse files
- src/translate_any_doc.py +14 -5
src/translate_any_doc.py
CHANGED
|
@@ -117,7 +117,8 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
|
|
| 117 |
elif is_self_closing:
|
| 118 |
# Self-closing tag like <x id="1"/>
|
| 119 |
if tag_id is None:
|
| 120 |
-
|
|
|
|
| 121 |
runs.append({
|
| 122 |
"text": "",
|
| 123 |
"id": [f"{tag_name}_{tag_id}"],
|
|
@@ -126,7 +127,8 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
|
|
| 126 |
else:
|
| 127 |
# Opening tag <tag id="...">
|
| 128 |
if tag_id is None:
|
| 129 |
-
|
|
|
|
| 130 |
tag_stack.append(f"{tag_name}_{tag_id}")
|
| 131 |
|
| 132 |
pos = end
|
|
@@ -339,8 +341,13 @@ def generate_alignments(original_tokenized_sentences_with_style: list[list[dict[
|
|
| 339 |
else:
|
| 340 |
# WARNING this is a test
|
| 341 |
# since fastalign doesn't know from which word to reference this token, copy the style of the previous word
|
| 342 |
-
|
| 343 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
translated_sentence_with_style.append(new_entry)
|
| 345 |
|
| 346 |
translated_sentences_with_style.append(translated_sentence_with_style)
|
|
@@ -395,7 +402,9 @@ def runs_to_plain_text(paragraphs_with_style: dict[int, list[dict[str, str, str]
|
|
| 395 |
tag = ""
|
| 396 |
for gid in ids:
|
| 397 |
tag_type, tag_id = gid.split("_")
|
| 398 |
-
tag += f'<{tag_type}
|
|
|
|
|
|
|
| 399 |
return tag
|
| 400 |
|
| 401 |
for key, paragraph in paragraphs_with_style.items():
|
|
|
|
| 117 |
elif is_self_closing:
|
| 118 |
# Self-closing tag like <x id="1"/>
|
| 119 |
if tag_id is None:
|
| 120 |
+
tag_id = -1
|
| 121 |
+
#raise ValueError(f"Self-closing tag <{tag_name}/> missing id")
|
| 122 |
runs.append({
|
| 123 |
"text": "",
|
| 124 |
"id": [f"{tag_name}_{tag_id}"],
|
|
|
|
| 127 |
else:
|
| 128 |
# Opening tag <tag id="...">
|
| 129 |
if tag_id is None:
|
| 130 |
+
tag_id = -1
|
| 131 |
+
#raise ValueError(f"Opening tag <{tag_name}> missing id")
|
| 132 |
tag_stack.append(f"{tag_name}_{tag_id}")
|
| 133 |
|
| 134 |
pos = end
|
|
|
|
| 341 |
else:
|
| 342 |
# WARNING this is a test
|
| 343 |
# since fastalign doesn't know from which word to reference this token, copy the style of the previous word
|
| 344 |
+
try:
|
| 345 |
+
new_entry = translated_sentence_with_style[-1].copy()
|
| 346 |
+
# no previous word? make it up
|
| 347 |
+
except IndexError:
|
| 348 |
+
current_paragraph = original_tokenized_sentences_with_style[sentence_idx][0]["paragraph_index"]
|
| 349 |
+
new_entry = {'id': [], 'paragraph_index': current_paragraph, 'text': translated_token}
|
| 350 |
+
|
| 351 |
translated_sentence_with_style.append(new_entry)
|
| 352 |
|
| 353 |
translated_sentences_with_style.append(translated_sentence_with_style)
|
|
|
|
| 402 |
tag = ""
|
| 403 |
for gid in ids:
|
| 404 |
tag_type, tag_id = gid.split("_")
|
| 405 |
+
tag += f'<{tag_type}'
|
| 406 |
+
if int(tag_id) > 0:
|
| 407 |
+
tag += f' id="{tag_id}">'
|
| 408 |
return tag
|
| 409 |
|
| 410 |
for key, paragraph in paragraphs_with_style.items():
|