Spaces:

LangTech-MT
/

document-translator

Running

App Files Community

mjuvilla committed 25 days ago

Commit

d9b224d

1 Parent(s): ab66871

fixed a couple of bugs

Browse files

Files changed (1) hide show

src/translate_any_doc.py +14 -5

src/translate_any_doc.py CHANGED Viewed

@@ -117,7 +117,8 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
         elif is_self_closing:
             # Self-closing tag like <x id="1"/>
             if tag_id is None:
-                raise ValueError(f"Self-closing tag <{tag_name}/> missing id")
             runs.append({
                 "text": "",
                 "id": [f"{tag_name}_{tag_id}"],
@@ -126,7 +127,8 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
         else:
             # Opening tag <tag id="...">
             if tag_id is None:
-                raise ValueError(f"Opening tag <{tag_name}> missing id")
             tag_stack.append(f"{tag_name}_{tag_id}")
         pos = end
@@ -339,8 +341,13 @@ def generate_alignments(original_tokenized_sentences_with_style: list[list[dict[
             else:
                 # WARNING this is a test
                 # since fastalign doesn't know from which word to reference this token, copy the style of the previous word
-                new_entry = translated_sentence_with_style[-1].copy()
-                new_entry["text"] = translated_token
                 translated_sentence_with_style.append(new_entry)
         translated_sentences_with_style.append(translated_sentence_with_style)
@@ -395,7 +402,9 @@ def runs_to_plain_text(paragraphs_with_style: dict[int, list[dict[str, str, str]
             tag = ""
             for gid in ids:
                 tag_type, tag_id = gid.split("_")
-                tag += f'<{tag_type} id="{tag_id}">'
             return tag
         for key, paragraph in paragraphs_with_style.items():

         elif is_self_closing:
             # Self-closing tag like <x id="1"/>
             if tag_id is None:
+                tag_id = -1
+                #raise ValueError(f"Self-closing tag <{tag_name}/> missing id")
             runs.append({
                 "text": "",
                 "id": [f"{tag_name}_{tag_id}"],
         else:
             # Opening tag <tag id="...">
             if tag_id is None:
+                tag_id = -1
+                #raise ValueError(f"Opening tag <{tag_name}> missing id")
             tag_stack.append(f"{tag_name}_{tag_id}")
         pos = end
             else:
                 # WARNING this is a test
                 # since fastalign doesn't know from which word to reference this token, copy the style of the previous word
+                try:
+                    new_entry = translated_sentence_with_style[-1].copy()
+                # no previous word? make it up
+                except IndexError:
+                    current_paragraph = original_tokenized_sentences_with_style[sentence_idx][0]["paragraph_index"]
+                    new_entry = {'id': [], 'paragraph_index': current_paragraph, 'text': translated_token}
                 translated_sentence_with_style.append(new_entry)
         translated_sentences_with_style.append(translated_sentence_with_style)
             tag = ""
             for gid in ids:
                 tag_type, tag_id = gid.split("_")
+                tag += f'<{tag_type}'
+                if int(tag_id) > 0:
+                    tag += f' id="{tag_id}">'
             return tag
         for key, paragraph in paragraphs_with_style.items():