mjuvilla committed on
Commit
d9b224d
·
1 Parent(s): ab66871

fixed a couple of bugs

Browse files
Files changed (1) hide show
  1. src/translate_any_doc.py +14 -5
src/translate_any_doc.py CHANGED
@@ -117,7 +117,8 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
117
  elif is_self_closing:
118
  # Self-closing tag like <x id="1"/>
119
  if tag_id is None:
120
- raise ValueError(f"Self-closing tag <{tag_name}/> missing id")
 
121
  runs.append({
122
  "text": "",
123
  "id": [f"{tag_name}_{tag_id}"],
@@ -126,7 +127,8 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
126
  else:
127
  # Opening tag <tag id="...">
128
  if tag_id is None:
129
- raise ValueError(f"Opening tag <{tag_name}> missing id")
 
130
  tag_stack.append(f"{tag_name}_{tag_id}")
131
 
132
  pos = end
@@ -339,8 +341,13 @@ def generate_alignments(original_tokenized_sentences_with_style: list[list[dict[
339
  else:
340
  # WARNING this is a test
341
  # since fastalign doesn't know from which word to reference this token, copy the style of the previous word
342
- new_entry = translated_sentence_with_style[-1].copy()
343
- new_entry["text"] = translated_token
 
 
 
 
 
344
  translated_sentence_with_style.append(new_entry)
345
 
346
  translated_sentences_with_style.append(translated_sentence_with_style)
@@ -395,7 +402,9 @@ def runs_to_plain_text(paragraphs_with_style: dict[int, list[dict[str, str, str]
395
  tag = ""
396
  for gid in ids:
397
  tag_type, tag_id = gid.split("_")
398
- tag += f'<{tag_type} id="{tag_id}">'
 
 
399
  return tag
400
 
401
  for key, paragraph in paragraphs_with_style.items():
 
117
  elif is_self_closing:
118
  # Self-closing tag like <x id="1"/>
119
  if tag_id is None:
120
+ tag_id = -1
121
+ #raise ValueError(f"Self-closing tag <{tag_name}/> missing id")
122
  runs.append({
123
  "text": "",
124
  "id": [f"{tag_name}_{tag_id}"],
 
127
  else:
128
  # Opening tag <tag id="...">
129
  if tag_id is None:
130
+ tag_id = -1
131
+ #raise ValueError(f"Opening tag <{tag_name}> missing id")
132
  tag_stack.append(f"{tag_name}_{tag_id}")
133
 
134
  pos = end
 
341
  else:
342
  # WARNING this is a test
343
  # since fastalign doesn't know from which word to reference this token, copy the style of the previous word
344
+ try:
345
+ new_entry = translated_sentence_with_style[-1].copy()
346
+ # no previous word? make it up
347
+ except IndexError:
348
+ current_paragraph = original_tokenized_sentences_with_style[sentence_idx][0]["paragraph_index"]
349
+ new_entry = {'id': [], 'paragraph_index': current_paragraph, 'text': translated_token}
350
+
351
  translated_sentence_with_style.append(new_entry)
352
 
353
  translated_sentences_with_style.append(translated_sentence_with_style)
 
402
  tag = ""
403
  for gid in ids:
404
  tag_type, tag_id = gid.split("_")
405
+ tag += f'<{tag_type}'
406
+ if int(tag_id) > 0:
407
+ tag += f' id="{tag_id}">'
408
  return tag
409
 
410
  for key, paragraph in paragraphs_with_style.items():