Shami96 committed on
Commit
6c1e37b
·
verified ·
1 Parent(s): 8df4ecc

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +155 -2
updated_word.py CHANGED
@@ -1124,8 +1124,157 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1124
 
1125
  return replacements_made
1126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1127
  def process_hf(json_file, docx_file, output_file):
1128
- """Your original main function (unchanged)"""
1129
  try:
1130
  # Load JSON
1131
  if hasattr(json_file, "read"):
@@ -1154,7 +1303,10 @@ def process_hf(json_file, docx_file, output_file):
1154
  paragraph_replacements = process_paragraphs(doc, flat_json)
1155
  heading_replacements = process_headings(doc, flat_json)
1156
 
1157
- total_replacements = table_replacements + paragraph_replacements + heading_replacements
 
 
 
1158
 
1159
  # Save output
1160
  if hasattr(output_file, "write"):
@@ -1167,6 +1319,7 @@ def process_hf(json_file, docx_file, output_file):
1167
  print(f" πŸ“Š Tables: {table_replacements}")
1168
  print(f" πŸ“ Paragraphs: {paragraph_replacements}")
1169
  print(f" πŸ“‹ Headings: {heading_replacements}")
 
1170
  print(f"πŸŽ‰ Processing complete!")
1171
 
1172
  except FileNotFoundError as e:
 
1124
 
1125
  return replacements_made
1126
 
1127
def force_red_text_replacement(document, flat_json):
    """Last-resort pass that replaces any red text still left in the document.

    Every non-empty value in ``flat_json`` (plus each element of list values)
    becomes a replacement candidate. For each table cell and each body
    paragraph that still contains red runs, the combined red text is matched
    against the candidates -- exact match first, then substring containment,
    then word overlap -- and, when a candidate is found, the red text is
    overwritten with it.

    Args:
        document: Object exposing ``tables`` and ``paragraphs``.
            NOTE(review): presumably a python-docx ``Document`` -- confirm
            against the caller.
        flat_json: Mapping of flattened JSON keys to their values.

    Returns:
        int: Number of replacements made: the counts returned by
        ``replace_red_text_in_cell`` for cells, plus one per rewritten
        paragraph.
    """
    replacements_made = 0
    print("\n🎯 FORCE FIX: Scanning for any remaining red text...")

    candidates = _collect_force_candidates(flat_json)
    print(f" Found {len(candidates)} potential replacement values")

    # Pass 1: table cells.
    for table_idx, table in enumerate(document.tables):
        for row_idx, row in enumerate(table.rows):
            for cell_idx, cell in enumerate(row.cells):
                if not has_red_text(cell):
                    continue
                print(f" πŸ” Found red text in Table {table_idx + 1}, Row {row_idx + 1}, Cell {cell_idx + 1}")

                red_runs = [
                    run
                    for paragraph in cell.paragraphs
                    for run in paragraph.runs
                    if is_red(run) and run.text.strip()
                ]
                combined_red_text = _join_red_run_text(red_runs)
                print(f" Red text: '{combined_red_text}'")

                best_key, best_match = _find_forced_replacement(combined_red_text, candidates)
                if best_match:
                    print(f" βœ… Replacing with: '{best_match}' (from key: '{best_key}')")
                    cell_replacements = replace_red_text_in_cell(cell, best_match)
                    replacements_made += cell_replacements
                    print(f" Made {cell_replacements} replacements")
                else:
                    print(" ❌ No suitable replacement found")

    # Pass 2: paragraphs exposed directly by the document.
    for para_idx, paragraph in enumerate(document.paragraphs):
        if not has_red_text_in_paragraph(paragraph):
            continue

        red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
        combined_red_text = _join_red_run_text(red_runs)
        if not combined_red_text:
            # Only whitespace was red; nothing meaningful to replace.
            continue
        print(f" πŸ” Found red text in Paragraph {para_idx + 1}: '{combined_red_text}'")

        best_key, best_match = _find_forced_replacement(combined_red_text, candidates)
        if best_match:
            print(f" βœ… Replacing with: '{best_match}' (from key: '{best_key}')")
            # Put the whole replacement in the first red run, recolour it
            # with RGB (0, 0, 0), and blank the remaining red runs so the
            # text is not duplicated.
            first_run, *other_runs = red_runs
            first_run.text = best_match
            first_run.font.color.rgb = RGBColor(0, 0, 0)
            for run in other_runs:
                run.text = ''
            replacements_made += 1
            print(" Made 1 paragraph replacement")
        else:
            print(" ❌ No suitable replacement found")

    return replacements_made


def _collect_force_candidates(flat_json):
    """Return an ordered list of ``(label, text)`` replacement candidates.

    A list of pairs is used instead of a dict so that every element of a list
    value is kept: storing them all under the single key ``f"{key}_item"``
    would silently keep only the last element.
    """
    candidates = []
    for key, value in flat_json.items():
        if not (value and str(value).strip()):
            continue

        value_str = get_value_as_string(value, key)
        if value_str and value_str.strip():
            candidates.append((key, value_str))

        # Individual list elements are candidates too, for partial matching.
        if isinstance(value, list):
            for item in value:
                item_text = str(item).strip()
                if item_text:
                    candidates.append((f"{key}_item", item_text))
    return candidates


def _join_red_run_text(red_runs):
    """Join the stripped text of the given runs with single spaces."""
    return " ".join(run.text.strip() for run in red_runs).strip()


def _find_forced_replacement(red_text, candidates):
    """Pick a candidate for ``red_text``; return ``(label, text)`` or ``(None, None)``.

    Matching stages, in order (first hit wins for stages 1 and 2):
      1. case-insensitive exact match;
      2. case-insensitive containment in either direction, where the contained
         string must be longer than 3 characters;
      3. best word overlap: the share of red-text words (longer than 2
         characters) that also occur in the candidate, which must be at
         least 0.5.
    """
    if not red_text:
        return None, None

    red_lower = red_text.lower()

    # Stage 1: exact match.
    for label, text in candidates:
        if red_lower == text.lower():
            return label, text

    # Stage 2: containment in either direction.
    red_is_long_enough = len(red_text) > 3
    for label, text in candidates:
        text_lower = text.lower()
        if (len(text) > 3 and text_lower in red_lower) or \
                (red_is_long_enough and red_lower in text_lower):
            return label, text

    # Stage 3: word overlap, intended for names and dates.
    red_words = {word.lower() for word in red_text.split() if len(word) > 2}
    if not red_words:
        return None, None

    best_label = None
    best_text = None
    best_score = 0
    for label, text in candidates:
        text_words = {word.lower() for word in str(text).split() if len(word) > 2}
        common_words = red_words & text_words
        if not common_words:
            continue
        score = len(common_words) / len(red_words)
        if score > best_score and score >= 0.5:  # At least 50% match
            best_score = score
            best_label = label
            best_text = text
    return best_label, best_text
1275
+
1276
  def process_hf(json_file, docx_file, output_file):
1277
+ """Your original main function with force fix added at the end"""
1278
  try:
1279
  # Load JSON
1280
  if hasattr(json_file, "read"):
 
1303
  paragraph_replacements = process_paragraphs(doc, flat_json)
1304
  heading_replacements = process_headings(doc, flat_json)
1305
 
1306
+ # 🎯 ADD THIS: Force fix for any remaining red text
1307
+ force_replacements = force_red_text_replacement(doc, flat_json)
1308
+
1309
+ total_replacements = table_replacements + paragraph_replacements + heading_replacements + force_replacements
1310
 
1311
  # Save output
1312
  if hasattr(output_file, "write"):
 
1319
  print(f" πŸ“Š Tables: {table_replacements}")
1320
  print(f" πŸ“ Paragraphs: {paragraph_replacements}")
1321
  print(f" πŸ“‹ Headings: {heading_replacements}")
1322
+ print(f" 🎯 Force fixes: {force_replacements}")
1323
  print(f"πŸŽ‰ Processing complete!")
1324
 
1325
  except FileNotFoundError as e: