Shami96 committed on
Commit
5fb575a
·
verified ·
1 Parent(s): 88a026a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -42
app.py CHANGED
@@ -1,41 +1,38 @@
 
1
  import gradio as gr
2
  import tempfile
3
- import shutil
4
- from pdf_extractor import extract_text_pdf_raw
5
  from word_extractor import extract_red_text_with_labels, is_red_font
6
  from docx import Document
7
  from docx.shared import RGBColor
8
  import difflib
9
 
10
 
11
- def find_best_match(target, candidates):
12
- match = difflib.get_close_matches(target, candidates, n=1, cutoff=0.5)
 
13
  return match[0] if match else None
14
 
15
 
16
- def replace_red_text_in_doc(doc_path, replacements):
17
- doc = Document(doc_path)
18
-
19
- for para in doc.paragraphs:
20
- for run in para.runs:
21
- if is_red_font(run):
22
- old_text = run.text.strip()
23
- new_text = find_best_match(old_text, replacements)
24
- if new_text:
25
- run.text = new_text
26
- run.font.color.rgb = RGBColor(0, 0, 0) # Set to black
27
 
28
  for table in doc.tables:
29
  for row in table.rows:
30
- for cell in row.cells:
31
- for para in cell.paragraphs:
 
 
 
 
 
 
 
 
32
  for run in para.runs:
33
  if is_red_font(run):
34
- old_text = run.text.strip()
35
- new_text = find_best_match(old_text, replacements)
36
- if new_text:
37
- run.text = new_text
38
- run.font.color.rgb = RGBColor(0, 0, 0) # Set to black
39
 
40
  temp_dir = tempfile.mkdtemp()
41
  updated_path = f"{temp_dir}/updated.docx"
@@ -47,25 +44,10 @@ def process_files(pdf_file, word_file):
47
  pdf_path = pdf_file.name
48
  word_path = word_file.name
49
 
50
- pdf_text = extract_text_pdf_raw(pdf_path)
51
- word_data = extract_red_text_with_labels(word_path)
52
-
53
- # Flatten red text entries
54
- red_values = []
55
- for values in word_data.values():
56
- red_values.extend(values)
57
- red_values = list(set(red_values)) # dedupe
58
-
59
- # Match red values to PDF
60
- replacements = []
61
- for val in red_values:
62
- match = find_best_match(val, pdf_text)
63
- if match:
64
- replacements.append(match)
65
-
66
- # Replace in Word
67
- updated_doc_path = replace_red_text_in_doc(word_path, replacements)
68
 
 
69
  return updated_doc_path
70
 
71
 
@@ -76,6 +58,6 @@ gr.Interface(
76
  gr.File(label="Upload Word File", type="file")
77
  ],
78
  outputs=gr.File(label="Download Updated Word File"),
79
- title="Red Text Replacer",
80
- description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching content from the PDF."
81
  ).launch()
 
1
+ # app.py
2
  import gradio as gr
3
  import tempfile
4
+ from pdf_extractor import extract_label_value_pairs
 
5
  from word_extractor import extract_red_text_with_labels, is_red_font
6
  from docx import Document
7
  from docx.shared import RGBColor
8
  import difflib
9
 
10
 
11
+ def find_best_match_label(target_label, pdf_data):
12
+ keys = list(pdf_data.keys())
13
+ match = difflib.get_close_matches(target_label.lower(), keys, n=1, cutoff=0.4)
14
  return match[0] if match else None
15
 
16
 
17
+ def replace_red_text_by_label(word_path, label_value_map):
18
+ doc = Document(word_path)
 
 
 
 
 
 
 
 
 
19
 
20
  for table in doc.tables:
21
  for row in table.rows:
22
+ cells = row.cells
23
+ if len(cells) >= 2:
24
+ label = cells[0].text.strip().replace(":", "").replace("\n", " ")
25
+ matched_label = find_best_match_label(label, label_value_map)
26
+ if not matched_label:
27
+ continue
28
+
29
+ new_value = label_value_map[matched_label]
30
+
31
+ for para in cells[1].paragraphs:
32
  for run in para.runs:
33
  if is_red_font(run):
34
+ run.text = new_value
35
+ run.font.color.rgb = RGBColor(0, 0, 0) # make black
 
 
 
36
 
37
  temp_dir = tempfile.mkdtemp()
38
  updated_path = f"{temp_dir}/updated.docx"
 
44
  pdf_path = pdf_file.name
45
  word_path = word_file.name
46
 
47
+ pdf_data = extract_label_value_pairs(pdf_path) # {label: value}
48
+ word_data = extract_red_text_with_labels(word_path) # {label: [red_texts]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ updated_doc_path = replace_red_text_by_label(word_path, pdf_data)
51
  return updated_doc_path
52
 
53
 
 
58
  gr.File(label="Upload Word File", type="file")
59
  ],
60
  outputs=gr.File(label="Download Updated Word File"),
61
+ title="Red Text Replacer (Label-Aware)",
62
+ description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching values from the PDF using label-based matching."
63
  ).launch()