# Import the required libraries
import gradio as gr
import cv2  # OpenCV, to read and manipulate images
import easyocr  # EasyOCR, for OCR
import torch  # PyTorch, for deep learning
import pymupdf  # PDF manipulation
from transformers import pipeline  # Hugging Face Transformers, for NER
import os  # OS, for file operations
from glob import glob  # Glob, to get file paths
##########################################################################################################
# Initialize the models

# EasyOCR model
print("Initializing EasyOCR")
reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available(), model_storage_directory='.')

# Use GPU if available
print("Using GPU if available")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# NER model
print("Initializing NER pipeline")
nlp = pipeline("token-classification", model="dslim/distilbert-NER", device=device)
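# Illustrative shape of the pipeline output (not from a real run): a list of dicts such as
# [{'entity': 'B-PER', 'score': 0.99, 'index': 1, 'word': 'John', 'start': 0, 'end': 4}, ...]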

##########################################################################################################
## Functions

# Define img_format
img_format = "png"

# Convert pdf to set of images
def convert_to_images(pdf_file_path):

    # Create a directory to store pdf images
    pdf_images_dir = f'{pdf_file_path}_images'
    os.makedirs(pdf_images_dir, exist_ok=True)

    # DPI
    dpi = 150

    # Convert the PDF to images
    print("Converting PDF to images...")
    doc = pymupdf.open(pdf_file_path)  # open document
    for page in doc:  # iterate through the pages
        pix = page.get_pixmap(dpi=dpi)  # render page to an image
        # Zero-pad the page number so a lexicographic sort keeps pages in order
        pix.save(f"{pdf_images_dir}/page-{page.number:04d}.{img_format}")  # store image as a PNG

    # Return the directory with the images
    return pdf_images_dir
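# Example (hypothetical path): convert_to_images("input.pdf") writes
# input.pdf_images/page-0000.png, page-0001.png, ... and returns "input.pdf_images"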

# Do the redaction on a single page image
def redact_image(pdf_image_path, redaction_score_threshold):

    print("Redacting sensitive information...")
    print(f"Processing {pdf_image_path}...")

    # Read the image
    cv_image = cv2.imread(pdf_image_path)

    # Read the text from the image
    result = reader.readtext(cv_image, height_ths=0, width_ths=0, x_ths=0, y_ths=0)
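    # For reference, readtext returns a list of (bbox, text, confidence) tuples, e.g. (illustrative):
    # ([[10, 10], [120, 10], [120, 40], [10, 40]], 'John', 0.98)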

    # Get the text from the result
    text = ' '.join([text for (bbox, text, prob) in result])

    # Perform NER on the text
    ner_results = nlp(text)

    # Draw bounding boxes
    # Note: zip pairs each OCR box with the NER token at the same position,
    # which assumes the pipeline emits one token per OCR word
    for ((bbox, text, prob), ner_result) in zip(result, ner_results):

        # Get the coordinates of the bounding box
        (top_left, top_right, bottom_right, bottom_left) = bbox
        top_left = tuple(map(int, top_left))
        bottom_right = tuple(map(int, bottom_right))

        # Calculate the centers of the top and bottom of the bounding box
        # center_top = (int((top_left[0] + top_right[0]) / 2), int((top_left[1] + top_right[1]) / 2))
        # center_bottom = (int((bottom_left[0] + bottom_right[0]) / 2), int((bottom_left[1] + bottom_right[1]) / 2))

        # If the NER result is not empty, and the score is high enough
        if len(ner_result) > 0 and ner_result['score'] > redaction_score_threshold:
            # Get the entity and score
            # entity = ner_result[0]['entity']
            # score = str(ner_result[0]['score'])

            # Apply an irreversible redaction
            cv2.rectangle(cv_image, top_left, bottom_right, (0, 0, 0), -1)
        # else:
        #     entity = 'O'
        #     score = '0'
        #     # Draw the bounding box
        #     cv2.rectangle(cv_image, top_left, bottom_right, (0, 255, 0), 1)
        #     # Draw the entity and score
        #     cv2.putText(cv_image, entity, center_top, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)
        #     cv2.putText(cv_image, score, center_bottom, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

    # Save the redacted image
    print(f"Saving redacted {pdf_image_path}...")
    redacted_image_path = pdf_image_path.replace(f'.{img_format}', f'_redacted.{img_format}')

    # Save the redacted image in png format
    cv2.imwrite(redacted_image_path, cv_image)

    return redacted_image_path

# Convert the set of redacted images to a pdf
def stitch_images_to_pdf(redacted_image_files, input_pdf_path):

    # Sort the redacted images
    redacted_image_files.sort()

    # Convert the redacted images to a single PDF
    print("Converting redacted images to PDF...")
    redacted_pdf_path = input_pdf_path.replace('.pdf', '_redacted.pdf')

    doc = pymupdf.open()
    for redacted_image_file in redacted_image_files:
        img = pymupdf.open(redacted_image_file)  # open pic as document
        rect = img[0].rect  # pic dimension
        pdfbytes = img.convert_to_pdf()  # make a PDF stream
        img.close()  # no longer needed
        imgPDF = pymupdf.open("pdf", pdfbytes)  # open stream as PDF
        page = doc.new_page(width=rect.width,    # new page with ...
                            height=rect.height)  # pic dimension
        page.show_pdf_page(rect, imgPDF, 0)  # image fills the page

    doc.save(redacted_pdf_path)
    # print(f"PDF saved as {redacted_pdf_path}")

    return redacted_pdf_path

def cleanup(redacted_image_files, pdf_images, pdf_images_dir, original_pdf):

    # Remove the intermediate files
    print("Cleaning up...")

    # Remove the redacted images
    for file in redacted_image_files:
        os.remove(file)

    # Remove the pdf images
    for file in pdf_images:
        os.remove(file)

    # Remove the pdf images directory
    os.rmdir(pdf_images_dir)

    # Remove original pdf
    os.remove(original_pdf)

    return None

# Function to drive the UI
def predict(input_pdf_path, sensitivity):

    # Convert sensitivity to a redaction score threshold
    print("Setting threshold")
    redaction_score_threshold = (100 - sensitivity) / 100
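    # e.g. sensitivity 80 -> threshold 0.20: any token the NER model scores above 0.20 gets redacted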

    # Convert the PDF to images
    print("Converting pdf to images")
    pdf_images_dir = convert_to_images(input_pdf_path)

    # Get the file paths of the images
    print("Gathering converted images")
    pdf_images = glob(f'{pdf_images_dir}/*.{img_format}', recursive=True)
    pdf_images.sort()

    # Redact images
    print("Redacting images")
    redacted_image_files = []
    for pdf_image in pdf_images:
        redacted_image_files.append(redact_image(pdf_image, redaction_score_threshold))

    # Convert the redacted images to a single PDF
    print("Stitching images to pdf")
    redacted_pdf_path = stitch_images_to_pdf(redacted_image_files, input_pdf_path)

    # Clean up the intermediate files
    print("Cleaning up")
    cleanup(redacted_image_files, pdf_images, pdf_images_dir, input_pdf_path)

    return redacted_pdf_path

##########################################################################################################
contact_text = """
# Contact Information
LinkedIn: [Mitanshu Sukhwani](https://www.linkedin.com/in/mitanshusukhwani/)
Email: mitanshu.sukhwani@gmail.com
GitHub: [mitanshu7](https://github.com/mitanshu7)
"""

##########################################################################################################
# Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:

    # Title and description
    gr.Markdown("# RedactNLP: Redact your PDF!")
    gr.Markdown("## How redaction happens:")
    gr.Markdown("""
1. The PDF pages are converted to images using **[PyMuPDF](https://github.com/pymupdf/PyMuPDF)**.
2. **[EasyOCR](https://github.com/JaidedAI/EasyOCR)** is run on the converted images to extract text.
3. **[dslim/distilbert-NER](https://huggingface.co/dslim/distilbert-NER)** performs token classification on the extracted text.
4. A non-recoverable mask is applied to the identified entities using **[OpenCV](https://github.com/opencv/opencv)**.
5. The masked images are converted back into a PDF using **[PyMuPDF](https://github.com/pymupdf/PyMuPDF)**.
""")
    gr.Markdown("*Note: If you already have an ML setup, it is preferable to download the [GitHub repo](https://github.com/mitanshu7/RedactNLP) and run it offline. That offers better privacy and can use a GPU for (much) faster computation with a stronger model such as **[FacebookAI/xlm-roberta-large-finetuned-conll03-english](https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-english)** or **[blaze999/Medical-NER](https://huggingface.co/blaze999/Medical-NER)**.*")

    # Input section
    pdf_file_input = gr.File(file_count='single', file_types=['pdf'], label='Upload PDF', show_label=True, interactive=True)

    # Slider for redaction sensitivity
    slider_input = gr.Slider(
        minimum=0, maximum=100, value=80, step=1,
        label="Redaction sensitivity. Higher values are more sensitive and redact more aggressively."
    )

    # Submission button
    submit_btn = gr.Button("Redact")

    # Output section
    output = gr.File(file_count='single', file_types=['pdf'], label='Download redacted PDF', show_label=True, interactive=False)

    # Attribution
    gr.Markdown(contact_text)

    # Link button click to the prediction function
    submit_btn.click(predict, [pdf_file_input, slider_input], output)

################################################################################
if __name__ == "__main__":
    demo.launch()