Spaces:
Build error
Build error
| import os | |
| import traceback | |
| import argparse | |
| from typing import List, Tuple, Set, Dict | |
| import time | |
| from PIL import Image | |
| import numpy as np | |
| from doctr.models import ocr_predictor | |
| import logging | |
| import pandas as pd | |
| from bs4 import BeautifulSoup | |
| import gradio | |
| from utils import cropImages | |
| from utils import draw_only_box,draw_box_with_text,getlogger,Annotation | |
| from ocr_component1 import OCRComponent1 | |
| from detectionAndOcrTable1 import DetectionAndOcrTable1 | |
| from detectionAndOcrTable2 import DetectionAndOcrTable2 | |
| from detectionAndOcrTable3 import DetectionAndOcrTable3 | |
| from detectionAndOcrTable4 import DetectionAndOcrTable4 | |
| from ocrTable1 import OcrTable1 | |
| from ocrTable2 import OcrTable2 | |
| from pdf2image import convert_from_path | |
def convertHTMLToCSV(html:str,output_path:str)->str:
    """Write the first ``<table>`` found in *html* to *output_path* as CSV.

    The first ``<tr>`` of the table supplies the column names and every
    following ``<tr>`` becomes one data row. Only the direct ``<th>``/``<td>``
    children of a row are read, so whitespace text nodes between cells do not
    turn into spurious columns.

    Args:
        html: HTML markup containing at least one ``<table>``.
        output_path: Destination path of the CSV file.

    Returns:
        ``output_path``, matching the declared ``-> str`` return annotation.

    Raises:
        ValueError: If *html* contains no ``<table>`` or the table has no
            ``<tr>``. pandas also raises ``ValueError`` when a data row has a
            different number of cells than the header row.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find("table")
    if table is None:
        raise ValueError("no <table> element found in html")

    rows = table.find_all("tr")
    if not rows:
        raise ValueError("the first <table> contains no <tr> rows")

    def cell_texts(row):
        # recursive=False keeps cells of nested tables out of this row.
        return [cell.get_text() for cell in row.find_all(["th", "td"], recursive=False)]

    list_header = cell_texts(rows[0])
    data = [cell_texts(row) for row in rows[1:]]

    # Store the rows in a DataFrame and let pandas handle CSV quoting.
    dataFrame = pd.DataFrame(data = data, columns = list_header)
    dataFrame.to_csv(output_path)
    return output_path
def saveResults(image_list, results, labels, output_dir='output/', threshold=0.5):
    """Draw boxes on each image and save the results as numbered JPEG files.

    Args:
        image_list: Images to annotate; the i-th image is paired with
            ``results[i]``.
        results: Per-image detection results handed to ``draw_only_box``.
        labels: Labels forwarded to ``draw_only_box``.
        output_dir: Directory receiving ``<index>.jpg`` files; created when it
            does not exist yet.
        threshold: Threshold forwarded to ``draw_only_box``.
    """
    directory_missing = not os.path.exists(output_dir)
    if directory_missing:
        os.makedirs(output_dir)

    for position, picture in enumerate(image_list):
        annotated = draw_only_box(picture, results[position], labels, threshold=threshold)
        destination = os.path.join(output_dir, f"{position}.jpg")
        annotated.save(destination, quality=95)
        print(f"save result to: {destination}")
def InputToImages(input_path:str,resolution=300)-> List[Image.Image]:
    """Open the image at *input_path* and return it wrapped in a list.

    RGBA images are converted to RGB. Any exception raised while opening or
    converting is printed as a traceback and an empty list is returned instead.

    Args:
        input_path: File location of the image.
        resolution: Not used by this function.

    Returns:
        A list holding the single Pillow image, or ``[]`` on failure.
    """
    try:
        opened = Image.open(input_path)
        loaded = opened.convert('RGB') if opened.mode == 'RGBA' else opened
    except Exception:
        traceback.print_exc()
        return []
    return [loaded]
def drawTextDetRes(bxs :List[List[float]],img:Image.Image,output_path:str):
    """Draw layout analysis results on *img* and save the picture.

    Each entry of *bxs* is indexed as a sequence of points; it is reduced to
    ``[xmin, ymin, xmax, ymax]`` using the first point, the x of the second
    point and the y of the last point. Entries whose corners are not ordered
    (xmin > xmax or ymin > ymax) are skipped.

    Args:
        bxs: Detected boxes.
            NOTE(review): the indexing ``b[0][0]`` implies a list of points
            per box rather than the annotated list of floats - confirm.
        img: Image the boxes are drawn on via ``draw_only_box``.
        output_path: Where the annotated image is saved (quality 95).
    """
    rectangles = []
    for quad in bxs:
        first, second, last = quad[0], quad[1], quad[-1]
        if not (first[0] <= second[0] and first[1] <= last[1]):
            continue
        rectangles.append([first[0], first[1], second[0], last[1]])

    annotated = draw_only_box(img, rectangles)
    annotated.save(output_path, quality=95)
def test_ocr_component1(test_file="TestingFiles/OCRTest1German.pdf", debug_folder = './res/table1/',englishFlag = False):
    """Run OCR on every page of a PDF and save a line-box overlay per page.

    Each page is rendered with ``convert_from_path`` and passed to
    ``OCRComponent1.predict`` as a numpy array. For every detected line the
    words are joined (each followed by a space), printed, and appended to the
    page text. The line boxes are drawn on the page image and saved as
    ``<pdf name>_<page>_bbox_detection.png`` inside *debug_folder*.

    Args:
        test_file: Path of the PDF to process.
        debug_folder: Folder receiving the overlay images; created when
            missing. An empty string saves into the working directory.
        englishFlag: Forwarded to ``OCRComponent1``.

    Returns:
        ``(ocr_results, all_text_in_pages)``: two dicts keyed by zero-based
        page number, holding the predictor's line annotations and the
        recognised text (one line per detected line) respectively.
    """
    images = convert_from_path(test_file)
    ocr = OCRComponent1(englishFlag)

    # Base name without assuming "/" separators or a four-character extension.
    imgname = os.path.splitext(os.path.basename(test_file))[0]
    if debug_folder:
        # Saving the overlay fails when the folder is absent.
        os.makedirs(debug_folder, exist_ok=True)

    ocr_results = {}
    all_text_in_pages = {}
    for page_number, img in enumerate(images):
        line_annotations = ocr.predict(img = np.array(img))
        ocr_results[page_number] = line_annotations

        line_boxes_to_draw = []
        page_lines = []
        for ann in line_annotations.values():
            line_boxes_to_draw.append(ann.box)
            # Keep the trailing space after every word so the text is unchanged.
            line_words = "".join(wordann.text + " " for wordann in ann.words)
            print(line_words)
            page_lines.append(line_words + "\n")

        img_to_save1 = draw_only_box(img, line_boxes_to_draw)
        overlay_path = os.path.join(debug_folder, f"{imgname}_{page_number}_bbox_detection.png")
        img_to_save1.save(overlay_path, quality=95)
        all_text_in_pages[page_number] = "".join(page_lines)
    return ocr_results, all_text_in_pages
def test_tableOcrOnly1(test_file :Image.Image , debug_folder = './res/table1/',denoise = False,englishFlag = False):
    """Recognise a cropped table image with ``OcrTable1`` and return its code.

    Per the original notes this is the hybrid Unitable + DocTR variant, suited
    to tables that contain a lot of text.

    Args:
        test_file: Pillow image of the cropped table; converted to RGB.
        debug_folder: Not used by this function.
        denoise: Forwarded to ``OcrTable1.predict``.
        englishFlag: Forwarded to the ``OcrTable1`` constructor.

    Returns:
        Whatever ``OcrTable1.predict`` returns for the single image.
    """
    recognizer = OcrTable1(englishFlag)
    rgb_table_image = test_file.convert("RGB")
    return recognizer.predict([rgb_table_image], denoise = denoise)
def test_tableOcrOnly2(test_file:Image.Image, debug_folder = './res/table2/'):
    """Recognise a cropped table image with ``OcrTable2`` and return the result.

    Per the original notes this is the full Unitable variant, suited to tables
    without much text.

    Args:
        test_file: Pillow image of the cropped table; converted to RGB.
        debug_folder: Passed as the second argument to ``OcrTable2.predict``.

    Returns:
        The value returned by ``OcrTable2.predict``. The prediction used to be
        discarded, which left the UI output bound to this function empty.
    """
    table = OcrTable2()
    image = test_file.convert("RGB")
    return table.predict([image], debug_folder)
def test_table_component1(test_file = 'TestingFiles/TableOCRTestEnglish.pdf', debug_folder ='./res/table_debug2/',denoise = False,englishFlag = True):
    """Detect and recognise tables on every page of a PDF with ``DetectionAndOcrTable1``.

    For each page the predictor receives the page image plus a per-page file
    prefix (``<debug_folder>/<pdf name>_<page>_``). Every returned table code
    is written to ``<prefix><index>output.xls``.

    Args:
        test_file: Path of the PDF to process.
        debug_folder: Folder for debug output; created when missing.
        denoise: Forwarded to the predictor.
        englishFlag: Forwarded to the ``DetectionAndOcrTable1`` constructor.

    Returns:
        A list with the table codes of all pages in page order (empty when
        the PDF yields no pages or no tables).
    """
    table_predictor = DetectionAndOcrTable1(englishFlag)
    images = convert_from_path(test_file)

    # Base name without assuming "/" separators or a four-character extension.
    filename = os.path.splitext(os.path.basename(test_file))[0]
    if debug_folder:
        # open(..., 'w') below fails when the folder is absent.
        os.makedirs(debug_folder, exist_ok=True)

    all_table_codes = []
    for page_number, img in enumerate(images):
        print("Looking at page:")
        print(page_number)
        debugfolder_filename_page_name = os.path.join(debug_folder, f"{filename}_{page_number}_")
        table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name,denoise = denoise)
        for index, table_code in enumerate(table_codes):
            with open(f"{debugfolder_filename_page_name}{index}output.xls", 'w', encoding="utf-8") as file:
                file.write(table_code)
            # Collect per table so results from earlier pages are not lost.
            all_table_codes.append(table_code)
    return all_table_codes
def test_table_component2(test_file = 'TestingFiles/TableOCRTestEnglish.pdf', debug_folder ='./res/table_debug2/'):
    """Detect and recognise tables on every page of a PDF with ``DetectionAndOcrTable2``.

    Per the original notes this component takes whole PDF pages, scans them
    for tables and returns the tables in HTML format, using the full Unitable
    model. Every returned table code is also written to
    ``<debug_folder>/<pdf name>_<page>_<index>output.xls``.

    Args:
        test_file: Path of the PDF to process.
        debug_folder: Folder for debug output; created when missing.

    Returns:
        A list with the table codes of all pages in page order (empty when
        the PDF yields no pages or no tables).
    """
    table_predictor = DetectionAndOcrTable2()
    images = convert_from_path(test_file)

    # Base name without assuming "/" separators or a four-character extension.
    filename = os.path.splitext(os.path.basename(test_file))[0]
    if debug_folder:
        # open(..., 'w') below fails when the folder is absent.
        os.makedirs(debug_folder, exist_ok=True)

    all_table_codes = []
    for page_number, img in enumerate(images):
        print("Looking at page:")
        print(page_number)
        debugfolder_filename_page_name = os.path.join(debug_folder, f"{filename}_{page_number}_")
        table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name)
        for index, table_code in enumerate(table_codes):
            with open(f"{debugfolder_filename_page_name}{index}output.xls", 'w', encoding="utf-8") as file:
                file.write(table_code)
            # Collect per table so results from earlier pages are not lost.
            all_table_codes.append(table_code)
    return all_table_codes
def test_table_component3(test_file = 'TestingFiles/TableOCRTestEnglish.pdf',debug_folder ='./res/table_debug3/',denoise = False,englishFlag = True):
    """Detect and recognise tables on every page of a PDF with ``DetectionAndOcrTable3``.

    For each page the predictor receives the page image plus a per-page file
    prefix (``<debug_folder>/<pdf name>_<page>_``). Every returned table code
    is written to ``<prefix><index>output.xls``.

    Args:
        test_file: Path of the PDF to process.
        debug_folder: Folder for debug output; created when missing.
        denoise: Accepted but not passed to the predictor.
            NOTE(review): confirm whether ``DetectionAndOcrTable3.predict``
            should receive it.
        englishFlag: Forwarded to the ``DetectionAndOcrTable3`` constructor.

    Returns:
        A list with the table codes of all pages in page order (empty when
        the PDF yields no pages or no tables).
    """
    table_predictor = DetectionAndOcrTable3(englishFlag)
    images = convert_from_path(test_file)

    # Base name without assuming "/" separators or a four-character extension.
    filename = os.path.splitext(os.path.basename(test_file))[0]
    if debug_folder:
        # open(..., 'w') below fails when the folder is absent.
        os.makedirs(debug_folder, exist_ok=True)

    all_table_codes = []
    for page_number, img in enumerate(images):
        print("Looking at page:")
        print(page_number)
        debugfolder_filename_page_name = os.path.join(debug_folder, f"{filename}_{page_number}_")
        table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name)
        for index, table_code in enumerate(table_codes):
            with open(f"{debugfolder_filename_page_name}{index}output.xls", 'w', encoding="utf-8") as file:
                file.write(table_code)
            # Collect per table so results from earlier pages are not lost.
            all_table_codes.append(table_code)
    return all_table_codes
def test_table_component4(test_file = 'TestingFiles/TableOCRTestEnglish.pdf',debug_folder ='./res/table_debug3/'):
    """Detect and recognise tables on every page of a PDF with ``DetectionAndOcrTable4``.

    For each page the predictor receives the page image plus a per-page file
    prefix (``<debug_folder>/<pdf name>_<page>_``). Every returned table code
    is written to ``<prefix><index>output.xls``.

    Args:
        test_file: Path of the PDF to process.
        debug_folder: Folder for debug output; created when missing.

    Returns:
        A list with the table codes of all pages in page order (empty when
        the PDF yields no pages or no tables).
    """
    table_predictor = DetectionAndOcrTable4()
    images = convert_from_path(test_file)

    # Base name without assuming "/" separators or a four-character extension.
    filename = os.path.splitext(os.path.basename(test_file))[0]
    if debug_folder:
        # open(..., 'w') below fails when the folder is absent.
        os.makedirs(debug_folder, exist_ok=True)

    all_table_codes = []
    for page_number, img in enumerate(images):
        print("Looking at page:")
        print(page_number)
        debugfolder_filename_page_name = os.path.join(debug_folder, f"{filename}_{page_number}_")
        table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name)
        for index, table_code in enumerate(table_codes):
            with open(f"{debugfolder_filename_page_name}{index}output.xls", 'w', encoding="utf-8") as file:
                file.write(table_code)
            # Collect per table so results from earlier pages are not lost.
            all_table_codes.append(table_code)
    return all_table_codes
# Disabled command-line entry point: everything between the triple quotes is a
# bare string literal, so none of this argparse dispatch runs.
# NOTE(review): before re-enabling, check the positional argument order - the
# table1 / pdftable1 / pdftable3 calls pass args.englishFlag before
# args.denoise, while those functions declare denoise before englishFlag.
# NOTE(review): test_tableOcrOnly1 / test_tableOcrOnly2 call .convert("RGB")
# on their first argument, so they expect an image object rather than the
# path string parsed here.
# NOTE(review): argparse `type=bool` turns any non-empty string (including
# "False") into True; `action="store_true"` is the usual replacement.
| """ | |
| parser = argparse.ArgumentParser(description='Process some strings.') | |
| parser.add_argument('ocr', type=str, help='type in id of the component to test') | |
| parser.add_argument('--test_file',type=str, help='path to the testing file') | |
| parser.add_argument('--debug_folder',type=str, help='path to the folder you want to save your results in') | |
| parser.add_argument('--englishFlag',type=bool, help='Whether your pdf is in english => could lead to better results ') | |
| parser.add_argument('--denoise',type=bool, help='preprocessing for not clean scans ') | |
| args = parser.parse_args() | |
| start = time.time() | |
| if args.ocr == "ocr1": | |
| test_ocr_component1(args.test_file,args.debug_folder, args.englishFlag) | |
| elif args.ocr == "table1": | |
| test_tableOcrOnly1(args.test_file,args.debug_folder,args.englishFlag,args.denoise) | |
| elif args.ocr == "table2": | |
| test_tableOcrOnly2(args.test_file,args.debug_folder) | |
| elif args.ocr =="pdftable1": | |
| test_table_component1(args.test_file,args.debug_folder,args.englishFlag,args.denoise) | |
| elif args.ocr =="pdftable2": | |
| test_table_component2(args.test_file,args.debug_folder) | |
| elif args.ocr =="pdftable3": | |
| test_table_component3(args.test_file,args.debug_folder,args.englishFlag,args.denoise) | |
| elif args.ocr =="pdftable4": | |
| test_table_component4(args.test_file,args.debug_folder) | |
| """ | |
| import gradio as gr | |
| from gradio_pdf import PDF | |
# Gradio demo: one section per OCR / table-extraction entry point defined
# above. Components are created in display order inside the Blocks context.
with gr.Blocks() as demo:
    # ---- Plain OCR over a PDF -------------------------------------------
    gr.Markdown("# OCR component")
    inputs_for_ocr = [
        PDF(label="Document"),
        gr.Textbox(label="internal debug folder", placeholder="./res/table1/"),
        gr.Checkbox(label="English Document?", value=False),
    ]
    ocr_btn = gr.Button("Run ocr")
    gr.Examples(
        examples=[["TestingFiles/OCRTest1German.pdf", './res/table1/', False]],
        inputs=inputs_for_ocr,
    )
    outputs_for_ocr = [
        gr.Textbox(label="List of annotation objects"),
        # The first positional argument of Textbox is its value, so the
        # caption has to be passed as label=.
        gr.Textbox(label="Text in page"),
    ]
    ocr_btn.click(
        fn=test_ocr_component1,
        inputs=inputs_for_ocr,
        outputs=outputs_for_ocr,
        api_name="OCR",
    )

    # ---- PDF -> tables, component 1 -------------------------------------
    gr.Markdown("# Table OCR components that takes a pdf, extract table and return their html code ")
    gr.Markdown("## Component 1 uses table transformer and doctr +Unitable")
    inputs_for_pdftable1 = [
        PDF(label="Document"),
        gr.Textbox(label="internal debug folder", placeholder="./res/table1/"),
        gr.Checkbox(label="Denoise?", value=False),
        gr.Checkbox(label="English Document?", value=False),
    ]
    table1_btn = gr.Button("Run pdftable1")
    # NOTE(review): the example row has three values for four inputs, so the
    # "English Document?" checkbox gets no example value - confirm intended.
    gr.Examples(
        examples=[["TestingFiles/OCRTest5English.pdf", './res/table1/', False]],
        inputs=inputs_for_pdftable1,
    )
    outputs_for_pdftable1 = [gr.Textbox(label="Table code")]
    table1_btn.click(
        fn=test_table_component1,
        inputs=inputs_for_pdftable1,
        outputs=outputs_for_pdftable1,
        api_name="pdfTable1",
    )

    # ---- PDF -> tables, component 2 -------------------------------------
    gr.Markdown("## Component 2 uses table transformer and Unitable")
    inputs_for_pdftable2 = [
        PDF(label="Document"),
        gr.Textbox(label="internal debug folder", placeholder="./res/table1/"),
    ]
    table2_btn = gr.Button("Run pdftable2")
    # Bound to this section's own inputs (previously wired to component 1's
    # widgets); the row carries one value per input.
    gr.Examples(
        examples=[["TestingFiles/OCRTest5English.pdf", './res/table1/']],
        inputs=inputs_for_pdftable2,
    )
    outputs_for_pdftable2 = [gr.Textbox(label="Table code")]
    table2_btn.click(
        fn=test_table_component2,
        inputs=inputs_for_pdftable2,
        outputs=outputs_for_pdftable2,
        api_name="pdfTable2",
    )

    # ---- PDF -> tables, component 3 -------------------------------------
    gr.Markdown("## Component 3 uses Yolo and Unitable+doctr")
    inputs_for_pdftable3 = [
        PDF(label="Document"),
        gr.Textbox(label="internal debug folder", placeholder="./res/table1/"),
        gr.Checkbox(label="Denoise?", value=False),
        gr.Checkbox(label="English Document?", value=False),
    ]
    table3_btn = gr.Button("Run pdftable3")
    # Bound to this section's own inputs (previously wired to component 1's).
    gr.Examples(
        examples=[["TestingFiles/TableOCRTestEnglish.pdf", './res/table1/', False]],
        inputs=inputs_for_pdftable3,
    )
    outputs_for_pdftable3 = [gr.Textbox(label="Table code")]
    table3_btn.click(
        fn=test_table_component3,
        inputs=inputs_for_pdftable3,
        outputs=outputs_for_pdftable3,
        api_name="pdfTable3",
    )

    # ---- PDF -> tables, component 4 -------------------------------------
    gr.Markdown("## Component 4 uses Yolo and Unitable")
    inputs_for_pdftable4 = [
        PDF(label="Document"),
        gr.Textbox(label="internal debug folder", placeholder="./res/table1/"),
    ]
    table4_btn = gr.Button("Run pdftable4")
    # Bound to this section's own inputs (previously wired to component 1's);
    # the row carries one value per input.
    gr.Examples(
        examples=[["TestingFiles/TableOCRTestEasier.pdf", './res/table1/']],
        inputs=inputs_for_pdftable4,
    )
    outputs_for_pdftable4 = [gr.Textbox(label="Table code")]
    table4_btn.click(
        fn=test_table_component4,
        inputs=inputs_for_pdftable4,
        outputs=outputs_for_pdftable4,
        api_name="pdfTable4",
    )

    # ---- Cropped table image -> HTML, variant 1 -------------------------
    gr.Markdown("# Table OCR component that takes image of an cropped tavle, extract table and return their html code ")
    inputs_for_table1 = [
        gr.Image(label="Image of cropped table", type='pil'),
        gr.Textbox(label="internal debug folder", placeholder="./res/table1/"),
        gr.Checkbox(label="Denoise?", value=False),
        gr.Checkbox(label="English Document?", value=False),
    ]
    onlytable1_btn = gr.Button("Run table1")
    # NOTE(review): the example image is opened at import time, so the app
    # fails to start when cropped_table.png is missing - confirm it ships
    # with the app.
    gr.Examples(
        examples=[[Image.open("cropped_table.png"), './res/table1/', False]],
        inputs=inputs_for_table1,
    )
    outputs_for_table1 = [gr.HTML(label="Table code")]
    onlytable1_btn.click(
        fn=test_tableOcrOnly1,
        inputs=inputs_for_table1,
        outputs=outputs_for_table1,
        api_name="table1",
    )

    # ---- Cropped table image -> HTML, variant 2 -------------------------
    gr.Markdown("## Another Table OCR component that takes image of an cropped table, extract table and return their html code ")
    inputs_for_table2 = [
        gr.Image(label="Image of cropped table", type='pil'),
        gr.Textbox(label="internal debug folder", placeholder="./res/table1/"),
    ]
    onlytable2_btn = gr.Button("Run table2")
    # One example value per input (the stray third value was dropped).
    gr.Examples(
        examples=[[Image.open("cropped_table.png"), './res/table1/']],
        inputs=inputs_for_table2,
    )
    outputs_for_table2 = [gr.HTML(label="Table code")]
    onlytable2_btn.click(
        fn=test_tableOcrOnly2,
        inputs=inputs_for_table2,
        outputs=outputs_for_table2,
        api_name="table2",
    )

demo.launch(share=True)