Spaces:

digitiamosrl
/

document_info_extractor

Build error

App Files Files Community

document_info_extractor / app.py

MrFeelgoood

Update app.py

846b634 over 2 years ago

raw

history blame contribute delete

10.2 kB

	import spacy
	from spacy.language import Language
	from spacy.lang.it import Italian
	import re
	from transformers import pipeline
	from gradio.inputs import File
	import gradio as gr
	from pdf2image import convert_from_path
	import pytesseract
	import tempfile
	import os
	from gradio.inputs import Dropdown
	import gradio as gr
	import tempfile
	import os
	from pdf2image import convert_from_path
	import pytesseract
	import fitz
	from pdf2image import convert_from_bytes


	def preprocess_punctuation(text):
	    """Collect the distinct dotted, abbreviation-like tokens found in ``text``.

	    A token qualifies when it consists of short groups of letters/dots joined
	    by dots and closed by a dot, is not preceded by a lowercase letter, and
	    is not followed by whitespace plus an uppercase letter.

	    Returns:
	        A list with each matching string once. The order is whatever the
	        intermediate set yields, so callers must not rely on it.
	    """
	    abbreviation_re = r'(?<![a-z])[a-zA-Z\.]{1,4}(?:\.[a-zA-Z\.]{1,4})\.(?!\s[A-Z])'
	    unique_hits = set(re.findall(abbreviation_re, text))
	    return list(unique_hits)


	def preprocess_text(text):
	    """Remove blank lines from ``text``.

	    Every run of whitespace that starts and ends with a newline is replaced
	    by a single newline; any consecutive newlines still left afterwards are
	    squeezed into one as well.
	    """
	    without_blank_lines = re.sub(r'\n\s*\n', '\n', text)
	    return re.sub(r'\n{2,}', '\n', without_blank_lines)



	@Language.component('custom_tokenizer')
	def custom_tokenizer(doc):
	    """Pipeline component that stops a colon from ending a sentence.

	    For every ":" token except one in last position, the token that follows
	    it is marked as not being a sentence start. The same ``doc`` is returned.
	    """
	    colon_positions = [tok.i for tok in doc[:-1] if tok.text == ":"]
	    for position in colon_positions:
	        doc[position + 1].is_sent_start = False
	    return doc



	def get_sentences(text, dictionary = None):
	    """Split ``text`` into cleaned sentences with the Italian spaCy model.

	    The "it_core_news_lg" pipeline is loaded on every call, the
	    ``custom_tokenizer`` component is inserted before the parser, and each
	    abbreviation reported by ``preprocess_punctuation`` is registered as a
	    tokenizer special case before the text is processed.

	    Args:
	        text: The raw text to segment.
	        dictionary: Accepted but not used by this function.

	    Returns:
	        The non-empty sentences, stripped of surrounding spaces/newlines and
	        with runs of spaces collapsed to a single space.
	    """
	    strip_chars = ''.join({' ', '\n'})
	    nlp_it = spacy.load("it_core_news_lg")
	    nlp_it.add_pipe("custom_tokenizer", before="parser")

	    # Register each abbreviation as a special case made of one piece.
	    for abbreviation in preprocess_punctuation(text):
	        special_case = [{spacy.symbols.ORTH: abbreviation, spacy.symbols.NORM: abbreviation}]
	        nlp_it.tokenizer.add_special_case(abbreviation, special_case)

	    cleaned = []
	    for span in nlp_it(text).sents:
	        trimmed = span.text.strip(strip_chars)
	        squeezed = ' '.join(piece for piece in trimmed.split(' ') if piece)
	        if squeezed:
	            cleaned.append(squeezed)
	    return cleaned




	def _parse_number(word):
	    """Return the number embedded in ``word``, or ``None`` if it has no digits."""
	    # Keep digits and separators only; trailing separators are sentence
	    # punctuation ("12,5." must read as 12.5, not 125).
	    cleaned = re.sub(r'[^0-9.,]+', '', word).rstrip('.,')
	    if '.' in cleaned and ',' in cleaned:
	        # Both separators present: the right-most one is the decimal mark,
	        # the other one only groups thousands ("1.234,56" -> 1234.56).
	        thousands = ',' if cleaned.rfind('.') > cleaned.rfind(',') else '.'
	        cleaned = cleaned.replace(thousands, '')
	    normalized = cleaned.replace(',', '.')
	    if normalized.replace('.', '', 1).isdigit():
	        return float(normalized)
	    # Anything else that still holds digits is read as one integer.
	    digits = re.sub(r'[^0-9]+', '', word)
	    return int(digits) if digits else None


	def extract_numbers(text, given_strings):
	    """Extract the numbers that sit next to a keyword in ``text``.

	    The text is split on whitespace. For every word containing one of
	    ``given_strings`` a window made of the previous word, the word itself and
	    the next word is inspected. Windows in which a word starts with ``+``,
	    ``*`` or ``/`` are skipped.

	    Args:
	        text: The sentence to scan.
	        given_strings: Keyword substrings searched inside each word.
	            NOTE: matching is per whitespace-delimited word, so a keyword
	            that itself contains a space can never match.

	    Returns:
	        A flat list with one entry per inspected word: a ``float`` for a
	        plain or decimal number, an ``int`` when only a digit string can be
	        recovered, and ``None`` for words without digits. Overlapping
	        windows contribute their words more than once.
	    """
	    words = text.split()
	    numbers = []
	    for index, word in enumerate(words):
	        if not any(s in word for s in given_strings):
	            continue
	        # Slicing clamps the upper bound, so only the lower one needs max().
	        context = words[max(index - 1, 0):index + 2]
	        if any(re.match(r'[+\*/]', candidate) for candidate in context):
	            continue
	        numbers.extend(_parse_number(candidate) for candidate in context)
	    return numbers



	def get_text_and_values(text, key_list):
	    """Map each sentence of ``text`` to the values extracted from it.

	    The text is segmented with ``get_sentences`` and every sentence is run
	    through ``extract_numbers`` with ``key_list`` as keywords. Sentences for
	    which the extraction returns an empty list are left out.

	    Returns:
	        A dict ``{sentence: extracted_values}``.
	    """
	    info = {}
	    for sentence in get_sentences(text):
	        found = extract_numbers(text = sentence, given_strings = key_list)
	        if found:
	            info[sentence] = found
	    return info


	def get_useful_text(dictionary):
	    """Join the keys of ``dictionary`` into one string, separated by a dashed rule."""
	    separator = '\n------------------------\n'
	    return separator.join(dictionary)

	def get_values(dictionary):
	    """Return the values of ``dictionary`` as a list, in insertion order."""
	    return [*dictionary.values()]


	def initialize_qa_transformer(model):
	    """Create a "text2text-generation" transformers pipeline for ``model``."""
	    return pipeline("text2text-generation", model=model)


	def get_answers_unfiltered(dictionary, question, qa_pipeline):
	    """Ask ``question`` about every key of ``dictionary``.

	    Each key is sent to ``qa_pipeline`` as the prompt
	    ``"<key> Domanda: <question>"``.

	    Returns:
	        The raw pipeline results, one per key, in key order.
	    """
	    return [
	        qa_pipeline(f'{sentence} Domanda: {question}')
	        for sentence in list(dictionary)
	    ]


	def get_total(answered_values, text, keywords, raw_values, unique_values = False):
	    """Validate the answered values and compute the overall total.

	    Args:
	        answered_values: Iterable of lists of dicts with string values (the
	            raw QA pipeline results); every number found in those strings is
	            a candidate value.
	        text: The full document text, searched for explicit totals.
	        keywords: Words announcing a total. Each one is interpolated into a
	            regular expression as-is (not escaped).
	        raw_values: Lists of values extracted from the text; only their
	            ``int``/``float`` entries are used to validate candidates.
	        unique_values: When true, duplicate candidates are dropped.

	    Returns:
	        ``(numbers, total)``. ``numbers`` holds the candidates that are at
	        least 5.0 and also appear in ``raw_values``. ``total`` is the sum of
	        the explicit totals found in ``text`` (a keyword followed by up to
	        three words and then an integer that is one of ``numbers``); if no
	        explicit total is found it is the sum of ``numbers``.
	    """
	    known_values = {
	        num for sublist in raw_values for num in sublist
	        if isinstance(num, (int, float))
	    }

	    answer_number = re.compile(r'\d+(?:[.,]\d+)?')
	    numbers = []
	    for answer_group in answered_values:
	        for answer in answer_group:
	            for generated in answer.values():
	                # Decimal commas become dots so float() can parse them.
	                for match in answer_number.findall(generated.replace(',', '.')):
	                    value = float(match)
	                    if value >= 5.0 and value in known_values:
	                        numbers.append(value)
	    if unique_values:
	        numbers = list(set(numbers))

	    explicit_totals = []
	    for keyword in keywords:
	        # The keyword, up to three words, then an integer.
	        keyword_pattern = f'{keyword}(\\s+\\w+){{0,3}}\\s+(\\d+)'
	        match = re.search(keyword_pattern, text, re.IGNORECASE)
	        if not match:
	            continue
	        # Compare as a number: the matched text is a str and would never
	        # equal the floats held in ``numbers``.
	        number = int(match.group(2))
	        if number in numbers:
	            explicit_totals.append(number)
	            print(f"Found a value ({number}) for keyword '{keyword}'.")

	    # More than one explicit total means several lots: add them up.
	    if explicit_totals:
	        return numbers, sum(explicit_totals)
	    return numbers, sum(numbers)



	def extractor_clean(text, k_words, transformer, question, total_kwords, return_text = False):
	    """Run the whole extraction: sentences, QA answers, validated values, total.

	    Args:
	        text: The document text.
	        k_words: Keywords passed to ``get_text_and_values``.
	        transformer: Model identifier passed to ``initialize_qa_transformer``.
	        question: Question asked about every selected sentence.
	        total_kwords: Keywords passed to ``get_total`` to locate totals.
	        return_text: When truthy, the selected sentences are returned too.

	    Returns:
	        ``(values, return_text, useful_text)`` when ``return_text`` is truthy,
	        otherwise ``(values, return_text)``; ``values`` is the
	        ``(numbers, total)`` pair produced by ``get_total``.
	    """
	    dictionary = get_text_and_values(text, k_words)
	    raw = get_values(dictionary)
	    qa = initialize_qa_transformer(transformer)
	    answers = get_answers_unfiltered(dictionary, question = question, qa_pipeline = qa)
	    values = get_total(answered_values = answers, raw_values = raw, text = text,
	                       keywords = total_kwords, unique_values = True)
	    if return_text:
	        return values, return_text, get_useful_text(dictionary)
	    # Plain fall-through so that any falsy flag (e.g. None) still yields a
	    # tuple instead of an implicit None.
	    return values, return_text



	def pdf_ocr(file, model_t, question):
	    """Extract the area values from a PDF file.

	    The text layer of the PDF is read with PyMuPDF; when it holds no
	    non-whitespace text, the pages are rendered to images and run through
	    Tesseract with the Italian language pack instead.

	    Args:
	        file: Path of the PDF file.
	        model_t: Model identifier forwarded to ``extractor_clean``.
	        question: Question forwarded to ``extractor_clean``.

	    Returns:
	        ``(immobile_values, total_output, text_output)``: a numbered listing
	        of the values in ascending order, the total followed by " Mq", and
	        the reference sentences returned by ``extractor_clean``.
	    """
	    with open(file, "rb") as f:
	        content = f.read()

	    with fitz.open(stream=content, filetype="pdf") as doc:
	        text = "".join(page.get_text() for page in doc)

	    # A text layer made only of whitespace is as useless as no text layer,
	    # so it triggers the OCR fallback as well.
	    if not text.strip():
	        text = "".join(
	            pytesseract.image_to_string(img, lang='ita')
	            for img in convert_from_bytes(content)
	        )

	    ks = ('mq', 'MQ', 'Mq', 'metri quadri', 'm2')
	    totalK = ['totale', 'complessivo', 'complessiva']

	    extracted_values = extractor_clean(text=text, k_words=ks, transformer=model_t, question=question, total_kwords=totalK, return_text=True)
	    values_output, total_value = extracted_values[0]
	    text_output = extracted_values[2]
	    total_output = f'{total_value} Mq'

	    immobile_values = '\n'.join(
	        f'{i + 1}. Immobile : {value} Mq\n'
	        for i, value in enumerate(sorted(values_output))
	    )

	    return immobile_values, total_output, text_output


	def ocr_interface(pdf_file, model_t='it5/it5-base-question-answering', question="Quanti metri quadri misura l'immobile?"):
	    """UI callback: run ``pdf_ocr`` on the uploaded file.

	    ``pdf_file`` must expose the path of the upload as ``.name``. The three
	    results of ``pdf_ocr`` are returned unchanged, in the same order.
	    """
	    pdf_path = pdf_file.name
	    area_values, area_total, reference_text = pdf_ocr(pdf_path, model_t, question)
	    return area_values, area_total, reference_text


	# Start the UI
	# Builds the Gradio Blocks demo: a PDF upload, a model dropdown, a question
	# dropdown, three output text boxes and an "Extract" button.
	# NOTE(review): nesting indentation is missing in this copy of the file; the
	# Tab/Row/Column contexts below need their bodies re-indented before the
	# script can run — confirm the intended layout with the author.
	with gr.Blocks(theme=gr.themes.Soft()) as demo:

	gr.Markdown(
	'''
	# PDF Mq Extractor
	Demo for ITAL-IA
	''')
	with gr.Tab("Extractor"):
	with gr.Row():
	pdf_input = gr.components.File(label="PDF File")

	with gr.Row():
	# Both dropdowns default to the values that ocr_interface also uses as defaults.
	model_input = gr.components.Dropdown(['it5/it5-base-question-answering', 'it5/it5-small-question-answering'],
	value='it5/it5-base-question-answering', label = 'Select model')
	question_input = gr.components.Dropdown(["Quanti metri quadri misura l'immobile?"],
	value = "Quanti metri quadri misura l'immobile?", label = 'Question')

	with gr.Column():
	gr.Markdown(
	'''
	# Output values
	Values extracted from the pdf document
	''')

	with gr.Row():

	text_output = gr.components.Textbox(label="Ref. Text")
	values_output = gr.components.Textbox(label="Area Values - sorted by value")
	total_output = gr.components.Textbox(label="Total")

	with gr.Row():
	extract_button = gr.Button("Extract")


	# Clicking the button calls ocr_interface(file, model, question); its three
	# return values fill the values, total and reference-text boxes, in that order.
	extract_button.click(fn = ocr_interface,
	inputs=[pdf_input, model_input, question_input], outputs=[values_output, total_output, text_output])

	# Example PDFs selectable as input; they only supply pdf_input, so
	# ocr_interface runs on them with its default model and question.
	gr.Examples(['Example1(scannedDoc).pdf', 'Example2.pdf', 'Example3Large.pdf'], inputs = pdf_input,
	cache_examples = True, fn = ocr_interface, outputs = [values_output, total_output, text_output])


	# Runs at import time: there is no `if __name__ == "__main__"` guard.
	demo.launch()