import pymupdf
import tiktoken
import textstat
from docx import Document
import io
# from rake_nltk import Rake
# import nltk
# from nltk.corpus import stopwords
from openai import OpenAI

# Download NLTK stopwords
# nltk.download('stopwords')
# nltk.download('punkt')

# Helper that sends a prompt to gpt-4o-mini through the OpenAI chat completions API
# and returns the model's answer as a string.
def extract_relevant_keywords(prompt: str) -> str:
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message.content
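
# Note: OpenAI() reads the API key from the OPENAI_API_KEY environment variable,
# so it must be set before extract_relevant_keywords is called. Illustrative call
# (the prompt text below is a placeholder, not part of this module):
# extract_relevant_keywords("Liste trois mots clés pour ce texte : ...")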

def evaluate_text_quality(text: str) -> dict:
    # Calculate readability metrics
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
    gunning_fog = textstat.gunning_fog(text)
    smog_index = textstat.smog_index(text)
    automated_readability_index = textstat.automated_readability_index(text)

    # Normalize readability scores to a 0-1 scale
    def normalize_score(score, min_score, max_score):
        return (score - min_score) / (max_score - min_score)

    # Normalize each readability score
    n_flesch_reading_ease = normalize_score(flesch_reading_ease, 0, 100)
    n_flesch_kincaid_grade = 1 - normalize_score(flesch_kincaid_grade, 0, 18)  # Higher is more difficult
    n_gunning_fog = 1 - normalize_score(gunning_fog, 0, 18)  # Higher is more difficult
    n_smog_index = 1 - normalize_score(smog_index, 0, 18)  # Higher is more difficult
    n_automated_readability_index = 1 - normalize_score(automated_readability_index, 0, 18)  # Higher is more difficult

    # Weights for each metric (adjust these as needed)
    weights = {
        "flesch_reading_ease": 0.25,
        "flesch_kincaid_grade": 0.25,
        "gunning_fog": 0.2,
        "smog_index": 0.15,
        "automated_readability_index": 0.15
    }

    # Calculate the global readability score
    global_score = (
        n_flesch_reading_ease * weights["flesch_reading_ease"] +
        n_flesch_kincaid_grade * weights["flesch_kincaid_grade"] +
        n_gunning_fog * weights["gunning_fog"] +
        n_smog_index * weights["smog_index"] +
        n_automated_readability_index * weights["automated_readability_index"]
    )

    # Scale the global score to 0-5
    global_score_0_5 = global_score * 5

    # Assumed return payload: each raw metric plus the aggregated 0-5 score,
    # so the function actually matches its declared dict return type.
    return {
        "flesch_reading_ease": flesch_reading_ease,
        "flesch_kincaid_grade": flesch_kincaid_grade,
        "gunning_fog": gunning_fog,
        "smog_index": smog_index,
        "automated_readability_index": automated_readability_index,
        "global_score_0_5": global_score_0_5
    }

# def extract_keywords(text):
#     rake = Rake(stopwords.words('french'))
#     rake.extract_keywords_from_text(text)
#     return rake.get_ranked_phrases()

def count_tokens(input_string: str) -> int:
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(input_string)
    return len(tokens)
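
# Illustrative sanity check of the helpers above (sample_text is a placeholder
# string, not data from this project):
# sample_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
# count_tokens(sample_text)                               # integer token count (cl100k_base)
# evaluate_text_quality(sample_text)["global_score_0_5"]  # aggregated readability score on 0-5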

def audit_descriptif_pdf(file, max_img_width) -> dict:
    # Open the uploaded file from its in-memory bytes as a PDF
    document = pymupdf.open(stream=file.read(), filetype="pdf")

    audit_dict_doc = {
        "number_of_pages": len(document),
        "number_of_images": 0,
        "number_of_links": 0,
        "number_of_tables": 0,
        "number_of_tokens": 0,
        "number_of_words": 0,
        "key_words": []
    }

    doc_content = dict()

    for page in document:
        audit_dict_page = {}
        page_content = {
            "images": [],
            "texte": "",
            "liens": [],
            "tableaux": []
        }

        # Number of images
        images = page.get_images()
        number_images = len(images)
        audit_dict_page["number_of_images"] = number_images
        audit_dict_doc["number_of_images"] += number_images

        # Get images
        for img in images:
            xref = img[0]
            base_image = document.extract_image(xref)
            image_bytes = base_image["image"]
            image_width = base_image["width"]
            image_height = base_image["height"]

            # Adjust image size if it exceeds the maximum width
            if image_width > max_img_width:
                ratio = max_img_width / image_width
                image_width = max_img_width
                image_height = int(image_height * ratio)

            page_content["images"].append((image_bytes, image_width, image_height))

        # Get links with a URI
        links = []
        for link in page.get_links():
            if link['kind'] == pymupdf.LINK_URI and 'uri' in link:
                links.append({"uri": link["uri"], "page": page.number})
        page_content["liens"] = links

        # Number of links
        number_links = len(links)
        audit_dict_page["number_of_links"] = number_links
        audit_dict_doc["number_of_links"] += number_links

        # Number of tables
        tables = page.find_tables().tables
        number_tables = len(tables)
        for tab in tables:
            page_content["tableaux"].append(tab.to_pandas())
        audit_dict_page["number_of_tables"] = number_tables
        audit_dict_doc["number_of_tables"] += number_tables

        # Number of tokens and words
        text = page.get_text("text")
        number_tokens = count_tokens(text)
        number_words = len(text.split())
        audit_dict_page["number_of_tokens"] = number_tokens
        audit_dict_page["number_of_words"] = number_words

        # Get text
        page_content["texte"] = text

        audit_dict_doc["number_of_tokens"] += number_tokens
        audit_dict_doc["number_of_words"] += number_words

        audit_dict_doc[f"page_{page.number}"] = audit_dict_page
        doc_content[f"page_{page.number}"] = page_content

    # Extract key words from the document
    text = " ".join([page["texte"] for page in doc_content.values()])
    # key_words = extract_keywords(text)
    # list_key_words_text = "\n".join(key_words[:10])

    # French prompt asking the model for the five most relevant keywords
    # (two words each at most), returned as a single comma-separated line.
    prompt = f'''Voici le document:
- {text}
Veuillez extraire les cinq mots clés les plus pertinents de cette liste. Chaque mot clé doit contenir au maximum deux mots.
TA REPONSE DOIT RESPECTER LE FORMAT SUIVANT :
key_word1, key_word2, key_word3, key_word4, key_word5
'''
    key_words_extracted = extract_relevant_keywords(prompt)
    audit_dict_doc["key_words"] = "\n" + key_words_extracted

    # Merge the two dicts
    global_audit = {
        "audit": audit_dict_doc,
        "content": doc_content
    }
    return global_audit
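
# Illustrative usage sketch, assuming the file comes from a Streamlit uploader
# (streamlit is not imported in this module; `uploaded_file` and the 500 px limit
# are placeholders):
# uploaded_file = st.file_uploader("Descriptif PDF", type="pdf")
# if uploaded_file is not None:
#     report = audit_descriptif_pdf(uploaded_file, max_img_width=500)
#     st.json(report["audit"])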

def audit_text(text: str) -> dict:
    # Same French keyword-extraction prompt as in audit_descriptif_pdf.
    prompt = f'''Voici le document:
- {text}
Veuillez extraire les cinq mots clés les plus pertinents de cette liste. Chaque mot clé doit contenir au maximum deux mots.
TA REPONSE DOIT RESPECTER LE FORMAT SUIVANT :
key_word1, key_word2, key_word3, key_word4, key_word5
'''
    key_words_extracted = extract_relevant_keywords(prompt)

    audit_dict = {
        "number_of_tokens": count_tokens(text),
        "number_of_words": len(text.split()),
    }
    audit_dict["key_words"] = "\n" + key_words_extracted

    global_audit = {
        "audit": audit_dict,
        "content": text
    }
    return global_audit
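
# Minimal sketch of how audit_text might be exercised directly (assumes
# OPENAI_API_KEY is set; the sample string below is a placeholder):
if __name__ == "__main__":
    sample = "Ceci est un court texte de démonstration pour l'audit."
    print(audit_text(sample)["audit"])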