import pymupdf
import tiktoken
import textstat
from docx import Document
import io
# from rake_nltk import Rake
# import nltk
# from nltk.corpus import stopwords
from openai import OpenAI

# Download NLTK stopwords
# nltk.download('stopwords')
# nltk.download('punkt')

# Helper that sends a prompt to gpt-4o-mini through the OpenAI chat completions API
# and returns the model's answer as a string.
def extract_relevant_keywords(prompt: str) -> str:
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message.content
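
# Note: OpenAI() reads the API key from the OPENAI_API_KEY environment variable,
# so it must be set before extract_relevant_keywords is called. Illustrative call
# (the prompt text below is a placeholder, not part of this module):
# extract_relevant_keywords("Liste trois mots clés pour ce texte : ...")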

def evaluate_text_quality(text: str) -> dict:
    # Calculate readability metrics
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
    gunning_fog = textstat.gunning_fog(text)
    smog_index = textstat.smog_index(text)
    automated_readability_index = textstat.automated_readability_index(text)

    # Normalize readability scores to a 0-1 scale
    def normalize_score(score, min_score, max_score):
        return (score - min_score) / (max_score - min_score)

    # Normalize each readability score
    n_flesch_reading_ease = normalize_score(flesch_reading_ease, 0, 100)
    n_flesch_kincaid_grade = 1 - normalize_score(flesch_kincaid_grade, 0, 18)  # Higher is more difficult
    n_gunning_fog = 1 - normalize_score(gunning_fog, 0, 18)  # Higher is more difficult
    n_smog_index = 1 - normalize_score(smog_index, 0, 18)  # Higher is more difficult
    n_automated_readability_index = 1 - normalize_score(automated_readability_index, 0, 18)  # Higher is more difficult

    # Weights for each metric (adjust these as needed)
    weights = {
        "flesch_reading_ease": 0.25,
        "flesch_kincaid_grade": 0.25,
        "gunning_fog": 0.2,
        "smog_index": 0.15,
        "automated_readability_index": 0.15
    }

    # Calculate the global readability score
    global_score = (
        n_flesch_reading_ease * weights["flesch_reading_ease"] +
        n_flesch_kincaid_grade * weights["flesch_kincaid_grade"] +
        n_gunning_fog * weights["gunning_fog"] +
        n_smog_index * weights["smog_index"] +
        n_automated_readability_index * weights["automated_readability_index"]
    )

    # Scale the global score to 0-5
    global_score_0_5 = global_score * 5

    # Assumed return payload: each raw metric plus the aggregated 0-5 score,
    # so the function actually matches its declared dict return type.
    return {
        "flesch_reading_ease": flesch_reading_ease,
        "flesch_kincaid_grade": flesch_kincaid_grade,
        "gunning_fog": gunning_fog,
        "smog_index": smog_index,
        "automated_readability_index": automated_readability_index,
        "global_score_0_5": global_score_0_5
    }

# def extract_keywords(text):
#     rake = Rake(stopwords.words('french'))
#     rake.extract_keywords_from_text(text)
#     return rake.get_ranked_phrases()

def count_tokens(input_string: str) -> int:
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(input_string)
    return len(tokens)
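
# Illustrative sanity check of the helpers above (sample_text is a placeholder
# string, not data from this project):
# sample_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
# count_tokens(sample_text)                               # integer token count (cl100k_base)
# evaluate_text_quality(sample_text)["global_score_0_5"]  # aggregated readability score on 0-5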

def audit_descriptif_pdf(file, max_img_width) -> dict:
    # Open the uploaded file from its in-memory bytes as a PDF
    document = pymupdf.open(stream=file.read(), filetype="pdf")

    audit_dict_doc = {
        "number_of_pages": len(document),
        "number_of_images": 0,
        "number_of_links": 0,
        "number_of_tables": 0,
        "number_of_tokens": 0,
        "number_of_words": 0,
        "key_words": []
    }

    doc_content = dict()

    for page in document:
        audit_dict_page = {}
        page_content = {
            "images": [],
            "texte": "",
            "liens": [],
            "tableaux": []
        }

        # Number of images
        images = page.get_images()
        number_images = len(images)
        audit_dict_page["number_of_images"] = number_images
        audit_dict_doc["number_of_images"] += number_images

        # Get images
        for img in images:
            xref = img[0]
            base_image = document.extract_image(xref)
            image_bytes = base_image["image"]
            image_width = base_image["width"]
            image_height = base_image["height"]

            # Adjust image size if it exceeds the maximum width
            if image_width > max_img_width:
                ratio = max_img_width / image_width
                image_width = max_img_width
                image_height = int(image_height * ratio)

            page_content["images"].append((image_bytes, image_width, image_height))

        # Get links with a URI
        links = []
        for link in page.get_links():
            if link['kind'] == pymupdf.LINK_URI and 'uri' in link:
                links.append({"uri": link["uri"], "page": page.number})
        page_content["liens"] = links

        # Number of links
        number_links = len(links)
        audit_dict_page["number_of_links"] = number_links
        audit_dict_doc["number_of_links"] += number_links

        # Number of tables
        tables = page.find_tables().tables
        number_tables = len(tables)
        for tab in tables:
            page_content["tableaux"].append(tab.to_pandas())
        audit_dict_page["number_of_tables"] = number_tables
        audit_dict_doc["number_of_tables"] += number_tables

        # Number of tokens and words
        text = page.get_text("text")
        number_tokens = count_tokens(text)
        number_words = len(text.split())
        audit_dict_page["number_of_tokens"] = number_tokens
        audit_dict_page["number_of_words"] = number_words

        # Get text
        page_content["texte"] = text

        audit_dict_doc["number_of_tokens"] += number_tokens
        audit_dict_doc["number_of_words"] += number_words

        audit_dict_doc[f"page_{page.number}"] = audit_dict_page
        doc_content[f"page_{page.number}"] = page_content

    # Extract key words from the document
    text = " ".join([page["texte"] for page in doc_content.values()])
    # key_words = extract_keywords(text)
    # list_key_words_text = "\n".join(key_words[:10])

    # French prompt asking the model for the five most relevant keywords
    # (two words each at most), returned as a single comma-separated line.
    prompt = f'''Voici le document:
- {text}
Veuillez extraire les cinq mots clés les plus pertinents de cette liste. Chaque mot clé doit contenir au maximum deux mots.
TA REPONSE DOIT RESPECTER LE FORMAT SUIVANT :
key_word1, key_word2, key_word3, key_word4, key_word5
'''
    key_words_extracted = extract_relevant_keywords(prompt)
    audit_dict_doc["key_words"] = "\n" + key_words_extracted

    # Merge the two dicts
    global_audit = {
        "audit": audit_dict_doc,
        "content": doc_content
    }
    return global_audit
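
# Illustrative usage sketch, assuming the file comes from a Streamlit uploader
# (streamlit is not imported in this module; `uploaded_file` and the 500 px limit
# are placeholders):
# uploaded_file = st.file_uploader("Descriptif PDF", type="pdf")
# if uploaded_file is not None:
#     report = audit_descriptif_pdf(uploaded_file, max_img_width=500)
#     st.json(report["audit"])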

def audit_text(text: str) -> dict:
    # Same French keyword-extraction prompt as in audit_descriptif_pdf.
    prompt = f'''Voici le document:
- {text}
Veuillez extraire les cinq mots clés les plus pertinents de cette liste. Chaque mot clé doit contenir au maximum deux mots.
TA REPONSE DOIT RESPECTER LE FORMAT SUIVANT :
key_word1, key_word2, key_word3, key_word4, key_word5
'''
    key_words_extracted = extract_relevant_keywords(prompt)

    audit_dict = {
        "number_of_tokens": count_tokens(text),
        "number_of_words": len(text.split()),
    }
    audit_dict["key_words"] = "\n" + key_words_extracted

    global_audit = {
        "audit": audit_dict,
        "content": text
    }
    return global_audit
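
# Minimal sketch of how audit_text might be exercised directly (assumes
# OPENAI_API_KEY is set; the sample string below is a placeholder):
if __name__ == "__main__":
    sample = "Ceci est un court texte de démonstration pour l'audit."
    print(audit_text(sample)["audit"])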