Spaces:
Sleeping
Sleeping
| """ | |
| Agent for extracting profile information from resumes | |
| """ | |
| import groq | |
| from models import Profile, SocialMedia, Project, Skill, Education, Experience, Category | |
| from typing import List, Dict, Any, Optional | |
| from langchain.output_parsers import PydanticOutputParser | |
| from langchain.prompts import PromptTemplate | |
| from langchain_groq import ChatGroq | |
| import json | |
| from config import get_settings | |
| import logging | |
# Application settings are loaded once, at import time, and shared by the module.
settings = get_settings()
logger = logging.getLogger(__name__)

# Root logging configuration: DEBUG verbosity when settings.DEBUG is truthy,
# INFO otherwise.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.DEBUG if settings.DEBUG else logging.INFO,
)
class ProfileExtractor:
    """Extract a structured ``Profile`` from resume text with a ChatGroq LLM.

    The main pass asks the model for one JSON document describing the whole
    profile.  Any field that is still empty afterwards is requested again with
    a narrower, single-purpose prompt.  Every follow-up request is best-effort:
    a failure is logged and the field is left as it was.
    """

    # One row per scalar field that may be re-requested from the model:
    # (profile attribute, instruction prefix, send only the first
    #  settings.CHUNK_SIZE characters?, substring the answer must contain,
    #  success log label, failure log label)
    _TEXT_FIELD_PROMPTS = (
        (
            "name",
            "Extract only the full name from this resume text. Respond with just the name: ",
            True,
            None,
            "Extracted name",
            "Error extracting name",
        ),
        (
            "title",
            "Extract only the professional title from this resume text. Respond with just the title: ",
            True,
            None,
            "Extracted title",
            "Error extracting title",
        ),
        (
            "email",
            "Extract only the email address from this resume text. Respond with just the email: ",
            False,
            "@",
            "Extracted email",
            "Error extracting email",
        ),
        (
            "bio",
            "Create a short professional biography (around 50-100 words) based on this resume. Focus on skills and experience: ",
            False,
            None,
            "Created bio",
            "Error creating bio",
        ),
    )

    def __init__(self):
        """Read the LLM configuration from ``settings`` and build the client."""
        logger.debug("Initializing ProfileExtractor")
        self.groq_api_key = settings.GROQ_API_KEY
        self.model_name = settings.MODEL_NAME
        self.temperature = settings.TEMPERATURE
        self.max_tokens = settings.MAX_TOKENS
        self.llm = self._initialize_llm()

    def _initialize_llm(self) -> "ChatGroq":
        """Build the ``ChatGroq`` client from the values stored on the instance."""
        logger.debug("Initializing language model client")
        return ChatGroq(
            groq_api_key=self.groq_api_key,
            model_name=self.model_name,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
        )

    def extract_profile(self, pdf_text: str) -> "Profile":
        """Extract profile information from resume text.

        Args:
            pdf_text: Text extracted from a resume PDF.

        Returns:
            The extracted ``Profile``.  If extraction fails for any reason, a
            placeholder profile whose name, title, email and bio are all
            ``"N/A"`` is returned instead of raising.
        """
        logger.info("Extracting profile information")
        try:
            profile = self._extract_with_langchain(pdf_text)
        except Exception as e:
            # Deliberate catch-all: callers always get a Profile back.
            # logger.exception keeps the traceback that logger.error dropped.
            logger.exception("LangChain extraction failed: %s", e)
            if settings.DEBUG:
                print(f"LangChain extraction failed: {e}")
            return Profile(name="N/A", title="N/A", email="N/A", bio="N/A")
        logger.info("Profile extracted successfully with LangChain")
        return profile

    def _extract_with_langchain(self, pdf_text: str) -> "Profile":
        """Run the main structured-extraction prompt and validate its reply.

        Args:
            pdf_text: Text extracted from a resume PDF.

        Returns:
            The validated profile, after ``_fill_missing_information`` has
            tried to complete any empty fields.

        Raises:
            ValueError: If the reply contains no ``{...}`` span, or the span is
                not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        """
        logger.debug("Extracting profile with LangChain")
        format_instructions = """
        Extract the following information from the resume:
        1. Full name
        2. Professional title
        3. Email address
        4. Bio (a 50-100 word professional summary)
        5. Tagline (a short 5-10 word catchy phrase summarizing professional identity)
        6. Social media links (LinkedIn, GitHub, Instagram)
        7. Projects (with title, description, and tech stack)
        8. Skills (with category only one of : Technical, Soft Skills, or Domain Knowledge)
        9. Education history (with school, degree, field of study, start date and end date)
        10. Work experience (with company, position, start date, end date, and description)
        Return the information in the following JSON format:
        {
            "name": "Full Name",
            "title": "Professional Title",
            "email": "email@example.com",
            "bio": "Professional biography...",
            "tagline": "Catchy professional tagline",
            "social": {
                "linkedin": "LinkedIn URL or null",
                "github": "GitHub URL or null",
                "instagram": "Instagram URL or null"
            },
            "projects": [
                {
                    "title": "Project Title",
                    "description": "Project Description",
                    "techStack": "Technologies used"
                }
            ],
            "skills": [
                {"name": "Skill 1", "category": "Technical"},
                {"name": "Skill 2", "category": "Soft Skills"},
                {"name": "Skill 3", "category": "Domain Knowledge"}
            ],
            "educations": [
                {
                    "school": "University Name",
                    "degree": "Degree Type (e.g., Bachelor's, Master's)",
                    "fieldOfStudy": "Major or Field",
                    "startDate": "Start Year",
                    "endDate": "End Year or Present"
                }
            ],
            "experiences": [
                {
                    "company": "Company Name",
                    "position": "Job Title",
                    "startDate": "Start Date",
                    "endDate": "End Date or Present",
                    "description": "Job Description"
                }
            ]
        }
        If any information is not available, use null for that field.
        """
        template = """
        You are a professional resume parser. Extract structured information from the following resume:
        {pdf_text}
        {format_instructions}
        """
        # The JSON example is supplied as a partial variable, so its literal
        # braces are a substituted value rather than part of the template.
        prompt = PromptTemplate(
            template=template,
            input_variables=["pdf_text"],
            partial_variables={"format_instructions": format_instructions},
        )
        chain = prompt | self.llm
        result = chain.invoke({"pdf_text": pdf_text})

        json_str = self._extract_json_span(result.content, "{", "}")
        if json_str is None:
            logger.error("No JSON found in the response")
            raise ValueError("No JSON found in the response")

        profile = Profile.model_validate(json.loads(json_str))
        profile = self._fill_missing_information(profile, pdf_text)
        logger.debug("Profile extracted and validated")
        return profile

    def _fill_missing_information(self, profile: "Profile", pdf_text: str) -> "Profile":
        """Try to fill every empty field of ``profile`` with follow-up prompts.

        Fields are handled in a fixed order: name, title, email, bio,
        educations, skills, experiences.  Each request is independent and
        best-effort; errors are logged and never propagated.

        Args:
            profile: Profile produced by the main extraction pass; mutated in place.
            pdf_text: Text extracted from a resume PDF.

        Returns:
            The same ``profile`` object.
        """
        logger.debug("Filling missing information in the profile")
        self._fill_text_fields(profile, pdf_text)
        self._fill_educations(profile, pdf_text)
        self._fill_skills(profile, pdf_text)
        self._fill_experiences(profile, pdf_text)
        return profile

    @staticmethod
    def _extract_json_span(text: str, opener: str, closer: str) -> Optional[str]:
        """Return the text from the first ``opener`` to the last ``closer``.

        This trims chatter or code fences surrounding a JSON payload.  Returns
        ``None`` when no such span exists.
        """
        start = text.find(opener)
        end = text.rfind(closer) + 1
        if start < 0 or end <= start:
            return None
        return text[start:end]

    @staticmethod
    def _is_missing(value: Any) -> bool:
        """Tell whether a scalar value is empty or the ``"N/A"`` placeholder."""
        return not value or value == "N/A"

    @staticmethod
    def _text_or(entry: dict, key: str, default: str = "") -> str:
        """Read ``entry[key]`` as stripped text, falling back to ``default``.

        A missing key, an explicit JSON ``null`` and a blank string are all
        treated as absent.  Other values (e.g. a numeric year) become text.
        """
        value = entry.get(key)
        if value is None:
            return default
        return str(value).strip() or default

    @staticmethod
    def _append_to(profile: Any, attr: str, item: Any) -> None:
        """Append ``item`` to the list at ``profile.<attr>``, creating it if ``None``."""
        if getattr(profile, attr) is None:
            setattr(profile, attr, [])
        # Re-read the attribute rather than keeping a reference to the new list,
        # in case assignment on the model stores a copy.
        getattr(profile, attr).append(item)

    def _request_json_list(self, prompt: str, pdf_text: str) -> List[Any]:
        """Send ``prompt`` plus the resume and return the JSON objects in the reply.

        Returns an empty list when the reply has no ``[...]`` span.  Entries
        that are not JSON objects are dropped, because the callers read them
        with ``dict.get``.  Invalid JSON raises ``json.JSONDecodeError``.
        """
        reply = self.llm.invoke(prompt + "\n\n" + pdf_text).content.strip()
        span = self._extract_json_span(reply, "[", "]")
        if span is None:
            return []
        return [entry for entry in json.loads(span) if isinstance(entry, dict)]

    def _fill_text_fields(self, profile: "Profile", pdf_text: str) -> None:
        """Re-request name, title, email and bio when empty or ``"N/A"``."""
        for attr, instruction, head_only, must_contain, ok_label, err_label in self._TEXT_FIELD_PROMPTS:
            if not self._is_missing(getattr(profile, attr)):
                continue
            try:
                # Name and title are sent with only the first CHUNK_SIZE
                # characters of the resume; email and bio get the full text.
                context = pdf_text[:settings.CHUNK_SIZE] if head_only else pdf_text
                answer = self.llm.invoke(instruction + context).content.strip()
                if self._is_missing(answer):
                    continue
                if must_contain is not None and must_contain not in answer:
                    continue
                setattr(profile, attr, answer)
                logger.debug("%s: %s", ok_label, answer)
            except Exception as e:
                logger.error("%s: %s", err_label, e)

    def _fill_educations(self, profile: "Profile", pdf_text: str) -> None:
        """Request the education history when the profile has none."""
        if profile.educations:
            return
        try:
            # The key names are spelled out because the parsing below reads
            # exactly these keys from each returned object.
            education_prompt = (
                "Extract education history from this resume. For each education entry, provide the school name, degree type, field of study, start date, and end date. Format the response as a list of JSON objects."
                " Use exactly these keys: school, degree, fieldOfStudy, startDate, endDate."
            )
            for edu in self._request_json_list(education_prompt, pdf_text):
                education = Education(
                    school=self._text_or(edu, "school", "Unknown"),
                    degree=self._text_or(edu, "degree"),
                    fieldOfStudy=self._text_or(edu, "fieldOfStudy"),
                    startDate=self._text_or(edu, "startDate"),
                    endDate=self._text_or(edu, "endDate"),
                )
                self._append_to(profile, "educations", education)
                logger.debug("Added education: %s", education)
        except Exception as e:
            logger.error("Error extracting education: %s", e)

    def _fill_skills(self, profile: "Profile", pdf_text: str) -> None:
        """Request categorized skills when the profile has none."""
        if profile.skills:
            return
        try:
            skills_prompt = """
            Extract skills from this resume text and categorize them.
            For each skill, determine if it's a Technical skill, Soft Skill, or Domain Knowledge.
            Format the response as a JSON array of objects with 'name' and 'category' fields.
            Example: [{"name": "Python", "category": "Technical"}, {"name": "Communication", "category": "Soft Skills"}]
            """
            # Accepted spellings of each category, matched case-insensitively.
            categories = {
                "technical": Category.TECHNICAL,
                "soft skills": Category.SOFT_SKILLS,
                "soft skill": Category.SOFT_SKILLS,
                "domain knowledge": Category.DOMAIN_KNOWLEDGE,
                "domain": Category.DOMAIN_KNOWLEDGE,
            }
            for skill_data in self._request_json_list(skills_prompt, pdf_text):
                skill_name = self._text_or(skill_data, "name")
                if not skill_name:
                    continue
                # An unrecognized category is passed on as None.
                category = categories.get(self._text_or(skill_data, "category").lower())
                self._append_to(profile, "skills", Skill(name=skill_name, category=category))
                logger.debug("Added categorized skill: %s (%s)", skill_name, category)
        except Exception as e:
            logger.error("Error extracting categorized skills: %s", e)

    def _fill_experiences(self, profile: "Profile", pdf_text: str) -> None:
        """Request the work experience when the profile has none."""
        if profile.experiences:
            return
        try:
            # The key names are spelled out because the parsing below reads
            # exactly these keys from each returned object.
            experience_prompt = """
            Extract work experience from this resume. For each position, provide:
            - Company name
            - Position/job title
            - Start date
            - End date (or "Present" if current)
            - Job description (summarize responsibilities and achievements)
            Format the response as a list of JSON objects.
            Use exactly these keys: company, position, startDate, endDate, description.
            """
            for exp in self._request_json_list(experience_prompt, pdf_text):
                experience = Experience(
                    company=self._text_or(exp, "company", "Unknown"),
                    position=self._text_or(exp, "position"),
                    startDate=self._text_or(exp, "startDate"),
                    endDate=self._text_or(exp, "endDate"),
                    description=self._text_or(exp, "description"),
                )
                self._append_to(profile, "experiences", experience)
                logger.debug(
                    "Added experience: %s - %s", experience.company, experience.position
                )
        except Exception as e:
            logger.error("Error extracting work experience: %s", e)
# Module-level extractor, created once at import time for easier imports.
profile_extractor = ProfileExtractor()


def extract_profile_information(pdf_text: str) -> Profile:
    """Function-style entry point kept for backward compatibility.

    Delegates to ``extract_profile`` on the module-level ``profile_extractor``.
    """
    extracted = profile_extractor.extract_profile(pdf_text)
    return extracted


# Names exported by a star-import of this module.
__all__ = ["ProfileExtractor", "extract_profile_information"]