Spaces:
Build error
Build error
Update similarity_score_refined.py
Browse files- similarity_score_refined.py +114 -138
similarity_score_refined.py
CHANGED
|
@@ -1,146 +1,122 @@
|
|
| 1 |
-
|
| 2 |
-
"""Similarity_score_refined (2).ipynb
|
| 3 |
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
""
|
| 9 |
-
|
| 10 |
-
# !pip install sentence_transformers
|
| 11 |
-
# !pip install openai==0.28
|
| 12 |
-
# !pip install docx2txt PyPDF2 transformers
|
| 13 |
-
|
| 14 |
-
# from google.colab import drive,userdata
|
| 15 |
-
# drive.mount("/content/drive")
|
| 16 |
-
# print("Google Drive mounted.")
|
| 17 |
-
|
| 18 |
-
import re
|
| 19 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 20 |
-
from nltk.corpus import stopwords
|
| 21 |
-
from nltk.stem import WordNetLemmatizer
|
| 22 |
-
import os
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
nltk.download('stopwords')
|
| 27 |
-
nltk.download('wordnet')
|
| 28 |
-
|
| 29 |
-
def extract_text(file_path):
|
| 30 |
-
import docx2txt
|
| 31 |
-
import PyPDF2
|
| 32 |
-
if file_path.endswith(".docx"):
|
| 33 |
-
# Extract text from DOCX file
|
| 34 |
-
return docx2txt.process(file_path)
|
| 35 |
-
|
| 36 |
-
elif file_path.endswith(".pdf"):
|
| 37 |
-
# Extract text from PDF file
|
| 38 |
-
text = ""
|
| 39 |
-
with open(file_path, 'rb') as file:
|
| 40 |
reader = PyPDF2.PdfReader(file)
|
|
|
|
| 41 |
for page_num in range(len(reader.pages)):
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
# Remove stop words
|
| 59 |
-
stop_words = set(stopwords.words('english'))
|
| 60 |
-
words = [word for word in words if word not in stop_words]
|
| 61 |
-
|
| 62 |
-
# Lemmatize the words (to get root form)
|
| 63 |
-
lemmatizer = WordNetLemmatizer()
|
| 64 |
-
words = [lemmatizer.lemmatize(word) for word in words]
|
| 65 |
-
|
| 66 |
-
# Join words back into a single string
|
| 67 |
-
return ' '.join(words)
|
| 68 |
-
|
| 69 |
-
def calculate_tfidf(doc, threshold=0.2):
    """Return the terms of ``doc`` whose TF-IDF weight exceeds ``threshold``.

    The vectorizer is fitted on this single document only, so the weights
    rank terms relative to each other inside ``doc`` rather than against a
    wider corpus.

    Args:
        doc (str): Pre-processed text to analyse.
        threshold (float): Minimum weight (exclusive) a term needs in order
            to be kept. Defaults to 0.2, the value previously hard-coded.

    Returns:
        str: The selected terms joined by single spaces, in the vectorizer's
        feature order. An empty string is returned when ``doc`` is empty or
        contains no usable tokens.
    """
    # Nothing to vectorize: avoid the "empty vocabulary" error from sklearn.
    if not doc or not doc.strip():
        return ''

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([doc])  # Only fit on the individual document
    except ValueError:
        # Raised when every token is filtered out (e.g. only one-character
        # tokens), leaving no vocabulary to score.
        return ''

    feature_names = vectorizer.get_feature_names_out()
    weights = tfidf_matrix.toarray()[0]

    # Extract important terms from the document with a threshold
    important_terms = [
        term for term, weight in zip(feature_names, weights) if weight > threshold
    ]

    return ' '.join(important_terms)
|
| 79 |
-
|
| 80 |
-
def call_chatgpt_api(prompt, api_key, model="gpt-3.5-turbo"):
    """Send ``prompt`` to the OpenAI chat completion endpoint and return the reply text.

    Uses the legacy ``openai.ChatCompletion`` interface (the file's setup
    notes pin ``openai==0.28``).

    Args:
        prompt (str): User message sent to the model.
        api_key (str): OpenAI API key; assigned to ``openai.api_key``.
        model (str): Name of the chat model to query. Defaults to
            "gpt-3.5-turbo".

    Returns:
        str: Content of the first choice, stripped of surrounding whitespace.
    """
    import openai

    openai.api_key = api_key
    response = openai.ChatCompletion.create(
        # Honour the caller's choice; this was previously hard-coded, which
        # silently ignored the ``model`` argument.
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=500,
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response['choices'][0]['message']['content'].strip()
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
model = SentenceTransformer(model_name)
|
| 100 |
-
|
| 101 |
-
# Convert texts to embeddings
|
| 102 |
-
embeddings1 = model.encode(resume, convert_to_tensor=True)
|
| 103 |
-
embeddings2 = model.encode(job_desc, convert_to_tensor=True)
|
| 104 |
-
|
| 105 |
-
# Calculate cosine similarity
|
| 106 |
-
similarity_score = util.pytorch_cos_sim(embeddings1, embeddings2)
|
| 107 |
-
return similarity_score.item() # return as a scalar
|
| 108 |
-
|
| 109 |
-
def similarity_main(resume_path,job_description_path):
|
| 110 |
-
|
| 111 |
-
# Extract text from files (replace with actual file paths)
|
| 112 |
-
Resume_text = extract_text(resume_path)
|
| 113 |
-
job_des = extract_text(job_description_path)
|
| 114 |
-
api_key=os.environ.get('OPENAI_KEY')
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
prompt=f"Extract the skills or competencies section from the resume. Avoid using name of the candidate:\n\n{Resume_text}"
|
| 118 |
-
resume_skills = call_chatgpt_api(prompt,api_key)
|
| 119 |
-
experience_prompt = f"Extract the experience of the candidate from the resume. Avoid using name of the candidate:\n\n{Resume_text}"
|
| 120 |
-
resume_experience = call_chatgpt_api(experience_prompt,api_key)
|
| 121 |
-
|
| 122 |
-
# Extract sections from job description (JD)
|
| 123 |
-
jd_skills_prompt = f"Extract the skills section from the job description:\n\n{job_des}"
|
| 124 |
-
jd_skills = call_chatgpt_api(jd_skills_prompt,api_key)
|
| 125 |
-
|
| 126 |
-
jd_experience_prompt = f"Extract the experience section from the job description:\n\n{job_des}"
|
| 127 |
-
jd_experience = call_chatgpt_api(jd_experience_prompt,api_key)
|
| 128 |
-
|
| 129 |
-
resume_skills_clean = preprocess(resume_skills)
|
| 130 |
-
jd_skills_clean = preprocess(jd_skills)
|
| 131 |
-
|
| 132 |
-
resume_experience_clean = preprocess(resume_experience)
|
| 133 |
-
jd_experience_clean = preprocess(jd_experience)
|
| 134 |
-
|
| 135 |
-
filtered_resume = calculate_tfidf(resume_skills_clean)
|
| 136 |
-
filtered_jd = calculate_tfidf(jd_skills_clean)
|
| 137 |
-
similarity_skills=calculate_similarity(filtered_resume,filtered_jd)
|
| 138 |
-
|
| 139 |
-
filtered_resume_ex = calculate_tfidf(resume_experience_clean)
|
| 140 |
-
filtered_jd_ex = calculate_tfidf(jd_experience_clean)
|
| 141 |
-
similarity_ex=calculate_similarity(filtered_resume_ex,filtered_jd_ex)
|
| 142 |
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from docx.opc.exceptions import PackageNotFoundError
|
|
|
|
| 2 |
|
| 3 |
+
def read_file(file_path):
    """
    Reads the content of a file and returns it as a string.

    If the file is a PDF, it extracts the text using PyPDF2.
    If the file is a docx, it extracts the text using python-docx.
    Otherwise, it reads the file as a text file, trying different encodings if 'utf-8' fails.

    Args:
    - file_path (str): Path of the file to read; the extension check is case-insensitive.

    Returns:
    - str: The extracted text.

    Raises:
    - FileNotFoundError: If file_path does not exist.
    - PackageNotFoundError: If a .docx file cannot be opened as a docx package.
    """
    # Check if the file exists before proceeding
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    lowered_path = file_path.lower()

    if lowered_path.endswith('.pdf'):
        with open(file_path, 'rb') as file:  # Open in binary read mode for PDFs
            reader = PyPDF2.PdfReader(file)
            # "or ''" keeps the join safe if a page yields no text at all.
            return "".join(page.extract_text() or "" for page in reader.pages)

    if lowered_path.endswith('.docx'):
        # Handle docx files using python-docx
        try:
            doc = Document(file_path)
        except PackageNotFoundError as exc:
            # Provide a more informative error message if the file is not a valid docx
            raise PackageNotFoundError(
                f"The file {file_path} is not a valid docx file. It may be corrupted or of a different format."
            ) from exc
        # Add newline for paragraph separation
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    # Plain-text fallback promised by the docstring: try strict encodings first.
    for encoding in ('utf-8', 'cp1252'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue

    # latin-1 maps every byte to a character, so this last attempt cannot fail to decode.
    with open(file_path, 'r', encoding='latin-1') as file:
        return file.read()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
+
import os
|
| 36 |
+
# Point GOOGLE_APPLICATION_CREDENTIALS at a JSON key file stored under /content/drive.
# NOTE(review): the path is hard-coded and presumably only resolves in a Colab
# session with Drive mounted at /content/drive — confirm before running elsewhere.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/drive/MyDrive/Resume/firm-capsule-436804-b5-5f553d9f1043.json"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
+
import os
|
| 39 |
+
# from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 40 |
+
# from langchain_community.vectorstores.faiss import FAISS
|
| 41 |
+
from google.colab import drive
|
| 42 |
+
from docx import Document
|
| 43 |
+
import google.generativeai as genai
|
| 44 |
+
from datetime import datetime
|
| 45 |
+
import PyPDF2
|
| 46 |
+
|
| 47 |
+
# `userdata` is read below but was not imported anywhere in this file, which
# raised NameError at import time; bring it in from the package already used
# for `drive`.
from google.colab import userdata

# Read the Gemini API key from the 'google_cloud' entry and configure the client.
api_key_google = userdata.get('google_cloud')
genai.configure(api_key=api_key_google)

# Mount Google Drive
drive.mount('/content/drive')

# Module-level model handle used by check_relevance_gemini.
model = genai.GenerativeModel('gemini-pro')
|
| 54 |
+
|
| 55 |
+
def _parse_relevance_response(content_text):
    """Parse the 'Score:' and 'Reason:' lines out of the model's reply text."""
    score = None
    reason = None
    for raw_line in content_text.split("\n"):
        # Drop indentation, bullets and markdown emphasis before matching the
        # label, so "  Score: 0.8", "- Score: 0.8" and "**Score:** 0.8" all match.
        line = raw_line.strip().lstrip("*-# ")
        lowered = line.lower()
        if lowered.startswith("score:"):
            # Also drop emphasis markers or echoed brackets around the number.
            value = line.split(":", 1)[1].strip(" *_`[]")
            try:
                score = float(value)
            except ValueError:
                raise ValueError(f"Invalid score format: {raw_line}")
        elif lowered.startswith("reason:"):
            reason = line.split(":", 1)[1].strip(" *")

    # Ensure both score and reason are extracted
    if score is None:
        raise ValueError("Failed to extract score from the response.")
    if not reason:
        reason = "No reason provided."

    return {"score": score, "reason": reason}


def check_relevance_gemini(tailored_resume, job_description):
    """
    Use Gemini Pro to evaluate the relevance score between a tailored resume and job description.

    Args:
    - tailored_resume (str): Tailored resume content.
    - job_description (str): Job description content.

    Returns:
    - dict: A dictionary containing the 'score' and 'reason', or None when the
      request or the parsing of the reply fails (the error is printed).
    """
    # The prompt body is kept flush-left so the text sent to the model carries
    # no extra leading whitespace.
    prompt = f"""
You are a recruitment expert evaluating how well a tailored resume aligns with a job description. Provide a realistic and concise evaluation based on the following criteria:
1. Relevance of skills and experience: Do the candidate’s skills, accomplishments, and experience meet the job's core requirements?
2. Domain Match: Are the candidate's experiences and achievements relevant to the industry or role?
3. Clarity and Conciseness: Is the resume well-structured and focused on the job requirements?
4. Highlight any gaps or mismatched qualifications realistically.

Provide your response in this exact format:
Score: [Score between 0 and 1]
Reason: [One or two sentences explaining the score]

Here is the tailored resume:
[Resume Start]
{tailored_resume}
[Resume End]

And the job description below:
[Job Description Start]
{job_description}
[Job Description End]
"""

    try:
        # Get the response from Gemini Pro
        response = model.generate_content(prompt)
        candidates = response.candidates
        if not candidates:
            raise ValueError("No candidates found in the response.")

        # Extract content text
        content_text = candidates[0].content.parts[0].text
        print(content_text)

        # Extract score and reason with simple parsing
        return _parse_relevance_response(content_text)

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None.
        print(f"Error in relevance checking: {e}")
        return None
|