import gradio as gr
import pandas as pd
import spacy
from spacy import displacy
import plotly.express as px
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources needed for stopword removal and lemmatization
nltk.download(["stopwords", "wordnet", "omw-1.4"])
# Load the resume dataset into a DataFrame
# (the code below expects at least the columns "Resume_str" and "Category")
dataset_path = "Resume.csv"
df = pd.read_csv(dataset_path)

# Shuffle the rows and keep the first 500 resumes to keep processing time manageable
df = df.reindex(np.random.permutation(df.index))
data = df.copy().iloc[0:500, :]
# Load the spaCy English language model with a large vocabulary and pre-trained word vectors
spacy_model = spacy.load("en_core_web_lg")

# Path to the file containing skill patterns in JSONL format (2,129 skills)
skill_pattern_path = "jz_skill_patterns.jsonl"

# Add an entity ruler to the spaCy pipeline and load the skill patterns from disk
ruler = spacy_model.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
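# For reference, each line of the JSONL file is one EntityRuler pattern. The actual
# entries of jz_skill_patterns.jsonl are not shown here, but a typical line would look
# something like this (illustrative example only, not copied from the file):
#   {"label": "SKILL", "pattern": [{"LOWER": "machine"}, {"LOWER": "learning"}]}
#   {"label": "SKILL", "pattern": [{"LOWER": "python"}]}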
def get_unique_skills(text):
    """Run the text through the spaCy pipeline and return the unique SKILL entities."""
    doc = spacy_model(text)
    skills = set()
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            skills.add(ent.text)
    return list(skills)
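# Illustrative usage (the exact output depends on the patterns loaded above):
#   get_unique_skills("built dashboards in python and sql")
#   might return something like ["python", "sql"]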
def preprocess_resume(resume_str):
    # Remove Twitter-style mentions, special characters, and URLs
    review = re.sub(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?",
        " ",
        resume_str,
    )
    # Convert to lowercase and tokenize on whitespace
    review = review.lower().split()
    # Lemmatize and remove stopwords
    lm = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    review = [lm.lemmatize(word) for word in review if word not in stop_words]
    # Join the words back into a single string
    return " ".join(review)
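# Illustrative example of the cleaning step:
#   preprocess_resume("Senior Developer at https://example.com, 10+ years!")
#   would yield something like "senior developer 10 year"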
# Apply the preprocess_resume function to each resume string and store the result in a new column
data["Clean_Resume"] = data["Resume_str"].apply(preprocess_resume)

# Extract skills from each preprocessed resume and store them in a new column
data["skills"] = data["Clean_Resume"].str.lower().apply(get_unique_skills)
def get_skills_distribution(Job_Category):
    """Build a histogram of skill frequencies for one job category (or all of them)."""
    if Job_Category != "ALL":
        filtered_data = data[data["Category"] == Job_Category]["skills"]
    else:
        filtered_data = data["skills"]

    total_skills = [skill for sublist in filtered_data for skill in sublist]

    fig = px.histogram(
        x=total_skills,
        labels={"x": "Skills"},
        title=f"{Job_Category} Distribution of Skills",
    ).update_xaxes(categoryorder="total descending")
    return fig
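# Note: this helper is not wired into the Gradio interface below. Returning the Plotly
# figure itself (rather than calling fig.show(), which returns None) means it could be
# connected to a gr.Plot output later if desired.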
# Register each job category found in the dataset as a "Job-Category" entity pattern
patterns = data.Category.unique()
for category in patterns:
    ruler.add_patterns([{"label": "Job-Category", "pattern": category}])
# Entity types to highlight: "Job-Category" and "SKILL" come from the entity ruler,
# the rest are built-in labels from the pre-trained spaCy model
options = {
    "ents": [
        "Job-Category",
        "SKILL",
        "ORG",
        "PERSON",
        "GPE",
        "DATE",
        "ORDINAL",
        "PRODUCT",
    ],
}
# Define a function to process the resume text and highlight entities
def highlight_entities(resume_text):
    # Process the resume text with spaCy
    doc = spacy_model(resume_text)
    # Render the entities with displaCy and return the HTML
    html = displacy.render(doc, style="ent", options=options, jupyter=False)
    return html
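# displacy.render with jupyter=False returns a plain HTML string, which the gr.HTML
# component in the interface below can display directly.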
def calculate_semantic_similarity(required_skills, resume_skills):
    """
    Calculate the semantic similarity between required skills and resume skills.
    """
    required_skills_str = " ".join(required_skills)
    resume_skills_str = " ".join(resume_skills)
    required_skills_doc = spacy_model(required_skills_str)
    resume_skills_doc = spacy_model(resume_skills_str)
    similarity_score = required_skills_doc.similarity(resume_skills_doc)
    return similarity_score
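# Doc.similarity here is the cosine similarity between the averaged word vectors of the
# two skill strings, which is why the en_core_web_lg model (with vectors) is loaded above.
# If either skill list is empty, spaCy warns about empty vectors and the score is 0.0.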
def find_matching_resumes(input_skills, n=5):
    """
    Find and rank the top matching resumes based on input skills.
    """
    # Split the comma-separated input into individual skills
    req_skills = [skill.strip() for skill in input_skills.lower().split(",")]

    # Score every resume in the dataset against the required skills
    ranked_resumes = []
    for idx, row in data.iterrows():
        resume_skills = row["skills"]
        similarity_score = calculate_semantic_similarity(req_skills, resume_skills)
        ranked_resumes.append((row["Resume_str"], similarity_score))

    # Sort resumes by similarity score in descending order and keep the top N
    ranked_resumes.sort(key=lambda x: x[1], reverse=True)
    top_matching_resumes = ranked_resumes[:n]

    # Build a single text block for display in the Textbox output
    output = []
    for resume_str, score in top_matching_resumes:
        output.append(f"Similarity Score: {score}\nResume: {resume_str}")
    return "\n\n".join(output)
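# Illustrative call (assuming the dataset has been processed as above):
#   find_matching_resumes("python, sql, project management", n=3)
#   returns a text block with the three highest-scoring resumes and their scores.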
with gr.Blocks() as demo:
    gr.Markdown(
        "Enter your resume text and perform NER, or enter the required skills and find the top matching resumes."
    )
    with gr.Tab("Enter your resume text and perform NER"):
        text_input = gr.Textbox(lines=10, label="Input Resume Text")
        text_output = gr.HTML(label="Highlighted Entities")
        text_button = gr.Button("Submit")
    with gr.Tab("Enter the required skills (comma-separated) and find the top matching resumes"):
        text_input2 = gr.Textbox(lines=5, label="Input Required Skills (comma-separated)")
        text_output2 = gr.Textbox(label="Top Matching Resumes")
        text_button2 = gr.Button("Submit")

    # Wire each tab's button to its handler
    text_button.click(highlight_entities, inputs=text_input, outputs=text_output)
    text_button2.click(find_matching_resumes, inputs=text_input2, outputs=text_output2)

demo.launch()