Spaces:

manasagangotri
/

classify

Sleeping

App Files Files Community

classify / modules /RandomForest.py

manasagangotri

Upload folder using huggingface_hub

e062e72 verified about 1 year ago

raw

history blame

3.51 kB

	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.model_selection import train_test_split
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.preprocessing import LabelEncoder
	from datasets import load_dataset
	import joblib
	import os
	import numpy as np

	# Locations of the persisted Random Forest models, TF-IDF vectorizer and label
	# encoder. These are relative paths, so they resolve against the process's
	# current working directory.
	rf_model_path = 'random_forest_model.pkl'
	vectorizer_path = "tfidf_vectorizer.pkl"
	label_encoder_path = "label_encoder.pkl"
	multi_rf_model_path = "random_forest_multi_model.pkl"

	# Reuse the saved artifacts only when all four files are present; if any one is
	# missing, everything is retrained and re-saved so the models, vectorizer and
	# encoder always come from the same training run.
	if all(
	    os.path.exists(path)
	    for path in (rf_model_path, vectorizer_path, label_encoder_path, multi_rf_model_path)
	):
	    # NOTE: joblib.load unpickles the file contents, so these files must come
	    # from a trusted source.
	    rf_single = joblib.load(rf_model_path)
	    vectorizer = joblib.load(vectorizer_path)
	    le = joblib.load(label_encoder_path)
	    rf_multi = joblib.load(multi_rf_model_path)
	    print("Random Forest model, vectorizer, and label encoder loaded from disk.")
	else:
	    # Load the dataset. No cache_dir is passed on purpose: the previous value
	    # was an absolute path inside one user's Windows profile, which does not
	    # exist on other machines. Without it the datasets library uses its
	    # default cache location (configurable through its environment settings).
	    ds = load_dataset('ahmedheakl/resume-atlas')

	    # Create a DataFrame from the 'train' split
	    df_train = pd.DataFrame(ds['train'])

	    # Encode the string 'Category' labels as integers
	    le = LabelEncoder()
	    df_train['Category_encoded'] = le.fit_transform(df_train['Category'])

	    # Hold out 20% of the rows; the fixed seed keeps the split reproducible
	    X_train, X_test, y_train, y_test = train_test_split(
	        df_train['Text'],
	        df_train['Category_encoded'],
	        test_size=0.2,
	        random_state=42,
	    )

	    # Fit the TF-IDF vocabulary on the training rows only, then reuse it for
	    # the held-out rows
	    vectorizer = TfidfVectorizer(max_features=1000)
	    X_train_tfidf = vectorizer.fit_transform(X_train)
	    # NOTE(review): the held-out features are computed but not evaluated
	    # anywhere in the visible part of this file.
	    X_test_tfidf = vectorizer.transform(X_test)

	    # Train the model used for single-category prediction
	    rf_single = RandomForestClassifier(n_estimators=100, random_state=42)
	    rf_single.fit(X_train_tfidf, y_train)

	    # Train the model used for top-N prediction. It has the same
	    # hyperparameters, seed and training data as rf_single; it is kept as a
	    # separate estimator because it is persisted under its own path.
	    rf_multi = RandomForestClassifier(n_estimators=100, random_state=42)
	    rf_multi.fit(X_train_tfidf, y_train)

	    # Save the Random Forest models, TF-IDF vectorizer, and label encoder
	    joblib.dump(rf_single, rf_model_path)
	    joblib.dump(rf_multi, multi_rf_model_path)
	    joblib.dump(vectorizer, vectorizer_path)
	    joblib.dump(le, label_encoder_path)
	    print("Random Forest model, vectorizer, and label encoder trained and saved to disk.")

	# Single-category prediction with the Random Forest model
	def classify_text_rf(text):
	    """Return the predicted category label for a single text.

	    The text is vectorised with the module-level TF-IDF ``vectorizer``,
	    classified with ``rf_single``, and the encoded prediction is turned back
	    into its original label with ``le``.

	    Args:
	        text: The raw text to classify.

	    Returns:
	        The decoded category label, or ``None`` if any step raised; in that
	        case the error is printed rather than propagated.
	    """
	    category = None
	    try:
	        # The vectorizer expects a sequence of documents, hence the 1-item list
	        features = vectorizer.transform([text])
	        encoded_label = rf_single.predict(features)[0]
	        category = le.inverse_transform([encoded_label])[0]
	    except Exception as e:
	        print(f"Error in classify_text_rf: {e}")
	    return category

	# Top-N category prediction: ranks the categories of a single-label model by
	# predicted probability (this is not true multi-label classification)
	def classify_text_rf_multi(text, top_n=3):
	    """Return the ``top_n`` most probable category labels for a text.

	    The text is vectorised with the module-level TF-IDF ``vectorizer`` and
	    scored with ``rf_multi.predict_proba``; the highest-probability classes
	    are decoded back to their original labels with ``le``.

	    Args:
	        text: The raw text to classify.
	        top_n: Maximum number of categories to return. Values larger than the
	            number of classes are capped; zero or negative values yield an
	            empty result.

	    Returns:
	        The decoded labels as returned by ``le.inverse_transform``, ordered
	        from most to least probable, or ``None`` if any step raised; in that
	        case the error is printed rather than propagated.
	    """
	    try:
	        # The vectorizer expects a sequence of documents, hence the 1-item list
	        text_tfidf = vectorizer.transform([text])
	        probabilities = rf_multi.predict_proba(text_tfidf)[0]

	        # Clamp at zero so a negative top_n cannot slice from the end and
	        # silently return "all but the last k" categories.
	        count = max(0, min(top_n, len(probabilities)))
	        ranked_columns = np.argsort(probabilities)[::-1][:count]

	        # predict_proba columns are ordered like rf_multi.classes_, not like
	        # the label encoder's ids. The two differ whenever the training split
	        # lacked a category, so map column positions to encoded labels first.
	        encoded_labels = rf_multi.classes_[ranked_columns]
	        return le.inverse_transform(encoded_labels)
	    except Exception as e:
	        print(f"Error in classify_text_rf_multi: {e}")
	        return None