Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.preprocessing import LabelEncoder | |
| from datasets import load_dataset | |
| import joblib | |
| import os | |
| import numpy as np | |
# Locations of the persisted artifacts: the two Random Forest models, the
# TF-IDF vectorizer, and the label encoder.
rf_model_path = 'random_forest_model.pkl'
vectorizer_path = "tfidf_vectorizer.pkl"
label_encoder_path = "label_encoder.pkl"
multi_rf_model_path = "random_forest_multi_model.pkl"

# Reuse the saved artifacts only when all four are present; if any one is
# missing, everything is retrained so the pieces stay consistent with each other.
if all(
    os.path.exists(path)
    for path in (rf_model_path, vectorizer_path, label_encoder_path, multi_rf_model_path)
):
    # NOTE: joblib.load unpickles its input; only point these paths at files
    # this script produced itself.
    rf_single = joblib.load(rf_model_path)
    vectorizer = joblib.load(vectorizer_path)
    le = joblib.load(label_encoder_path)
    rf_multi = joblib.load(multi_rf_model_path)
    print("Random Forest model, vectorizer, and label encoder loaded from disk.")
else:
    # Load the dataset. No explicit cache_dir is passed: a hard-coded,
    # user-specific Windows path is not portable, whereas the library's default
    # (or environment-configured) cache location works on every host.
    ds = load_dataset('ahmedheakl/resume-atlas')

    # Create a DataFrame from the 'train' split
    df_train = pd.DataFrame(ds['train'])

    # Encode the string 'Category' labels as integers
    le = LabelEncoder()
    df_train['Category_encoded'] = le.fit_transform(df_train['Category'])

    # Hold out 20% of the rows; the fixed seed keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        df_train['Text'], df_train['Category_encoded'], test_size=0.2, random_state=42)

    # Fit TF-IDF on the training text only, then apply the same vocabulary
    # to the held-out text
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Train the Random Forest models. Both are configured identically and fit
    # on the same data; rf_single backs single-label prediction and rf_multi
    # backs the top-N prediction function.
    rf_single = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_single.fit(X_train_tfidf, y_train)
    rf_multi = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_multi.fit(X_train_tfidf, y_train)

    # Persist everything so later runs take the fast loading path above
    joblib.dump(rf_single, rf_model_path)
    joblib.dump(rf_multi, multi_rf_model_path)
    joblib.dump(vectorizer, vectorizer_path)
    joblib.dump(le, label_encoder_path)
    print("Random Forest model, vectorizer, and label encoder trained and saved to disk.")
# Single-label classification with the Random Forest model
def classify_text_rf(text):
    """Predict the single most likely category name for ``text``.

    The text is vectorised with the module-level ``vectorizer``, classified by
    ``rf_single``, and the encoded prediction is decoded with ``le``.

    Args:
        text: The raw text to classify.

    Returns:
        The predicted category name, or ``None`` if any step raises (the
        error is printed rather than propagated).
    """
    try:
        features = vectorizer.transform([text])
        encoded_prediction = rf_single.predict(features)[0]
        # inverse_transform works on sequences, so wrap the single label and
        # unwrap the single result.
        decoded = le.inverse_transform([encoded_prediction])
        return decoded[0]
    except Exception as e:
        # Best-effort contract: report the failure and signal it with None.
        print(f"Error in classify_text_rf: {e}")
        return None
# Multi-label classification function with top N predictions
def classify_text_rf_multi(text, top_n=3):
    """Return the ``top_n`` most probable category names for ``text``.

    The text is vectorised with the module-level ``vectorizer`` and scored
    with ``rf_multi.predict_proba``; the highest-scoring classes are decoded
    back to category names with ``le``.

    Args:
        text: The raw text to classify.
        top_n: Maximum number of categories to return. Values larger than the
            number of classes are capped; zero or negative values yield an
            empty result.

    Returns:
        An array of category names ordered from most to least probable, or
        ``None`` if any step raises (the error is printed rather than
        propagated).
    """
    try:
        text_tfidf = vectorizer.transform([text])
        probabilities = rf_multi.predict_proba(text_tfidf)[0]
        # Clamp to [0, n_classes]: a negative slice bound would silently drop
        # entries from the end instead of limiting how many are returned.
        count = max(0, min(top_n, len(probabilities)))
        top_n_positions = np.argsort(probabilities)[::-1][:count]
        # predict_proba columns are ordered like rf_multi.classes_, so convert
        # column positions to encoded labels before decoding. The two only
        # coincide when every encoded class was present in the training data.
        top_n_labels = rf_multi.classes_[top_n_positions]
        top_n_categories = le.inverse_transform(top_n_labels)
        return top_n_categories
    except Exception as e:
        # Best-effort contract: report the failure and signal it with None.
        print(f"Error in classify_text_rf_multi: {e}")
        return None