import re

import numpy as np
import pandas as pd
import textstat
from detoxify import Detoxify
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Clean text: lowercase, then strip URLs, @mentions, hashtag markers, and punctuation
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text


# Load and preprocess: drop rows missing the text or the target column
def load_data(filepath):
    df = pd.read_csv(filepath)
    df.dropna(subset=["text", "helpfulness_score"], inplace=True)
    return df


def preprocess_text(df):
    df['text'] = df['text'].apply(clean_text)

    # Toxicity score (per-row probability from the pretrained Detoxify model)
    print("Computing toxicity scores...")
    toxicity_results = Detoxify('original').predict(df['text'].tolist())
    df['toxicity_score'] = toxicity_results['toxicity']

    # Readability score (Flesch Reading Ease; lower is harder to read)
    print("Computing readability scores...")
    df['readability_score'] = df['text'].apply(textstat.flesch_reading_ease)

    # Anomaly detection on the two metadata features; IsolationForest labels
    # outliers as -1, which we map to a 1/0 flag
    print("Running anomaly detection...")
    meta_features = df[["toxicity_score", "readability_score"]].fillna(0)
    clf = IsolationForest(contamination=0.05, random_state=42)
    df['is_anomalous'] = np.where(clf.fit_predict(meta_features) == -1, 1, 0)

    return df


# For custom evaluation later: standard regression metrics on helpfulness scores
def compute_custom_metrics(y_true, y_pred):
    return {
        "MAE": mean_absolute_error(y_true, y_pred),
        "MSE": mean_squared_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred),
    }
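

# A minimal driver sketch showing how the pieces above fit together. The input
# file "reviews.csv" is a hypothetical example (any CSV with "text" and
# "helpfulness_score" columns works), and the mean-prediction baseline is only
# an illustrative sanity check for compute_custom_metrics, not part of the
# pipeline itself.
if __name__ == "__main__":
    df = load_data("reviews.csv")  # hypothetical input path
    df = preprocess_text(df)
    print(df[["toxicity_score", "readability_score", "is_anomalous"]].describe())

    # Evaluate a trivial baseline that always predicts the mean helpfulness
    # score; a real model should beat these numbers.
    y_true = df["helpfulness_score"].to_numpy(dtype=float)
    y_pred = np.full_like(y_true, y_true.mean())
    print(compute_custom_metrics(y_true, y_pred))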