# community-rating-system/src/data_preprocessing.py
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import IsolationForest
from detoxify import Detoxify
import textstat
# Basic text cleaning: lowercase, then strip URLs, mentions, hashtags, and punctuation
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text
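# Illustrative example (note: leftover whitespace is not collapsed):
#   clean_text("Great buy!! See https://t.co/x #deal @user")  ->  "great buy see  deal "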
# Load the raw CSV and drop rows missing the review text or the target score
def load_data(filepath):
    df = pd.read_csv(filepath)
    df.dropna(subset=["text", "helpfulness_score"], inplace=True)
    return df
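# Assumption: the input CSV has at least a "text" column (review body) and a
# numeric "helpfulness_score" target; any other columns are passed through untouched.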
def preprocess_text(df):
    df['text'] = df['text'].apply(clean_text)

    # Toxicity score from the pretrained Detoxify 'original' model
    print("Computing toxicity scores...")
    toxicity_results = Detoxify('original').predict(df['text'].tolist())
    df['toxicity_score'] = toxicity_results['toxicity']

    # Flesch Reading Ease readability score (lower means harder to read)
    print("Computing readability scores...")
    df['readability_score'] = df['text'].apply(lambda x: textstat.flesch_reading_ease(x))

    # Flag anomalous reviews with an Isolation Forest on the two meta-features
    print("Running anomaly detection...")
    meta_features = df[["toxicity_score", "readability_score"]].fillna(0)
    clf = IsolationForest(contamination=0.05, random_state=42)
    df['is_anomalous'] = clf.fit_predict(meta_features)
    df['is_anomalous'] = df['is_anomalous'].apply(lambda x: 1 if x == -1 else 0)

    return df
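# preprocess_text returns the frame with three added columns: toxicity_score,
# readability_score, and a binary is_anomalous flag (1 = flagged by the Isolation Forest).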
# Regression metrics for custom evaluation later
def compute_custom_metrics(y_true, y_pred):
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    return {
        "MAE": mean_absolute_error(y_true, y_pred),
        "MSE": mean_squared_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred)
    }
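
# Minimal usage sketch (illustrative): "data/reviews.csv" is a hypothetical path,
# not part of this repo, and the constant-mean baseline below exists only to show
# how compute_custom_metrics is called.
if __name__ == "__main__":
    df = load_data("data/reviews.csv")
    df = preprocess_text(df)
    print(df[["toxicity_score", "readability_score", "is_anomalous"]].describe())

    # Score a trivial predict-the-mean baseline against the true helpfulness scores
    baseline = np.full(len(df), df["helpfulness_score"].mean())
    print(compute_custom_metrics(df["helpfulness_score"], baseline))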