import numpy as np
import re
import string
from detoxify import Detoxify

detox_model = Detoxify('original')

def preprocess_comment(text):
    text = re.sub(r"http\S+", "", text)
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text.strip()

def extract_metadata_features(text):
    toxicity = detox_model.predict(text)["toxicity"]
    word_count = len(text.split())
    readability = min(1.0, word_count / 50)  # Normalize to [0,1]
    engagement = min(1.0, sum(1 for w in text.split() if len(w) > 6) / word_count) if word_count else 0
    return np.array([toxicity, readability, engagement], dtype=np.float32)