Create metrics.py
metrics.py +64 -0
metrics.py
ADDED
@@ -0,0 +1,64 @@
# metrics.py
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

# Load Sentence-BERT model for semantic similarity
sentence_bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Load a pre-trained emotion classifier (swap in a different emotion model if preferred)
emotion_classifier = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion", top_k=None)

def compute_semantic_similarity(original_comment, paraphrased_comment):
    """
    Compute the semantic similarity between the original and paraphrased comments using Sentence-BERT.
    Returns a cosine similarity score, typically between 0 and 1 (higher is better).
    """
    # Encode the comments into embeddings
    original_embedding = sentence_bert_model.encode(original_comment, convert_to_tensor=True)
    paraphrased_embedding = sentence_bert_model.encode(paraphrased_comment, convert_to_tensor=True)

    # Compute cosine similarity
    similarity_score = util.cos_sim(original_embedding, paraphrased_embedding)[0][0].item()
    return round(similarity_score, 2)

def compute_emotion_shift(original_comment, paraphrased_comment):
    """
    Compute the shift in emotional tone between the original and paraphrased comments.
    Returns the dominant emotion labels for both comments and a flag indicating whether the shift is positive.
    """
    # Classify emotions in the original comment
    original_emotions = emotion_classifier(original_comment)
    if original_emotions and isinstance(original_emotions[0], list):
        # Some transformers versions wrap single-string results in an extra list
        original_emotions = original_emotions[0]
    original_dominant_emotion = max(original_emotions, key=lambda x: x['score'])['label']

    # Classify emotions in the paraphrased comment
    paraphrased_emotions = emotion_classifier(paraphrased_comment)
    if paraphrased_emotions and isinstance(paraphrased_emotions[0], list):
        paraphrased_emotions = paraphrased_emotions[0]
    paraphrased_dominant_emotion = max(paraphrased_emotions, key=lambda x: x['score'])['label']

    # Define negative and positive emotions
    negative_emotions = ['anger', 'sadness', 'fear']
    positive_emotions = ['joy', 'love']

    # Check if the shift is positive (e.g., from a negative emotion to a neutral/positive one)
    is_positive_shift = (
        original_dominant_emotion in negative_emotions and
        (paraphrased_dominant_emotion in positive_emotions or paraphrased_dominant_emotion not in negative_emotions)
    )

    return original_dominant_emotion, paraphrased_dominant_emotion, is_positive_shift

def compute_empathy_score(paraphrased_comment):
    """
    Compute a proxy empathy score based on politeness keywords.
    Returns a score between 0 and 1 (higher indicates more empathy).
    """
    # Define a list of politeness/empathy-related keywords
    empathy_keywords = ['please', 'thank you', 'appreciate', 'understand', 'sorry', 'consider', 'kindly', 'help', 'support']

    # Count the number of empathy keywords in the paraphrased comment
    comment_lower = paraphrased_comment.lower()
    keyword_count = sum(1 for keyword in empathy_keywords if keyword in comment_lower)

    # Normalize the score (arbitrary scaling: three or more keywords cap the score at 1.0)
    empathy_score = min(keyword_count / 3, 1.0)
    return round(empathy_score, 2)
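For reference, the sketch below shows how the three metrics might be wired together; it is not part of the committed file. The example comment strings, the driver script, and the `if __name__ == "__main__"` guard are illustrative assumptions. It expects metrics.py to be importable from the working directory, and the first run downloads both models from the Hugging Face Hub, so it is noticeably slower than later runs.

# Example usage (illustrative sketch, not part of metrics.py)
from metrics import compute_semantic_similarity, compute_emotion_shift, compute_empathy_score

if __name__ == "__main__":
    # Hypothetical original / paraphrased code-review comments
    original = "This patch is a mess. Rewrite it before you waste more of my time."
    paraphrased = "Thanks for the effort so far. Could you please clean up this patch when you get a chance?"

    # Each call returns the values described in the docstrings above
    similarity = compute_semantic_similarity(original, paraphrased)
    orig_emotion, para_emotion, positive_shift = compute_emotion_shift(original, paraphrased)
    empathy = compute_empathy_score(paraphrased)

    print(f"Semantic similarity: {similarity}")
    print(f"Emotion shift: {orig_emotion} -> {para_emotion} (positive shift: {positive_shift})")
    print(f"Empathy score: {empathy}")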