Update metrics.py
metrics.py  +48 -7
@@ -8,6 +8,50 @@ def softmax(logits):
     exp_logits = np.exp(logits - np.max(logits))
     return exp_logits / exp_logits.sum()
 
+def compute_semantic_similarity(original, paraphrase):
+    """
+    Compute semantic similarity between original and paraphrased text using Sentence-BERT.
+    Returns a similarity score between 0 and 1.
+    """
+    try:
+        if not isinstance(original, str) or not isinstance(paraphrase, str):
+            print(f"Invalid input for semantic similarity: original={original}, paraphrase={paraphrase}")
+            return 0.0
+        if "Error: Unable to generate paraphrase" in paraphrase:
+            print(f"Invalid paraphrase: {paraphrase}. Returning similarity 0.0.")
+            return 0.0
+
+        sentence_bert = metrics_models.sentence_bert
+        embeddings = sentence_bert.encode([original, paraphrase], convert_to_tensor=True)
+        similarity = torch.cosine_similarity(embeddings[0], embeddings[1], dim=0).item()
+        print(f"Semantic similarity computed: {similarity}")
+        return similarity
+
+    except Exception as e:
+        print(f"Error computing semantic similarity: {str(e)}")
+        return 0.0
+
+def compute_empathy_score(toxicity):
+    """
+    Placeholder for empathy score computation.
+    For now, inversely proportional to toxicity.
+    """
+    return 1.0 - toxicity
+
+def compute_bias_score(toxicity):
+    """
+    Placeholder for bias score computation.
+    For now, equal to toxicity.
+    """
+    return toxicity
+
+def compute_hallucination_score(similarity):
+    """
+    Compute hallucination score based on semantic similarity.
+    High difference means potential hallucination.
+    """
+    return 1.0 - similarity
+
 def compute_reward_scores(original, paraphrase):
     """
     Compute reward scores for a paraphrased comment.
@@ -44,17 +88,14 @@ def compute_reward_scores(original, paraphrase):
     probs = softmax(logits)
 
     toxicity = probs[1] # Assuming label 1 is toxic
-    empathy =
-    bias =
+    empathy = compute_empathy_score(toxicity)
+    bias = compute_bias_score(toxicity)
     print(f"Classification took {time.time() - start_time:.2f} seconds")
 
     # Compute semantic similarity using Sentence-BERT
     print("Computing semantic similarity...")
-
-
-    similarity = torch.cosine_similarity(embeddings[0], embeddings[1], dim=0).item()
-    hallucination = 1.0 - similarity # High difference means potential hallucination
-    print(f"Semantic similarity computed: {similarity}")
+    similarity = compute_semantic_similarity(original, paraphrase)
+    hallucination = compute_hallucination_score(similarity)
 
     # Compute reward score (weighted combination)
     reward = 0.4 * empathy - 0.2 * toxicity - 0.2 * bias - 0.2 * hallucination