Spaces:

AvocadoMuffin
/

eval_model

Sleeping

App Files Community

AvocadoMuffin committed on Jul 11

Commit

e010197

verified ·

1 Parent(s): 380b5ba

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -88

app.py CHANGED Viewed

@@ -4,151 +4,126 @@ import numpy as np
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForQuestionAnswering
 import torch
-from sklearn.metrics import f1_score
-import re
 from collections import Counter
 import string
-from huggingface_hub import login
-import gradio as gr
 import pandas as pd
 from datetime import datetime
-import matplotlib.pyplot as plt
-# Normalization functions (same as extractor)
 def normalize_answer(s):
-    def remove_articles(text):
-        return re.sub(r'\b(a|an|the)\b', ' ', text)
-    def white_space_fix(text):
-        return ' '.join(text.split())
-    def remove_punc(text):
-        exclude = set(string.punctuation)
-        return ''.join(ch for ch in text if ch not in exclude)
-    def lower(text):
-        return text.lower()
     return white_space_fix(remove_articles(remove_punc(lower(s))))
-def f1_score_qa(prediction, ground_truth):
-    prediction_tokens = normalize_answer(prediction).split()
-    ground_truth_tokens = normalize_answer(ground_truth).split()
-    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
     num_same = sum(common.values())
-    if num_same == 0:
-        return 0
-    precision = 1.0 * num_same / len(prediction_tokens)
-    recall = 1.0 * num_same / len(ground_truth_tokens)
     return (2 * precision * recall) / (precision + recall)
-def exact_match_score(prediction, ground_truth):
-    return normalize_answer(prediction) == normalize_answer(ground_truth)
-# Identical confidence calculation to extractor
-def calculate_confidence(model, tokenizer, question, context):
     inputs = tokenizer(
-        question,
-        context,
         return_tensors="pt",
         truncation=True,
         max_length=512,
         stride=128,
         padding=True
     )
     if torch.cuda.is_available():
-        inputs = {k: v.cuda() for k, v in inputs.items()}
         model = model.cuda()
     with torch.no_grad():
         outputs = model(**inputs)
     start_probs = torch.softmax(outputs.start_logits, dim=1)
     end_probs = torch.softmax(outputs.end_logits, dim=1)
     answer_start = torch.argmax(outputs.start_logits)
     answer_end = torch.argmax(outputs.end_logits) + 1
-    start_prob = start_probs[0, answer_start].item()
-    end_prob = end_probs[0, answer_end-1].item()
-    confidence = np.sqrt(start_prob * end_prob)
     answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
-    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
-    return answer, float(confidence)
 def run_evaluation(num_samples=100):
-    # Authenticate
-    if token := os.getenv("HF_TOKEN"):
-        login(token=token)
-    # Load model same as extractor
     model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForQuestionAnswering.from_pretrained(model_name)
-    # Load CUAD dataset
-    dataset = load_dataset("theatticusproject/cuad-qa", token=token)
-    test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
     results = []
     for example in test_data:
         context = example["context"]
         question = example["question"]
         gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
-        pred_answer, confidence = calculate_confidence(model, tokenizer, question, context)
         results.append({
-            "question": question,
-            "prediction": pred_answer,
-            "ground_truth": gt_answer,
-            "confidence": confidence,
-            "exact_match": exact_match_score(pred_answer, gt_answer),
-            "f1": f1_score_qa(pred_answer, gt_answer)
         })
     # Generate report
     df = pd.DataFrame(results)
-    avg_metrics = {
-        "exact_match": df["exact_match"].mean() * 100,
-        "f1": df["f1"].mean() * 100,
-        "confidence": df["confidence"].mean() * 100
-    }
-    # Confidence calibration analysis
-    high_conf_correct = df[(df["confidence"] > 0.8) & (df["exact_match"] == 1)].shape[0]
-    high_conf_total = df[df["confidence"] > 0.8].shape[0]
     report = f"""
-    CUAD Evaluation Report (n={len(df)})
-    ========================
-    Accuracy:
-    - Exact Match: {avg_metrics['exact_match']:.2f}%
-    - F1 Score: {avg_metrics['f1']:.2f}%
-    Confidence Analysis:
-    - Avg Confidence: {avg_metrics['confidence']:.2f}%
-    - High-Confidence (>80%) Accuracy: {high_conf_correct}/{high_conf_total} ({high_conf_correct/max(1,high_conf_total)*100:.1f}%)
-    Confidence vs Accuracy:
-    {df[['confidence', 'exact_match']].corr().iloc[0,1]:.3f} correlation
     """
-    # Save results
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    results_file = f"cuad_eval_{timestamp}.json"
-    with open(results_file, "w") as f:
         json.dump({
-            "metrics": avg_metrics,
-            "samples": results,
-            "config": {
-                "model": model_name,
-                "confidence_method": "geometric_mean_start_end_probs"
-            }
         }, f, indent=2)
     return report, df, results_file
 if __name__ == "__main__":
-    report, df, _ = run_evaluation()
     print(report)
     print("\nSample predictions:")
-    print(df.head())

 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForQuestionAnswering
 import torch
 from collections import Counter
 import string
 import pandas as pd
 from datetime import datetime
+# Normalization functions
 def normalize_answer(s):
+    def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
+    def white_space_fix(text): return ' '.join(text.split())
+    def remove_punc(text):
+        return ''.join(ch for ch in text if ch not in set(string.punctuation))
+    def lower(text): return text.lower()
     return white_space_fix(remove_articles(remove_punc(lower(s))))
+# Metrics
+def exact_match_score(pred, truth):
+    return int(normalize_answer(pred) == normalize_answer(truth))
+def f1_score_qa(pred, truth):
+    pred_tokens = normalize_answer(pred).split()
+    truth_tokens = normalize_answer(truth).split()
+    common = Counter(pred_tokens) & Counter(truth_tokens)
     num_same = sum(common.values())
+    if num_same == 0: return 0
+    precision = num_same / len(pred_tokens)
+    recall = num_same / len(truth_tokens)
     return (2 * precision * recall) / (precision + recall)
+# Identical to extractor's QA confidence
+def get_qa_confidence(model, tokenizer, question, context):
     inputs = tokenizer(
+        question, context,
         return_tensors="pt",
         truncation=True,
         max_length=512,
         stride=128,
         padding=True
     )
     if torch.cuda.is_available():
+        inputs = {k:v.cuda() for k,v in inputs.items()}
         model = model.cuda()
     with torch.no_grad():
         outputs = model(**inputs)
     start_probs = torch.softmax(outputs.start_logits, dim=1)
     end_probs = torch.softmax(outputs.end_logits, dim=1)
     answer_start = torch.argmax(outputs.start_logits)
     answer_end = torch.argmax(outputs.end_logits) + 1
+    confidence = np.sqrt(
+        start_probs[0, answer_start].item() *
+        end_probs[0, answer_end-1].item()
+    )
     answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
+    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
+    return answer.strip(), float(confidence)
 def run_evaluation(num_samples=100):
+    # Load CUAD with remote code trust
+    dataset = load_dataset(
+        "theatticusproject/cuad-qa",
+        trust_remote_code=True,
+        token=os.getenv("HF_TOKEN", True)  # True allows anonymous access
+    )
+    test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
+    # Load model
     model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForQuestionAnswering.from_pretrained(model_name)
     results = []
     for example in test_data:
         context = example["context"]
         question = example["question"]
         gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
+        pred, conf = get_qa_confidence(model, tokenizer, question, context)
         results.append({
+            "question": question[:100] + "..." if len(question) > 100 else question,
+            "prediction": pred,
+            "confidence": conf,
+            "exact_match": exact_match_score(pred, gt_answer),
+            "f1": f1_score_qa(pred, gt_answer),
+            "ground_truth": gt_answer
         })
     # Generate report
     df = pd.DataFrame(results)
     report = f"""
+    Evaluation Results (n={len(df)})
+    =================
+    Exact Match: {df['exact_match'].mean():.1%}
+    F1 Score: {df['f1'].mean():.1%}
+    Avg Confidence: {df['confidence'].mean():.1%}
+    High-Confidence Accuracy: {
+        df[df['confidence'] > 0.8]['exact_match'].mean():.1%}
     """
+    # Save
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    results_file = f"eval_results_{timestamp}.json"
+    with open(results_file, 'w') as f:
         json.dump({
+            "config": {"model": model_name, "dataset": "cuad-qa"},
+            "metrics": {
+                "exact_match": float(df['exact_match'].mean()),
+                "f1": float(df['f1'].mean()),
+                "confidence": float(df['confidence'].mean())
+            },
+            "samples": results
         }, f, indent=2)
     return report, df, results_file
 if __name__ == "__main__":
+    report, df, _ = run_evaluation(num_samples=50)
     print(report)
     print("\nSample predictions:")
+    print(df[["question", "confidence", "exact_match"]].head())