	Roll back interruption changes
utils/models.py  (+13 -69)
--- a/utils/models.py
+++ b/utils/models.py
@@ -1,5 +1,5 @@
 import os
-#
+# Keep Dynamo error suppression
 import torch._dynamo
 torch._dynamo.config.suppress_errors = True
 
@@ -17,7 +17,8 @@ from transformers import (
     BitNetForCausalLM
 )
 from .prompts import format_rag_prompt
-from .shared import generation_interrupt
+# Remove interrupt import
+# from .shared import generation_interrupt
 
 models = {
     "Qwen2.5-1.5b-Instruct": "qwen/qwen2.5-1.5b-instruct",
@@ -47,13 +48,13 @@ tokenizer_cache = {}
 model_names = list(models.keys())
 
 
-#
-class InterruptCriteria(StoppingCriteria):
-    def __init__(self, interrupt_event):
-        self.interrupt_event = interrupt_event
-
-    def __call__(self, input_ids, scores, **kwargs):
-        return self.interrupt_event.is_set()
+# Remove interrupt criteria class since we're not using it
+# class InterruptCriteria(StoppingCriteria):
+#     def __init__(self, interrupt_event):
+#         self.interrupt_event = interrupt_event
+#
+#     def __call__(self, input_ids, scores, **kwargs):
+#         return self.interrupt_event.is_set()
 
 
 @spaces.GPU
@@ -61,20 +62,12 @@ def generate_summaries(example, model_a_name, model_b_name):
     """
     Generates summaries for the given example using the assigned models sequentially.
     """
-    if generation_interrupt.is_set():
-        print("Generation interrupted before starting")
-        return "", ""
-
+    # Remove interrupt checks
     context_text = ""
     context_parts = []
 
     if "full_contexts" in example and example["full_contexts"]:
         for i, ctx in enumerate(example["full_contexts"]):
-            # Check interrupt during context processing
-            if generation_interrupt.is_set():
-                print("Generation interrupted during context processing")
-                return "", ""
-
             content = ""
 
             # Extract content from either dict or string
@@ -97,18 +90,10 @@ def generate_summaries(example, model_a_name, model_b_name):
 
     question = example.get("question", "")
 
-    if generation_interrupt.is_set():
-        print("Generation interrupted before model A")
-        return "", ""
-
     print(f"Starting inference for Model A: {model_a_name}")
     # Run model A
     summary_a = run_inference(models[model_a_name], context_text, question)
 
-    if generation_interrupt.is_set():
-        print("Generation interrupted after model A, before model B")
-        return summary_a, ""
-
     print(f"Starting inference for Model B: {model_b_name}")
     # Run model B
     summary_b = run_inference(models[model_b_name], context_text, question)
@@ -121,13 +106,8 @@ def generate_summaries(example, model_a_name, model_b_name):
 def run_inference(model_name, context, question):
     """
     Run inference using the specified model.
-    Returns the generated text
+    Returns the generated text.
     """
-    # Check interrupt at the beginning
-    if generation_interrupt.is_set():
-        print(f"Inference interrupted before starting for {model_name}")
-        return ""
-
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     result = ""
     tokenizer_kwargs = {
@@ -146,11 +126,6 @@ def run_inference(model_name, context, question):
         if model_name in tokenizer_cache:
             tokenizer = tokenizer_cache[model_name]
         else:
-            # Check interrupt before loading tokenizer
-            if generation_interrupt.is_set():
-                print(f"Inference interrupted before loading tokenizer for {model_name}")
-                return ""
-
             # Common arguments for tokenizer loading
             tokenizer_load_args = {"padding_side": "left", "token": True}
 
@@ -170,21 +145,8 @@ def run_inference(model_name, context, question):
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
-        # Check interrupt before loading the model
-        if generation_interrupt.is_set():
-            print(f"Inference interrupted before loading model {model_name}")
-            return ""
-
-        # Create interrupt criteria for this generation
-        interrupt_criteria = InterruptCriteria(generation_interrupt)
-
         print("REACHED HERE BEFORE pipe")
         print(f"Loading model {model_name}...")
-
-        # Check interrupt before model loading
-        if generation_interrupt.is_set():
-            print(f"Inference interrupted during model loading for {model_name}")
-            return ""
 
         if "bitnet" in model_name.lower():
             bitnet_model = BitNetForCausalLM.from_pretrained(
@@ -226,11 +188,6 @@ def run_inference(model_name, context, question):
                 torch_dtype=torch.bfloat16,
             )
 
-        # Final interrupt check before generation
-        if generation_interrupt.is_set():
-            print(f"Inference interrupted before generation for {model_name}")
-            return ""
-
         text_input = format_rag_prompt(question, context, accepts_sys)
 
         print(f"Starting generation for {model_name}")
@@ -239,7 +196,6 @@ def run_inference(model_name, context, question):
             result = pipe(
                 text_input,
                 max_new_tokens=512,
-                stopping_criteria=[interrupt_criteria],
                 generation_kwargs={"skip_special_tokens": True}
             )[0]["generated_text"]
 
@@ -263,18 +219,12 @@ def run_inference(model_name, context, question):
             prompt_tokens_length = input_ids.shape[1]
 
             with torch.inference_mode():
-                # Check interrupt before generation
-                if generation_interrupt.is_set():
-                    print(f"Inference interrupted before torch generation for {model_name}")
-                    return ""
-
                 output_sequences = model.generate(
                     input_ids=input_ids,
                     attention_mask=attention_mask,
                     max_new_tokens=512,
                     eos_token_id=tokenizer.eos_token_id,
-                    pad_token_id=tokenizer.pad_token_id,
-                    stopping_criteria=[interrupt_criteria]
+                    pad_token_id=tokenizer.pad_token_id
                 )
 
             generated_token_ids = output_sequences[0][prompt_tokens_length:]
@@ -288,15 +238,10 @@ def run_inference(model_name, context, question):
         #         **tokenizer_kwargs,
        #     ).to(bitnet_model.device)
         #     with torch.inference_mode():
-        #         # Check interrupt before generation
-        #         if generation_interrupt.is_set():
-        #             return ""
         #         output_sequences = bitnet_model.generate(
         #             **formatted,
         #             max_new_tokens=512,
-        #             stopping_criteria=[interrupt_criteria]
         #         )
-
         #         result = tokenizer.decode(output_sequences[0][formatted['input_ids'].shape[-1]:], skip_special_tokens=True)
         else:  # For other models
             formatted = pipe.tokenizer.apply_chat_template(
@@ -310,7 +255,6 @@ def run_inference(model_name, context, question):
             outputs = pipe(
                 formatted,
                 max_new_tokens=512,
-                stopping_criteria=[interrupt_criteria],
                 generation_kwargs={"skip_special_tokens": True}
             )
             result = outputs[0]["generated_text"][input_length:]
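For context, the mechanism this commit rolls back is the standard transformers cooperative-interrupt pattern: a threading.Event shared with the UI is polled at coarse checkpoints and, through a StoppingCriteria subclass passed to generate()/pipeline(), once per generated token. After the rollback, generation always runs to completion (up to max_new_tokens). Below is a minimal, self-contained sketch of that removed pattern; the helper name generate_with_interrupt and the direct use of AutoTokenizer/AutoModelForCausalLM (instead of the Space's cached pipelines) are illustrative assumptions, not the Space's exact code.

import threading

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
)

# Event a UI callback (e.g. a "stop" button) would set to request cancellation.
generation_interrupt = threading.Event()


class InterruptCriteria(StoppingCriteria):
    """Stop generation as soon as the shared event is set; generate() invokes this after each new token."""

    def __init__(self, interrupt_event):
        self.interrupt_event = interrupt_event

    def __call__(self, input_ids, scores, **kwargs):
        return self.interrupt_event.is_set()


def generate_with_interrupt(model_id, prompt, max_new_tokens=512):
    # Cheap early exit before any expensive loading, mirroring the removed checks.
    if generation_interrupt.is_set():
        return ""

    tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            # Token-level cancellation: checked between decoding steps.
            stopping_criteria=StoppingCriteriaList([InterruptCriteria(generation_interrupt)]),
        )
    # Decode only the newly generated tokens, skipping the prompt.
    return tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

A caller would run generate_with_interrupt() in a worker thread and call generation_interrupt.set() from the UI thread; generation then stops at the next token boundary and the partial text generated so far is returned.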
 
			
