SLM-RAG-Arena

Running on Zero

App Files Community

oliver-aizip committed on May 7

Commit

b8ee0a2

1 Parent(s): 5021e53

prepare for zeroGPU

Browse files

Files changed (2) hide show

requirements.txt +2 -1
utils/models.py +43 -28

requirements.txt CHANGED Viewed

@@ -6,4 +6,5 @@ numpy==1.26.4
 openai>=1.60.2
 torch>=2.5.1
 tqdm==4.67.1
-vllm>=0.8.5

 openai>=1.60.2
 torch>=2.5.1
 tqdm==4.67.1
+vllm>=0.8.5
+spaces

utils/models.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import os
 os.environ['MKL_THREADING_LAYER'] = 'GNU'
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
 from .prompts import format_rag_prompt
 from .shared import generation_interrupt
 import threading
@@ -104,6 +105,7 @@ def generate_summaries(example, model_a_name, model_b_name):
 # Modified run_inference to run in a thread and use a queue for results
 def run_inference(model_name, context, question, result_queue):
     """
     Run inference using the specified model. Designed to be run in a thread.
@@ -125,14 +127,26 @@ def run_inference(model_name, context, question, result_queue):
             "System role not supported" not in tokenizer.chat_template
             if tokenizer.chat_template else False # Handle missing chat_template
         )
-        # if tokenizer.pad_token is None:
-        #     tokenizer.pad_token = tokenizer.eos_token
-        # # Check interrupt before loading the model
-        # if generation_interrupt.is_set():
-        #      result_queue.put("")
-        #      return
         # model = AutoModelForCausalLM.from_pretrained(
         #     model_name, torch_dtype=torch.bfloat16, attn_implementation="eager", token=True
@@ -141,10 +155,10 @@ def run_inference(model_name, context, question, result_queue):
         text_input = format_rag_prompt(question, context, accepts_sys)
-        # # Check interrupt before tokenization/template application
-        # if generation_interrupt.is_set():
-        #      result_queue.put("")
-        #      return
         # actual_input = tokenizer.apply_chat_template(
         #     text_input,
@@ -156,7 +170,8 @@ def run_inference(model_name, context, question, result_queue):
         #     max_length=2048, # Keep original max_length for now
         #     add_generation_prompt=True,
         # ).to(device)
         # # Ensure input does not exceed model max length after adding generation prompt
         # # This check might be redundant if tokenizer handles it, but good for safety
         # # if actual_input.shape[1] > tokenizer.model_max_length:
@@ -193,23 +208,23 @@ def run_inference(model_name, context, question, result_queue):
         # else:
         #     # Decode the generated tokens, excluding the input tokens
         #     result = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
-        llm = LLM(model_name, dtype=torch.bfloat16, hf_token=True, enforce_eager=True, device="cpu")
-        params = SamplingParams(
-            max_tokens=512,
-            )
-        # Check interrupt before generation
-        if generation_interrupt.is_set():
-            result_queue.put("")
-            return
-        # Generate the response
-        outputs = llm.chat(
-            text_input,
-            sampling_params=params,
-            # stopping_criteria=StoppingCriteriaList([InterruptCriteria(generation_interrupt)]),
-        )
-        # Check interrupt immediately after generation finishes or stops
-        result_queue.put(outputs[0].outputs[0].text)
     except Exception as e:
         print(f"Error in inference thread for {model_name}: {e}")

 import os
 os.environ['MKL_THREADING_LAYER'] = 'GNU'
+import spaces
 import torch
+from transformers import pipeline, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
 from .prompts import format_rag_prompt
 from .shared import generation_interrupt
 import threading
 # Modified run_inference to run in a thread and use a queue for results
+@spaces.GPU
 def run_inference(model_name, context, question, result_queue):
     """
     Run inference using the specified model. Designed to be run in a thread.
             "System role not supported" not in tokenizer.chat_template
             if tokenizer.chat_template else False # Handle missing chat_template
         )
+        outputs = ""
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        # Check interrupt before loading the model
+        if generation_interrupt.is_set():
+             result_queue.put("")
+             return
+        pipe = pipeline(
+            "text-generation",
+            model=model_name,
+            tokenizer=tokenizer,
+            device_map='auto',
+            max_length=512,
+            do_sample=True,
+            temperature=0.6,
+            top_p=0.9,
+        )
         # model = AutoModelForCausalLM.from_pretrained(
         #     model_name, torch_dtype=torch.bfloat16, attn_implementation="eager", token=True
         text_input = format_rag_prompt(question, context, accepts_sys)
+        # Check interrupt before tokenization/template application
+        if generation_interrupt.is_set():
+             result_queue.put("")
+             return
         # actual_input = tokenizer.apply_chat_template(
         #     text_input,
         #     max_length=2048, # Keep original max_length for now
         #     add_generation_prompt=True,
         # ).to(device)
+        output = pipe(text_input, max_new_tokens=512)
+        result = output[0]['generated_text'][-1]['content']
         # # Ensure input does not exceed model max length after adding generation prompt
         # # This check might be redundant if tokenizer handles it, but good for safety
         # # if actual_input.shape[1] > tokenizer.model_max_length:
         # else:
         #     # Decode the generated tokens, excluding the input tokens
         #     result = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
+        # llm = LLM(model_name, dtype=torch.bfloat16, hf_token=True, enforce_eager=True, device="cpu")
+        # params = SamplingParams(
+        #     max_tokens=512,
+        #     )
+        # # Check interrupt before generation
+        # if generation_interrupt.is_set():
+        #     result_queue.put("")
+        #     return
+        # # Generate the response
+        # outputs = llm.chat(
+        #     text_input,
+        #     sampling_params=params,
+        #     # stopping_criteria=StoppingCriteriaList([InterruptCriteria(generation_interrupt)]),
+        # )
+        # # Check interrupt immediately after generation finishes or stops
+        result_queue.put(result)
     except Exception as e:
         print(f"Error in inference thread for {model_name}: {e}")