Commit: 116a714
Parent: 83d0454

try using pipe but explicitly set model to be BitNetForCausalLM
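Concretely, the change hands an explicitly instantiated model object to pipeline() instead of relying on the manual generate() path. The loading call itself sits just above the first hunk and is not visible in the diff; below is a minimal sketch of what it presumably looks like, assuming the standard transformers loading API (the checkpoint id and the use of AutoModelForCausalLM are illustrative assumptions, not code taken from the repo).

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative checkpoint id; the Space's actual model id is not shown in this diff.
model_name = "microsoft/bitnet-b1.58-2B-4T"

# With trust_remote_code=True a BitNet checkpoint can resolve to its
# BitNetForCausalLM implementation, which is what the commit message refers to.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
bitnet_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)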
Files changed: utils/models.py (+29, -18)

utils/models.py (CHANGED)
@@ -163,6 +163,17 @@ def run_inference(model_name, context, question):
             torch_dtype=torch.bfloat16,
             trust_remote_code=True,
         )
+        pipe = pipeline(
+            "text-generation",
+            model=bitnet_model,
+            tokenizer=tokenizer,
+            device_map="cuda",
+            trust_remote_code=True,
+            torch_dtype=torch.bfloat16,
+            model_kwargs={
+                "attn_implementation": "eager",
+            },
+        )
     elif "icecream" not in model_name.lower():
         pipe = pipeline(
             "text-generation",
@@ -233,24 +244,24 @@ def run_inference(model_name, context, question):
 
         generated_token_ids = output_sequences[0][prompt_tokens_length:]
         result = tokenizer.decode(generated_token_ids, skip_special_tokens=True)
-    elif "bitnet" in model_name.lower():
-        formatted = tokenizer.apply_chat_template(
-            text_input,
-            tokenize=True,
-            return_tensors="pt",
-            return_dict=True,
-            **tokenizer_kwargs,
-        ).to(bitnet_model.device)
-        with torch.inference_mode():
-            # Check interrupt before generation
-            if generation_interrupt.is_set():
-                return ""
-            output_sequences = bitnet_model.generate(
-                **formatted,
-                max_new_tokens=512,
-            )
-
-            result = tokenizer.decode(output_sequences[0][formatted['input_ids'].shape[-1]:], skip_special_tokens=True)
+    # elif "bitnet" in model_name.lower():
+    #     formatted = tokenizer.apply_chat_template(
+    #         text_input,
+    #         tokenize=True,
+    #         return_tensors="pt",
+    #         return_dict=True,
+    #         **tokenizer_kwargs,
+    #     ).to(bitnet_model.device)
+    #     with torch.inference_mode():
+    #         # Check interrupt before generation
+    #         if generation_interrupt.is_set():
+    #             return ""
+    #         output_sequences = bitnet_model.generate(
+    #             **formatted,
+    #             max_new_tokens=512,
+    #         )
+
+    #         result = tokenizer.decode(output_sequences[0][formatted['input_ids'].shape[-1]:], skip_special_tokens=True)
     else: # For other models
         formatted = pipe.tokenizer.apply_chat_template(
             text_input,
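With the direct generate() path commented out in the second hunk, the BitNet branch is expected to flow through the same pipeline call pattern as the generic else-branch. Below is a rough, self-contained sketch of that downstream call, assuming the pipe built in the first hunk is passed in; the helper name, prompt wording, and the tokenize=False template call are illustrative assumptions, not code from the repo.

# Sketch of invoking the pipeline built in the first hunk, mirroring the
# generic else-branch at the end of the diff.
def answer_with_pipe(pipe, context, question, max_new_tokens=512):
    messages = [
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"},
    ]
    prompt = pipe.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # return_full_text=False keeps only the newly generated continuation,
    # playing the role of the manual prompt-length slicing in the old code.
    outputs = pipe(prompt, max_new_tokens=max_new_tokens, return_full_text=False)
    return outputs[0]["generated_text"]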