	Update app.py
app.py CHANGED
@@ -5,7 +5,7 @@ from threading import Thread
 
 #Load the model
 model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
-model     = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora', device='
+model     = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora', device='cuda')
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 #Setup Inference Mode
@@ -16,9 +16,9 @@ model.config.use_cache  = True
 model.eval();
 
 # Optional: torch compile for faster inference
-
+model = torch.compile(model)
 
-def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cpu'):
+def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cuda'):
     tokenizer.use_default_system_prompt = False
     streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
 
@@ -47,27 +47,16 @@ def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cpu'):
 
     #torch.cuda.empty_cache()
 
-    return streamer
+    return t, streamer
 
-
-
-
-
-
-
-
-
-
-        print("Question: ", history[-1][0])
-        stream = chat_processor(chat=history[-1][0])
-        history[-1][1] = ""
-        for character in stream:
-            print(character)
-            history[-1][1] += character
-            yield history
-
-    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)
-    clear.click(lambda: None, None, chatbot, queue=False)
-
-demo.queue()
-demo.launch()
+def chat(message, history):
+    t, stream = chat_processor(chat=message)
+    response = ""
+    for character in stream:
+        response += character
+        yield response
+
+    t.join()
+    torch.cuda.empty_cache()
+
+gr.ChatInterface(chat).launch()
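For context: the hunks above only cover part of app.py. Below is a minimal sketch of what the updated file roughly looks like after this commit, reconstructed from the visible hunks. The import lines, the prompt handling, and the generation-thread setup inside chat_processor are not visible in this diff, so those parts are assumptions for illustration, not the Space's actual code.

# Sketch of the updated app.py after this commit. Lines marked "assumption"
# are not shown in the diff and are filled in for illustration only.
import torch
import transformers
import gradio as gr
from threading import Thread
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer  # assumption: exact import path not shown in the diff

# Load the quantized model (device='cuda' after this commit)
model_id  = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
model     = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora', device='cuda')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Optional: torch compile for faster inference (enabled by this commit)
model = torch.compile(model)

def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cuda'):
    # Stream tokens from a background generation thread and return both the
    # thread and the streamer, so the caller can join() once streaming ends.
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # assumption: the Space's actual prompt formatting and generate() kwargs are hidden in the diff
    inputs = tokenizer(chat, return_tensors='pt').to(device)
    t = Thread(target=model.generate,
               kwargs=dict(**inputs, streamer=streamer,
                           max_new_tokens=max_new_tokens, do_sample=do_sample))
    t.start()

    return t, streamer

def chat(message, history):
    # gr.ChatInterface calls this with (message, history) and accepts a generator:
    # each yielded string replaces the bot message, which streams the reply.
    t, stream = chat_processor(chat=message)
    response = ""
    for character in stream:
        response += character
        yield response

    t.join()
    torch.cuda.empty_cache()

gr.ChatInterface(chat).launch()

The main behavioral change in this commit is that the hand-rolled gr.Blocks chat loop (msg.submit / clear.click, demo.queue(), demo.launch()) is replaced by a single generator function handed to gr.ChatInterface, which manages the chat history, the UI, and request queueing itself.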