Spaces:

junaidbaber
/

demo_lowcode_llm

Sleeping

App Files Files Community

junaidbaber committed on Jan 29

Commit

0d5774d

verified ·

1 Parent(s): fccfdf4

Update app.py

Browse files

Files changed (1) hide show

app.py +119 -58

app.py CHANGED Viewed

@@ -1,86 +1,134 @@
 import streamlit as st
 from huggingface_hub import login
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import torch
 import os
 def initialize_model():
-    """Initialize the model and tokenizer"""
     # Log in to Hugging Face with the token read from the "hf" environment variable
     token = os.environ.get("hf")
     # NOTE(review): login() receives whatever os.environ.get returned, which is None when "hf" is unset.
-    login(token)
-    # Define the model ID and device (CUDA when available, otherwise CPU)
-    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    # Configure INT8 quantization
-    bnb_config = BitsAndBytesConfig(
-        load_in_8bit=True,
-        llm_int8_enable_fp32_cpu_offload=True
-    )
-    # Load tokenizer and model
     tokenizer = AutoTokenizer.from_pretrained(model_id)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        quantization_config=bnb_config,
-        device_map="auto"
-    )
     # Ensure padding token is defined by reusing the EOS token when none is set
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
     # Callers receive the model, the tokenizer, and the selected torch device.
-    return model, tokenizer, device
 def format_conversation(conversation_history):
     """Format the conversation history into a single string."""
     # Each turn is read through its 'user' and 'assistant' keys and rendered as two labelled lines.
     formatted = ""
     for turn in conversation_history:
-        formatted += f"User: {turn['user']}\nAssistant: {turn['assistant']}\n"
     # strip() removes the trailing newline left after the last turn (and yields "" for an empty history).
     return formatted.strip()
-def generate_response(model, tokenizer, device, prompt, conversation_history):
     """Generate model response"""
     # Format the entire conversation context, leaving out the last history entry
     context = format_conversation(conversation_history[:-1])
     # The new user message is appended after the context, or stands alone when the context is empty.
     if context:
-        full_prompt = f"{context}\nUser: {prompt}"
     else:
-        full_prompt = f"User: {prompt}"
     # Tokenize input
-    inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True).to(device)
     # Calculate max new tokens: at most 200, bounded by what remains of a 2048-token budget
     input_length = inputs["input_ids"].shape[1]
-    max_model_length = 2048
-    max_new_tokens = min(200, max_model_length - input_length)
-    # Generate response
-    outputs = model.generate(
-        inputs["input_ids"],
-        attention_mask=inputs["attention_mask"],
-        max_new_tokens=max_new_tokens,
-        temperature=0.7,
-        top_p=0.9,
-        pad_token_id=tokenizer.pad_token_id,
-        do_sample=True,
-        min_length=20,
-        no_repeat_ngram_size=3
-    )
-    # Decode response, then keep the text after the last "User: " and "Assistant: " markers
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    response_parts = response.split("User: ")
-    model_response = response_parts[-1].split("Assistant: ")[-1].strip()
-    return model_response
 def main():
-    st.set_page_config(page_title="LLM Chat Interface", page_icon="🤖")
-    st.title("Chat with LLM 🤖")
     # Initialize session state for chat history
     if "chat_history" not in st.session_state:
@@ -89,10 +137,14 @@ def main():
     # Initialize model (only once): skipped when "model" is already in session state
     if "model" not in st.session_state:
         with st.spinner("Loading the model... This might take a minute..."):
-            model, tokenizer, device = initialize_model()
-            st.session_state.model = model
-            st.session_state.tokenizer = tokenizer
-            st.session_state.device = device
     # Display chat messages
     for message in st.session_state.chat_history:
@@ -116,7 +168,6 @@ def main():
                 response = generate_response(
                     st.session_state.model,
                     st.session_state.tokenizer,
-                    st.session_state.device,
                     prompt,
                     st.session_state.chat_history
                 )
@@ -128,10 +179,20 @@ def main():
         if len(st.session_state.chat_history) > 5:  # keep only the five most recent entries
             st.session_state.chat_history = st.session_state.chat_history[-5:]
-    # Add a clear chat button in the sidebar; clicking it empties the history and reruns the script
-    if st.sidebar.button("Clear Chat"):
-        st.session_state.chat_history = []
-        st.rerun()
 if __name__ == "__main__":
     main()

 import streamlit as st
 from huggingface_hub import login
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
+from transformers import BitsAndBytesConfig
 import os
 def initialize_model():
+    """Load the TinyLlama chat model and its tokenizer.
+
+    Logs in to Hugging Face when the "hf" environment variable is set,
+    tries a 4-bit bitsandbytes load first, and falls back to an
+    unquantized CPU load if that attempt raises.
+
+    Returns:
+        A (model, tokenizer) tuple.
+    """
     # Log in to Hugging Face
     token = os.environ.get("hf")
+    if token:
+        login(token)
+    # Use a smaller model that's more CPU-friendly
+    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Much smaller model
+    # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_id)
+    try:
+        # First try with bitsandbytes 4-bit quantization
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=False,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            quantization_config=bnb_config,
+            device_map="auto",
+            trust_remote_code=True
+        )
+    except Exception as exc:
+        # Catch Exception rather than using a bare except so that
+        # KeyboardInterrupt and SystemExit still propagate, and report why
+        # the quantized load failed instead of hiding it.
+        print("Falling back to CPU without quantization")
+        print(f"Quantized load failed: {exc!r}")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            device_map="cpu",
+            trust_remote_code=True,
+            low_cpu_mem_usage=True
+        )
     # Ensure padding token is defined
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
+    return model, tokenizer
 def format_conversation(conversation_history):
     """Format the conversation history into a single string."""
     # Each turn is read through its 'user' and 'assistant' keys and rendered as two labelled lines.
     formatted = ""
     for turn in conversation_history:
+        formatted += f"Human: {turn['user']}\nAssistant: {turn['assistant']}\n"
     # strip() removes the trailing newline left after the last turn (and yields "" for an empty history).
     return formatted.strip()
+def generate_response(model, tokenizer, prompt, conversation_history):
+    """Generate the assistant's reply to the latest user message.
+
+    Args:
+        model: Causal language model; its ``parameters()`` and
+            ``generate()`` methods are used.
+        tokenizer: Tokenizer used to encode the prompt and decode the reply.
+        prompt: The latest user message.
+        conversation_history: Earlier turns; every entry except the last is
+            rendered into the prompt via ``format_conversation``.
+
+    Returns:
+        The reply text, or a user-facing message when generation raises
+        ``RuntimeError``.
+    """
     # Format the entire conversation context
     context = format_conversation(conversation_history[:-1])
     if context:
+        full_prompt = f"{context}\nHuman: {prompt}"
     else:
+        full_prompt = f"Human: {prompt}"
     # Tokenize input
+    inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
+    # Move inputs to the same device as the model
+    device = next(model.parameters()).device
+    inputs = {k: v.to(device) for k, v in inputs.items()}
     # Calculate max new tokens
     input_length = inputs["input_ids"].shape[1]
+    max_model_length = 1024  # Reduced context window for memory efficiency
+    # Clamp to at least 1: a prompt at or beyond the budget would otherwise
+    # hand generate() a zero or negative max_new_tokens.
+    max_new_tokens = max(1, min(150, max_model_length - input_length))
+    try:
+        # Sample with a lower temperature for more focused responses
+        outputs = model.generate(
+            inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            max_new_tokens=max_new_tokens,
+            temperature=0.5,
+            top_p=0.9,
+            pad_token_id=tokenizer.pad_token_id,
+            do_sample=True,
+            min_length=10,  # Reduced minimum length
+            no_repeat_ngram_size=3
+        )
+        # Decode only the newly generated tokens so the prompt (and the
+        # user's own message) can never be returned as the reply.
+        generated_ids = outputs[0][input_length:]
+        response = tokenizer.decode(generated_ids, skip_special_tokens=True)
+        # Drop everything up to an "Assistant:" label if the model emitted one.
+        _, label, after_label = response.partition("Assistant:")
+        if label:
+            response = after_label
+        # Cut off any follow-up "Human:" turn the model invented.
+        model_response = response.split("Human:", 1)[0].strip()
+        return model_response
+    except RuntimeError as e:
+        if "out of memory" in str(e):
+            torch.cuda.empty_cache()
+            return "I apologize, but I ran out of memory. Please try a shorter message or clear the chat history."
+        else:
+            return f"An error occurred: {str(e)}"
 def main():
+    st.set_page_config(
+        page_title="LLM Chat Interface",
+        page_icon="🤖",
+        layout="wide"
+    )
+    # Add CSS to make the chat interface more compact (raw HTML, hence unsafe_allow_html=True)
+    st.markdown("""
+        <style>
+        .stChat {
+            padding-top: 0rem;
+        }
+        .stChatMessage {
+            padding: 0.5rem;
+        }
+        </style>
+    """, unsafe_allow_html=True)
+    st.title("Welcome to LowCode No Code Demo")
     # Initialize session state for chat history
     # NOTE(review): lines between the diff hunks are not shown in this view, so some bodies below appear missing.
     if "chat_history" not in st.session_state:
     # Initialize model (only once): skipped when "model" is already in session state
     if "model" not in st.session_state:
         with st.spinner("Loading the model... This might take a minute..."):
+            try:
+                model, tokenizer = initialize_model()
+                st.session_state.model = model
+                st.session_state.tokenizer = tokenizer
+                st.success("Model loaded successfully!")
+            except Exception as e:
+                st.error(f"Error loading model: {str(e)}")
+                return  # abandon this run after reporting the load failure
     # Display chat messages
     for message in st.session_state.chat_history:
                 response = generate_response(
                     st.session_state.model,
                     st.session_state.tokenizer,
                     prompt,
                     st.session_state.chat_history
                 )
         if len(st.session_state.chat_history) > 5:  # keep only the five most recent entries
             st.session_state.chat_history = st.session_state.chat_history[-5:]
+    # Sidebar controls: a clear-chat button plus a static model summary
+    with st.sidebar:
+        st.title("Controls")
+        if st.button("Clear Chat"):
+            st.session_state.chat_history = []
+            st.rerun()  # restart the script run after emptying the history
+        st.markdown("---")
+        st.markdown("""
+        ### Model Info
+        - Using TinyLlama 1.1B Chat
+        - Optimized for CPU usage
+        - Context window: 1024 tokens
+        """)
 if __name__ == "__main__":
     main()