Spaces:

junaidbaber
/

demo_lowcode_llm

Sleeping

App Files Community

junaidbaber committed on Jan 29

Commit

97a2367

verified ·

1 Parent(s): 03b1321

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -118

app.py CHANGED Viewed

@@ -1,144 +1,82 @@
 import streamlit as st
-from huggingface_hub import login
-from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
-from transformers import BitsAndBytesConfig
 import os
 def initialize_model():
-    """Initialize the model and tokenizer with CPU support"""
-    # Log in to Hugging Face
-    token = os.environ.get("hf")
-    if token:
-        login(token)
-    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    # Load tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
     try:
-        # Try with regular CPU mode first (simpler and more reliable)
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
             device_map="cpu",
-            trust_remote_code=True,
-            low_cpu_mem_usage=True
         )
     except Exception as e:
         print(f"Error loading model: {str(e)}")
         raise e
-    # Ensure padding token is defined
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-    return model, tokenizer
-def format_prompt(user_input, conversation_history=[]):
-    """Format the prompt according to TinyLlama's expected chat format"""
-    messages = []
-    # Add conversation history
-    for turn in conversation_history:
-        messages.append({"role": "user", "content": turn["user"]})
-        messages.append({"role": "assistant", "content": turn["assistant"]})
-    # Add current user input
-    messages.append({"role": "user", "content": user_input})
-    # Format into TinyLlama chat format
-    formatted_prompt = "<|system|>You are a helpful AI assistant.</s>"
-    for message in messages:
-        if message["role"] == "user":
-            formatted_prompt += f"<|user|>{message['content']}</s>"
-        else:
-            formatted_prompt += f"<|assistant|>{message['content']}</s>"
-    formatted_prompt += "<|assistant|>"
-    return formatted_prompt
-def generate_response(model, tokenizer, prompt, conversation_history):
     """Generate model response"""
     try:
-        # Format prompt using TinyLlama's chat template
-        formatted_prompt = format_prompt(prompt, conversation_history[:-1])
-        # Tokenize input
-        inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=True, truncation=True)
-        # Move inputs to the same device as the model
-        device = next(model.parameters()).device
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-        # Calculate max new tokens
-        input_length = inputs["input_ids"].shape[1]
-        max_model_length = 1024
-        max_new_tokens = min(150, max_model_length - input_length)
-        # Generate response
-        outputs = model.generate(
-            inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            max_new_tokens=max_new_tokens,
             temperature=0.7,
             top_p=0.9,
-            pad_token_id=tokenizer.pad_token_id,
             do_sample=True,
-            min_length=10,
-            no_repeat_ngram_size=3,
-            eos_token_id=tokenizer.encode("</s>")[0]  # Set end token
-        )
-        # Decode response and extract only the assistant's message
-        full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
-        # Extract only the last assistant response
-        assistant_response = full_response.split("<|assistant|>")[-1].split("</s>")[0].strip()
-        return assistant_response if assistant_response else "I apologize, but I couldn't generate a proper response."
-    except RuntimeError as e:
-        if "out of memory" in str(e):
-            torch.cuda.empty_cache()
-            return "I apologize, but I ran out of memory. Please try a shorter message or clear the chat history."
-        else:
-            return f"An error occurred: {str(e)}"
 def main():
-    st.set_page_config(
-        page_title="LLM Chat Interface",
-        page_icon="🤖",
-        layout="wide"
-    )
-    # Add CSS to make the chat interface more compact
-    st.markdown("""
-        <style>
-        .stChat {
-            padding-top: 0rem;
-        }
-        .stChatMessage {
-            padding: 0.5rem;
-        }
-        </style>
-    """, unsafe_allow_html=True)
-    st.title("Chat with TinyLlama 🤖")
-    # Initialize session state for chat history
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = []
     # Initialize model (only once)
-    if "model" not in st.session_state:
-        with st.spinner("Loading the model... This might take a minute..."):
             try:
-                model, tokenizer = initialize_model()
-                st.session_state.model = model
                 st.session_state.tokenizer = tokenizer
-                st.success("Model loaded successfully!")
             except Exception as e:
                 st.error(f"Error loading model: {str(e)}")
                 return
@@ -151,7 +89,7 @@ def main():
             st.write(message["assistant"])
     # Chat input
-    if prompt := st.chat_input("What would you like to know?"):
         # Display user message
         with st.chat_message("user"):
             st.write(prompt)
@@ -163,7 +101,7 @@ def main():
                 st.session_state.chat_history.append(current_turn)
                 response = generate_response(
-                    st.session_state.model,
                     st.session_state.tokenizer,
                     prompt,
                     st.session_state.chat_history
@@ -172,23 +110,22 @@ def main():
                 st.write(response)
                 st.session_state.chat_history[-1]["assistant"] = response
-        # Manage context window
         if len(st.session_state.chat_history) > 5:
             st.session_state.chat_history = st.session_state.chat_history[-5:]
-    # Sidebar controls
     with st.sidebar:
-        st.title("Controls")
         if st.button("Clear Chat"):
             st.session_state.chat_history = []
             st.rerun()
         st.markdown("---")
         st.markdown("""
-        ### Model Info
-        - Using TinyLlama 1.1B Chat
-        - CPU optimized
-        - Context window: 1024 tokens
         """)
 if __name__ == "__main__":

 import streamlit as st
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import torch
 import os
 def initialize_model():
+    """Initialize a small and fast model for CPU"""
+    # Using a tiny model optimized for CPU
+    model_id = "facebook/opt-125m"  # Much smaller model (125M parameters)
     try:
+        # Initialize the pipeline directly - more efficient than loading model separately
+        pipe = pipeline(
+            "text-generation",
+            model=model_id,
             device_map="cpu",
+            model_kwargs={"low_cpu_mem_usage": True}
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        return pipe, tokenizer
     except Exception as e:
         print(f"Error loading model: {str(e)}")
         raise e
+def generate_response(pipe, tokenizer, prompt, conversation_history):
     """Generate model response"""
     try:
+        # Format conversation context
+        context = ""
+        for turn in conversation_history[-3:]:  # Only use last 3 turns for efficiency
+            context += f"Human: {turn['user']}\nAssistant: {turn['assistant']}\n"
+        # Create the full prompt
+        full_prompt = f"{context}Human: {prompt}\nAssistant:"
+        # Generate response with conservative parameters
+        response = pipe(
+            full_prompt,
+            max_new_tokens=50,  # Limit response length
             temperature=0.7,
             top_p=0.9,
+            num_return_sequences=1,
             do_sample=True,
+            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id
+        )[0]['generated_text']
+        # Extract only the assistant's response
+        try:
+            assistant_response = response.split("Assistant:")[-1].strip()
+            if not assistant_response:
+                return "I apologize, but I couldn't generate a proper response."
+            return assistant_response
+        except:
+            return response.split(prompt)[-1].strip()
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
 def main():
+    st.set_page_config(page_title="LLM Chat Interface", page_icon="🤖")
+    st.title("💬 Quick Chat Assistant")
+    # Initialize session state
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = []
+    if "model_loaded" not in st.session_state:
+        st.session_state.model_loaded = False
     # Initialize model (only once)
+    if not st.session_state.model_loaded:
+        with st.spinner("Loading the model... (this should take just a few seconds)"):
             try:
+                pipe, tokenizer = initialize_model()
+                st.session_state.pipe = pipe
                 st.session_state.tokenizer = tokenizer
+                st.session_state.model_loaded = True
             except Exception as e:
                 st.error(f"Error loading model: {str(e)}")
                 return
             st.write(message["assistant"])
     # Chat input
+    if prompt := st.chat_input("Ask me anything!"):
         # Display user message
         with st.chat_message("user"):
             st.write(prompt)
                 st.session_state.chat_history.append(current_turn)
                 response = generate_response(
+                    st.session_state.pipe,
                     st.session_state.tokenizer,
                     prompt,
                     st.session_state.chat_history
                 st.write(response)
                 st.session_state.chat_history[-1]["assistant"] = response
+        # Keep only last 5 turns
         if len(st.session_state.chat_history) > 5:
             st.session_state.chat_history = st.session_state.chat_history[-5:]
+    # Sidebar
     with st.sidebar:
         if st.button("Clear Chat"):
             st.session_state.chat_history = []
             st.rerun()
         st.markdown("---")
         st.markdown("""
+        ### Chat Info
+        - Using OPT-125M model
+        - Optimized for quick responses
+        - Best for short conversations
         """)
 if __name__ == "__main__":