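"""Streamlit chat interface that runs a small Hugging Face text-generation
model on CPU.

Assumed dependencies (not pinned by the source): streamlit, transformers,
torch, and accelerate (required by the pipeline's device_map argument).
"""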
import streamlit as st
from transformers import pipeline, AutoTokenizer
def initialize_model():
    """Initialize a small and fast model for CPU."""
    # Use a tiny model optimized for CPU (125M parameters); this matches
    # the "Using OPT-125M model" note in the sidebar below
    model_id = "facebook/opt-125m"
    # model_id = "GEB-AGI/geb-1.3b"  # larger alternative, much slower on CPU
    try:
        # Initialize the pipeline directly - more efficient than loading
        # the model and tokenizer separately
        pipe = pipeline(
            "text-generation",
            model=model_id,
            device_map="cpu",
            model_kwargs={"low_cpu_mem_usage": True},
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        return pipe, tokenizer
    except Exception as e:
        print(f"Error loading model: {e}")
        raise
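# Design note: the app loads the pipeline once and stashes it in
# st.session_state (see main below); Streamlit's st.cache_resource decorator
# is a common alternative that would also share the model across sessions.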
def generate_response(pipe, tokenizer, prompt, conversation_history):
    """Generate a model response for the given prompt."""
    try:
        # Format conversation context from the most recent turns
        context = ""
        for turn in conversation_history[-3:]:  # only last 3 turns for efficiency
            context += f"Human: {turn['user']}\nAssistant: {turn['assistant']}\n"

        # Create the full prompt
        full_prompt = f"{context}Human: {prompt}\nAssistant:"

        # Generate a response with conservative sampling parameters
        response = pipe(
            full_prompt,
            max_new_tokens=50,  # limit response length
            temperature=0.7,
            top_p=0.9,
            num_return_sequences=1,
            do_sample=True,
            # fall back to EOS when the tokenizer has no pad token
            # ("is not None" matters: a pad token id of 0 is falsy)
            pad_token_id=(
                tokenizer.pad_token_id
                if tokenizer.pad_token_id is not None
                else tokenizer.eos_token_id
            ),
        )[0]["generated_text"]

        # Extract only the assistant's reply after the last "Assistant:" marker
        # (str.split cannot raise here, so no inner try/except is needed)
        assistant_response = response.split("Assistant:")[-1].strip()
        if not assistant_response:
            return "I apologize, but I couldn't generate a proper response."
        return assistant_response
    except Exception as e:
        return f"An error occurred: {e}"
def main():
    st.set_page_config(page_title="LLM Chat Interface", page_icon="🤖")
    st.title("💬 Quick Chat Assistant")

    # Initialize session state
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "model_loaded" not in st.session_state:
        st.session_state.model_loaded = False

    # Initialize the model (only once per session)
    if not st.session_state.model_loaded:
        with st.spinner("Loading the model... (this should take just a few seconds)"):
            try:
                pipe, tokenizer = initialize_model()
                st.session_state.pipe = pipe
                st.session_state.tokenizer = tokenizer
                st.session_state.model_loaded = True
            except Exception as e:
                st.error(f"Error loading model: {e}")
                return
    # Display prior chat messages
    for message in st.session_state.chat_history:
        with st.chat_message("user"):
            st.write(message["user"])
        with st.chat_message("assistant"):
            st.write(message["assistant"])

    # Chat input
    if prompt := st.chat_input("Ask me anything!"):
        # Display the user message
        with st.chat_message("user"):
            st.write(prompt)
        # Generate and display the assistant response
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                response = generate_response(
                    st.session_state.pipe,
                    st.session_state.tokenizer,
                    prompt,
                    st.session_state.chat_history,
                )
                st.write(response)
        # Append the completed turn only after generating, so the
        # in-progress turn doesn't leak into the model's context
        st.session_state.chat_history.append({"user": prompt, "assistant": response})
        # Keep only the last 5 turns
        if len(st.session_state.chat_history) > 5:
            st.session_state.chat_history = st.session_state.chat_history[-5:]
    # Sidebar
    with st.sidebar:
        if st.button("Clear Chat"):
            st.session_state.chat_history = []
            st.rerun()
        st.markdown("---")
        # Build the info text without leading indentation so markdown
        # doesn't render it as a code block
        st.markdown(
            "### Chat Info\n"
            "- Using OPT-125M model\n"
            "- Optimized for quick responses\n"
            "- Best for short conversations"
        )


if __name__ == "__main__":
    main()
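# To try the app locally (assuming the file is saved as app.py and the
# dependencies listed in the module docstring are installed):
#
#     streamlit run app.py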