import gradio as gr
import spaces
from transformers import pipeline, AutoTokenizer
import torch
from typing import List, Dict

# Global caches for loaded pipelines and tokenizers
model_cache = {}
tokenizer_cache = {}

# Available models
AVAILABLE_MODELS = {
    "Daedalus-1-2B": "NoemaResearch/Daedalus-1-2B",
    "Daedalus-1-8B": "NoemaResearch/Daedalus-1-8B",
}

# Models that need special token handling to avoid repetition issues
MODELS_NEEDING_SPECIAL_HANDLING = {"Daedalus-1-8B"}
def initialize_model(model_name):
    global model_cache, tokenizer_cache

    if model_name not in AVAILABLE_MODELS:
        raise ValueError(f"Model {model_name} not found in available models")

    model_id = AVAILABLE_MODELS[model_name]

    # Load and cache the tokenizer/pipeline on first use
    if model_id not in model_cache:
        try:
            # Load the tokenizer separately so the chat template can be applied explicitly
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
                model_id,
                trust_remote_code=True
            )
            model_cache[model_id] = pipeline(
                "text-generation",
                model=model_id,
                tokenizer=tokenizer_cache[model_id],
                torch_dtype=torch.float16,
                device_map="auto",
                trust_remote_code=True
            )
        except Exception:
            # Fall back to CPU (float32) if GPU loading fails
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
                model_id,
                trust_remote_code=True
            )
            model_cache[model_id] = pipeline(
                "text-generation",
                model=model_id,
                tokenizer=tokenizer_cache[model_id],
                torch_dtype=torch.float32,
                device_map="cpu",
                trust_remote_code=True
            )

    return model_cache[model_id], tokenizer_cache[model_id]
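
# Hypothetical warm-up call: loading the default model once at import time would
# move the download/load cost out of the first chat request. Left commented out
# to preserve the lazy-loading behaviour of initialize_model above.
# _default_pipe, _default_tokenizer = initialize_model("Daedalus-1-2B")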
def format_conversation_with_template(messages: List[Dict], tokenizer) -> str:
    """Apply the tokenizer's chat template, falling back to manual formatting."""
    if hasattr(tokenizer, 'chat_template') and tokenizer.chat_template:
        try:
            # Use the tokenizer's apply_chat_template method
            formatted = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
            return formatted
        except Exception as e:
            print(f"Chat template application failed: {e}")
            # Fall through to the manual formatting below

    # Manual fallback formatting using the model's special tokens
    bos_token = "<[begin▁of▁sentence]>"
    eos_token = "<[end▁of▁sentence]>"

    # Start with the system message
    formatted = f"{bos_token}system\nYou are an AI Coding model called Daedalus, developed by Noema Research{eos_token}"

    # Append each conversation turn
    for msg in messages:
        role = msg.get('role', 'user')
        content = msg.get('content', '').strip()
        formatted += f"{bos_token}{role}\n{content}{eos_token}"

    # Add the generation prompt
    formatted += f"{bos_token}assistant\n"
    return formatted
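
# Illustrative output of the manual fallback for a single user turn "Hello"
# (the "Hello" turn is made up; the token strings match the constants above).
# Shown on separate lines for readability -- the actual string has no newlines
# between turns:
#   <[begin▁of▁sentence]>system\nYou are an AI Coding model called Daedalus, developed by Noema Research<[end▁of▁sentence]>
#   <[begin▁of▁sentence]>user\nHello<[end▁of▁sentence]>
#   <[begin▁of▁sentence]>assistant\n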
def generate_response(message, history, model_name, max_length=512, temperature=0.7, top_p=0.9):
    """Generate a response using the selected model."""
    try:
        model_pipe, tokenizer = initialize_model(model_name)
    except Exception as e:
        return f"Error loading model {model_name}: {str(e)}"

    # Rebuild the conversation history as role/content messages
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    try:
        # Format the conversation using the chat template
        formatted_prompt = format_conversation_with_template(messages, tokenizer)

        # Different generation parameters based on model
        if model_name in MODELS_NEEDING_SPECIAL_HANDLING:
            # The 8B model needs special token handling to prevent repetition
            stop_tokens = [
                "<[end▁of▁sentence]>",    # EOS token
                "<[begin▁of▁sentence]>",  # BOS token (shouldn't appear mid-generation)
                "user\n",                 # Stop if the model tries to continue the conversation
                "system\n",               # Stop if the model tries to add system messages
                "\nuser",                 # Alternative format
                "\nsystem"                # Alternative format
            ]

            response = model_pipe(
                formatted_prompt,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=1,   # PAD token ID from the model config
                eos_token_id=2,   # EOS token ID from the model config
                bos_token_id=0,   # BOS token ID from the model config
                return_full_text=False,
                repetition_penalty=1.1,       # Reduce loops
                stop_sequence=stop_tokens[0]  # Primary stop token
            )
        else:
            # 2B model - standard generation without special handling
            response = model_pipe(
                formatted_prompt,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                return_full_text=False,
                repetition_penalty=1.05  # Light repetition penalty
            )

        if isinstance(response, list) and len(response) > 0:
            generated_text = response[0]['generated_text']
        else:
            generated_text = str(response)

        # Clean up the response
        assistant_response = str(generated_text).strip()

        # Apply different cleanup based on model
        if model_name in MODELS_NEEDING_SPECIAL_HANDLING:
            # More aggressive cleanup for the 8B model
            stop_tokens = [
                "<[end▁of▁sentence]>", "<[begin▁of▁sentence]>",
                "user\n", "system\n", "\nuser", "\nsystem"
            ]
            for stop_token in stop_tokens:
                if stop_token in assistant_response:
                    assistant_response = assistant_response.split(stop_token)[0].strip()

            # Additional cleanup for common repetition patterns
            lines = assistant_response.split('\n')
            cleaned_lines = []
            for line in lines:
                if line.strip() and not line.strip().startswith(('user', 'assistant', 'system')):
                    cleaned_lines.append(line)
            assistant_response = '\n'.join(cleaned_lines).strip()
        else:
            # Standard cleanup for the 2B model: drop a leading "assistant\n" if present
            if assistant_response.startswith("assistant\n"):
                assistant_response = assistant_response[len("assistant\n"):].strip()

        return assistant_response if assistant_response else "I apologize, but I couldn't generate a proper response. Please try again."

    except Exception as e:
        return f"Error generating response: {str(e)}"
def create_interface():
    with gr.Blocks(title="Daedalus Chat", theme=gr.themes.Base(primary_hue="green")) as demo:
        gr.Markdown("""
        # 🟢 Daedalus Chat Interface

        Chat with **Daedalus models** by Noema Research.
        """)

        # Model selection dropdown
        model_dropdown = gr.Dropdown(
            choices=list(AVAILABLE_MODELS.keys()),
            value="Daedalus-1-2B",  # Default to the 2B model
            label="Select Model",
            info="Choose between Daedalus-1-2B (faster) or Daedalus-1-8B (more capable)"
        )

        chatbot = gr.Chatbot(
            height=400,
            placeholder="Start chatting with Daedalus...",
            label="Chat"
        )

        msg = gr.Textbox(
            placeholder="Type your message here...",
            label="Message",
            lines=2
        )

        with gr.Row():
            submit_btn = gr.Button("Send", variant="primary")
            clear_btn = gr.Button("Clear Chat", variant="secondary")

        with gr.Accordion("Advanced Settings", open=False):
            max_length = gr.Slider(
                minimum=200,
                maximum=4096,  # Reduced from 8192 to prevent memory issues
                value=1024,    # Reduced default from 2048
                step=50,
                label="Max New Tokens",
                info="Maximum number of new tokens to generate"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Controls randomness in generation"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.1,
                label="Top P",
                info="Controls diversity via nucleus sampling"
            )

        def user_message(message, history):
            # Append the user's turn and clear the textbox
            return "", history + [[message, None]]

        def bot_response(history, selected_model, max_len, temp, top_p):
            if history:
                user_message = history[-1][0]
                bot_message = generate_response(
                    user_message,
                    history[:-1],
                    selected_model,  # Use the model chosen in the dropdown
                    max_len,
                    temp,
                    top_p
                )
                history[-1][1] = bot_message
            return history

        msg.submit(user_message, [msg, chatbot], [msg, chatbot]).then(
            bot_response, [chatbot, model_dropdown, max_length, temperature, top_p], chatbot
        )
        submit_btn.click(user_message, [msg, chatbot], [msg, chatbot]).then(
            bot_response, [chatbot, model_dropdown, max_length, temperature, top_p], chatbot
        )
        clear_btn.click(lambda: None, None, chatbot, queue=False)

        gr.Markdown("""
        ---
        ### About Daedalus Models

        **Daedalus-1-2B:** Faster, lightweight model for quick responses and basic coding tasks.

        **Daedalus-1-8B:** More capable model with advanced reasoning, fine-tuned for structured outputs,
        debugging, and long-context reasoning (up to ~64K tokens).

        Both models are optimized for:
        - Conversational AI
        - Code generation & debugging
        - Structured JSON/function outputs
        - Multi-step reasoning
        """)

    return demo
# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True)
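
# Typical dependencies for running this app (assumed, not pinned here):
# gradio, spaces, transformers, torch, and accelerate (required for device_map="auto").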