import os

import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np
import openai
# Set up the OpenAI API key (read from the environment; never hard-code secrets)
openai.api_key = os.environ["OPENAI_API_KEY"]
# Download NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
model = AutoModel.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
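model.eval()  # inference only: disable dropout so embeddings are deterministic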
# Load the Ubuntu manual from a .txt file
manual_path = "ubuntu_manual.txt"
with open(manual_path, "r", encoding="utf-8") as file:
    full_text = file.read()
# Function to chunk the text into smaller pieces (word-budgeted, sentence-aligned).
# Note: the original compared len(current_chunk) -- the number of sentences --
# against the word budget; a running word count is what was intended.
def chunk_text(text, chunk_size=500):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_len = 0  # running word count of the current chunk
    for sentence in sentences:
        sentence_len = len(sentence.split())
        if current_len + sentence_len <= chunk_size:
            current_chunk.append(sentence)
            current_len += sentence_len
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_len = sentence_len
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
# Apply chunking to the entire text
manual_chunks = chunk_text(full_text, chunk_size=500)
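# Quick sanity check (not in the original script): confirm the manual was split
print(f"Chunked manual into {len(manual_chunks)} pieces")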
# Function to generate embeddings for each chunk
def embed_text(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # CLS token representation
    return embeddings
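# Note: embedding every chunk in a single forward pass can exhaust memory on a
# large manual. A minimal batched variant (illustrative sketch; embed_text_batched
# is not part of the original script):
def embed_text_batched(texts, batch_size=32):
    parts = [embed_text(texts[i:i + batch_size]) for i in range(0, len(texts), batch_size)]
    return np.vstack(parts)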
# Generate embeddings for the chunks
chunk_embeddings = embed_text(manual_chunks)

# Ensure the embeddings are a float32 NumPy array (FAISS expects float32)
chunk_embeddings_np = np.array(chunk_embeddings, dtype=np.float32)

# Create a FAISS index and add the embeddings
dimension = chunk_embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings_np)
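# Sanity check: every chunk embedding should now be in the index
assert index.ntotal == len(manual_chunks)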
# Function to retrieve relevant chunks for a user query, logging indices and distances
def retrieve_chunks(query, k=5):
    query_embedding = embed_text([query])
    distances, indices = index.search(query_embedding, k)
    # FAISS pads missing results with -1, so keep only in-range indices,
    # pairing each index with its own distance
    valid = [(i, d) for i, d in zip(indices[0], distances[0]) if 0 <= i < len(manual_chunks)]
    relevant_chunks = [manual_chunks[i] for i, _ in valid]
    # Print indices and distances
    for idx, dist in valid:
        print(f"Index: {idx}, Distance: {dist}")
    return relevant_chunks, indices[0], distances[0]
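# Example (illustrative): inspect retrieval quality for a sample query before
# wiring up the UI. Local-only, no API call; uncomment to run.
# top_chunks, top_idx, top_dist = retrieve_chunks("How do I install software?", k=3)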
# Function to perform RAG: retrieve chunks and generate a response using GPT-3.5
def rag_response_gpt3_5(query, k=3, max_tokens=150):
    relevant_chunks, indices, distances = retrieve_chunks(query, k=k)
    if not relevant_chunks:
        return "Sorry, I couldn't find relevant information."

    # Combine the query with the retrieved chunks
    augmented_input = query + "\n" + "\n".join(relevant_chunks)

    # Cap the prompt length using the local tokenizer (note: this is MiniLM's
    # tokenizer, not GPT-3.5's, so the 512-token cap is only approximate)
    input_ids = tokenizer(augmented_input, return_tensors="pt").input_ids[0]
    if len(input_ids) > 512:
        input_ids = input_ids[:512]
        augmented_input = tokenizer.decode(input_ids, skip_special_tokens=True)

    # Chat Completions call via the pre-1.0 openai SDK interface
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": augmented_input}
        ],
        max_tokens=max_tokens,
        temperature=0.7
    )
    return response.choices[0].message["content"].strip()
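# Example (illustrative; would call the OpenAI API and incur cost):
# print(rag_response_gpt3_5("How do I update Ubuntu from the terminal?"))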
# Chatbot callback: maintains conversation context via Gradio state
def chatbot(query, history):
    if history is None:
        history = []
    # Retrieve relevant chunks here only to log indices and distances;
    # rag_response_gpt3_5 performs its own retrieval with its own k
    relevant_chunks, indices, distances = retrieve_chunks(query)
    print(f"Retrieved Indices: {indices}")
    print(f"Retrieved Distances: {distances}")
    response = rag_response_gpt3_5(query)
    history.append((query, response))
    # Render the full conversation as a single transcript string
    chat_history = ""
    for user_input, bot_response in history:
        chat_history += f"User: {user_input}\nBot: {bot_response}\n\n"
    return chat_history, history
# Create the Gradio interface ("state" carries the chat history between calls)
iface = gr.Interface(
    fn=chatbot,
    inputs=["text", "state"],
    outputs=["text", "state"],
    title="Ubuntu Manual Chatbot",
    description="Ask me anything about the Ubuntu manual.",
)

# Launch the app
if __name__ == "__main__":
    iface.launch()