import gradio as gr
from openai import OpenAI
import base64
from PIL import Image
import io
import fitz  # PyMuPDF for PDF handling
# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    try:
        text = ""
        pdf_document = fitz.open(pdf_file)
        for page in pdf_document:
            text += page.get_text()
        pdf_document.close()
        return text
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"
# Generate MCQ quiz from PDF
def generate_mcq_quiz(pdf_content, num_questions, openai_api_key, model_choice):
    if not openai_api_key:
        return "Error: No API key provided."
    client = OpenAI(api_key=openai_api_key)
    # Truncate the document so the prompt stays inside the context window.
    limited_content = pdf_content[:8000]
    prompt = f"""Based on the following document content, generate {num_questions} multiple-choice quiz questions.
For each question:
1. Write a clear question
2. Give 4 options (A, B, C, D)
3. Indicate the correct answer
4. Briefly explain why the answer is correct
Document:
{limited_content}
"""
    try:
        response = client.chat.completions.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating quiz: {str(e)}"
# Convert image to base64
def get_base64_string_from_image(pil_image):
    buffered = io.BytesIO()
    pil_image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
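# Downstream, generate_response wraps this string as a data URL, e.g.
#   f"data:image/png;base64,{get_base64_string_from_image(img)}"
# which is the inline-image form accepted by the Chat Completions API.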
# Transcribe audio with Whisper
def transcribe_audio(audio, openai_api_key):
    if not openai_api_key:
        return "Error: No API key provided."
    client = OpenAI(api_key=openai_api_key)
    try:
        with open(audio, "rb") as f:
            audio_bytes = f.read()
        # The API infers the container format from the file name, so give
        # the in-memory buffer one.
        file_obj = io.BytesIO(audio_bytes)
        file_obj.name = "audio.wav"
        transcription = client.audio.transcriptions.create(model="whisper-1", file=file_obj)
        return transcription.text
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"
# Generate response for text/image/pdf
def generate_response(input_text, image, pdf_content, openai_api_key, reasoning_effort, model_choice):
    if not openai_api_key:
        return "Error: No API key provided."
    client = OpenAI(api_key=openai_api_key)
    if pdf_content and input_text:
        input_text = f"Based on the document below, answer the question:\n\n{input_text}\n\nDocument:\n{pdf_content}"
    if image:
        # Send the image as a structured image_url content part; a bare base64
        # string pasted into the text field is not interpreted as an image.
        image_b64 = get_base64_string_from_image(image)
        content = [
            {"type": "text", "text": input_text or "Describe this image."},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        ]
    else:
        content = input_text
    try:
        response = client.chat.completions.create(
            model=model_choice,
            messages=[{"role": "user", "content": content}],
            reasoning_effort=reasoning_effort,  # honored by o-series reasoning models
            max_completion_tokens=2000
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error calling OpenAI API: {str(e)}"
# Chatbot logic: route each input type and update the chat history
def chatbot(input_text, image, audio, pdf_file, openai_api_key, reasoning_effort, model_choice, pdf_content, num_quiz_questions, pdf_quiz_mode, history):
    if history is None:
        history = []
    # Voice input replaces any typed text with its transcription.
    if audio:
        input_text = transcribe_audio(audio, openai_api_key)
    new_pdf_content = pdf_content
    if pdf_file:
        new_pdf_content = extract_text_from_pdf(pdf_file)
    if pdf_quiz_mode:
        if new_pdf_content:
            quiz = generate_mcq_quiz(new_pdf_content, int(num_quiz_questions), openai_api_key, model_choice)
            history.append((f"📘 Generated {num_quiz_questions} quiz questions", quiz))
        else:
            history.append(("No PDF detected", "Please upload a PDF file first."))
    else:
        response = generate_response(input_text, image, new_pdf_content, openai_api_key, reasoning_effort, model_choice)
        if input_text:
            history.append((input_text, response))
        elif image:
            history.append(("🖼️ [Image Uploaded]", response))
        elif pdf_file:
            history.append(("📄 [PDF Uploaded]", response))
        else:
            history.append(("No input", "Please provide input."))
    return "", None, None, None, new_pdf_content, history
# Reset all fields
def clear_history():
    return "", None, None, None, "", []

# Extract text as soon as a PDF is uploaded
def process_pdf(pdf_file):
    if pdf_file is None:
        return ""
    return extract_text_from_pdf(pdf_file)
# Switch between input modes
def update_input_type(choice):
    if choice == "Text":
        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(value=False)
    elif choice == "Image":
        return gr.update(visible=True), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(value=False)
    elif choice == "Voice":
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(value=False)
    elif choice == "PDF":
        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value=False)
    elif choice == "PDF(QUIZ)":
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=True), gr.update(value=True)
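# Each branch returns six gr.update(...) values that map positionally onto the
# outputs of input_type.change below: input_text, image_input, audio_input,
# pdf_input, quiz_questions_slider, quiz_mode. Adding a new mode means
# extending every branch (and that outputs list) in the same order.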
# Build Gradio interface
def create_interface():
    with gr.Blocks() as demo:
        gr.Markdown("## 🧠 Multimodal Chatbot: Text | Image | Voice | PDF | Quiz")
        pdf_content = gr.State("")
        openai_api_key = gr.Textbox(label="🔑 OpenAI API Key", type="password", placeholder="sk-...")
        input_type = gr.Radio(
            ["Text", "Image", "Voice", "PDF", "PDF(QUIZ)"],
            label="Choose Input Type",
            value="Text"
        )
        input_text = gr.Textbox(label="Enter your question or text", lines=2, visible=True)
        image_input = gr.Image(label="Upload Image", type="pil", visible=False)
        audio_input = gr.Audio(label="Upload/Record Audio", type="filepath", visible=False)
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], visible=False)
        quiz_questions_slider = gr.Slider(1, 20, value=5, step=1, label="Number of Quiz Questions", visible=False)
        quiz_mode = gr.Checkbox(label="Quiz Mode", visible=False, value=False)
        with gr.Row():
            reasoning_effort = gr.Dropdown(["low", "medium", "high"], value="medium", label="Reasoning Effort")
            model_choice = gr.Dropdown(["o1", "o3-mini"], value="o1", label="Model")
        submit_btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear Chat")
        chat_history = gr.Chatbot(label="Chat History")
        # Input type handling
        input_type.change(
            fn=update_input_type,
            inputs=[input_type],
            outputs=[input_text, image_input, audio_input, pdf_input, quiz_questions_slider, quiz_mode]
        )
        # PDF upload processing
        pdf_input.change(fn=process_pdf, inputs=[pdf_input], outputs=[pdf_content])
        # Submit
        submit_btn.click(
            fn=chatbot,
            inputs=[input_text, image_input, audio_input, pdf_input, openai_api_key, reasoning_effort, model_choice, pdf_content, quiz_questions_slider, quiz_mode, chat_history],
            outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history]
        )
        # Clear
        clear_btn.click(fn=clear_history, inputs=[], outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history])
    return demo
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()