shukdevdattaEX commited on
Commit
3a9e21d
Β·
verified Β·
1 Parent(s): 0b8a82b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +209 -0
app.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import openai
3
+ import base64
4
+ from PIL import Image
5
+ import io
6
+ import fitz # PyMuPDF for PDF handling
7
+
8
+
9
# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Extract the full text of a PDF document.

    Args:
        pdf_file: A filesystem path to the PDF, or an object with a
            ``.name`` attribute (e.g. a Gradio file upload wrapper).

    Returns:
        The concatenated text of every page, or an error message string
        when the file cannot be opened or read.
    """
    # Gradio's File component may hand us a tempfile-like wrapper; PyMuPDF
    # wants a path, so fall back to the object's .name when present.
    path = getattr(pdf_file, "name", pdf_file)
    try:
        # Context manager guarantees the document is closed even if a page
        # raises mid-iteration (the original leaked the handle on error).
        with fitz.open(path) as pdf_document:
            return "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"
21
+
22
+
23
# Generate MCQ quiz from PDF
def generate_mcq_quiz(pdf_content, num_questions, openai_api_key, model_choice):
    """Build a multiple-choice quiz from extracted document text.

    Returns the model's quiz text, or an error message string when no API
    key is supplied or the API call fails.
    """
    if not openai_api_key:
        return "Error: No API key provided."
    openai.api_key = openai_api_key

    # Truncate so the prompt stays within the model's context window.
    excerpt = pdf_content[:8000]
    prompt = f"""Based on the following document content, generate {num_questions} multiple-choice quiz questions.
For each question:
1. Write a clear question
2. Give 4 options (A, B, C, D)
3. Indicate the correct answer
4. Briefly explain why the answer is correct

Document:
{excerpt}
"""
    try:
        completion = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}],
        )
    except Exception as e:
        return f"Error generating quiz: {str(e)}"
    return completion.choices[0].message.content
47
+
48
+
49
# Convert image to base64
def get_base64_string_from_image(pil_image):
    """Encode a PIL image as a base64 PNG string (no data-URL prefix)."""
    png_buffer = io.BytesIO()
    pil_image.save(png_buffer, format="PNG")
    raw_png = png_buffer.getvalue()
    return base64.b64encode(raw_png).decode("utf-8")
54
+
55
+
56
# Transcribe audio
def transcribe_audio(audio, openai_api_key):
    """Run Whisper speech-to-text on an audio file path.

    Returns the transcription text, or an error message string when no API
    key is supplied or the request fails.
    """
    if not openai_api_key:
        return "Error: No API key provided."
    openai.api_key = openai_api_key
    try:
        with open(audio, 'rb') as audio_file:
            payload = io.BytesIO(audio_file.read())
        # The API infers the audio format from the file-like object's name.
        payload.name = 'audio.wav'
        result = openai.Audio.transcribe(file=payload, model="whisper-1")
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"
    return result.text
70
+
71
+
72
# Generate response for text/image/pdf
def generate_response(input_text, image, pdf_content, openai_api_key, reasoning_effort, model_choice):
    """Answer a text, image, or PDF-grounded question via the chat API.

    Args:
        input_text: The user's question (may be empty for image-only turns).
        image: Optional PIL image to include in the request.
        pdf_content: Previously extracted document text, prepended as context.
        openai_api_key: OpenAI API key; missing key returns an error string.
        reasoning_effort: Currently unused here — NOTE(review): consider
            forwarding to the API for o-series models; confirm SDK support.
        model_choice: Chat model name.

    Returns:
        The model's reply text, or an error message string on failure.
    """
    if not openai_api_key:
        return "Error: No API key provided."
    openai.api_key = openai_api_key

    if pdf_content and input_text:
        input_text = f"Based on the document below, answer the question:\n\n{input_text}\n\nDocument:\n{pdf_content}"

    if image is not None:
        # Bug fix: previously the base64 data URL was pasted into the plain
        # text content (which the model cannot decode as an image), and any
        # typed question was discarded. Use the chat vision content format
        # so the image and the question are both sent.
        image_b64 = get_base64_string_from_image(image)
        content = [
            {"type": "text", "text": input_text or "Describe this image."},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        ]
    else:
        content = input_text

    try:
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": content}],
            max_completion_tokens=2000
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error calling OpenAI API: {str(e)}"
93
+
94
+
95
# Chatbot logic
def chatbot(input_text, image, audio, pdf_file, openai_api_key, reasoning_effort, model_choice, pdf_content, num_quiz_questions, pdf_quiz_mode, history):
    """Handle one submission: route text/image/voice/PDF input to the model,
    or generate a quiz when quiz mode is on, and append the turn to history.

    Returns a 6-tuple (input_text, image, audio, pdf_file, pdf_content,
    history) with the input widgets cleared and the PDF text carried
    forward as Gradio state.
    """
    history = [] if history is None else history

    # Voice input is transcribed first and then treated as text.
    if audio:
        input_text = transcribe_audio(audio, openai_api_key)

    # A freshly uploaded PDF replaces any previously stored document text.
    new_pdf_content = extract_text_from_pdf(pdf_file) if pdf_file else pdf_content

    if pdf_quiz_mode:
        if new_pdf_content:
            quiz = generate_mcq_quiz(new_pdf_content, int(num_quiz_questions), openai_api_key, model_choice)
            history.append((f"📘 Generated {num_quiz_questions} quiz questions", quiz))
        else:
            history.append(("No PDF detected", "Please upload a PDF file first."))
    else:
        response = generate_response(input_text, image, new_pdf_content, openai_api_key, reasoning_effort, model_choice)
        # Pick a user-facing label for the turn based on what was provided.
        if input_text:
            label = input_text
        elif image:
            label = "🖼️ [Image Uploaded]"
        elif pdf_file:
            label = "📄 [PDF Uploaded]"
        else:
            label, response = "No input", "Please provide input."
        history.append((label, response))

    return "", None, None, None, new_pdf_content, history
125
+
126
+
127
# Reset all fields
def clear_history():
    """Reset every input widget, the stored PDF text, and the chat log."""
    cleared_text, cleared_widget = "", None
    return cleared_text, cleared_widget, cleared_widget, cleared_widget, "", []
130
+
131
+
132
# Extract text when PDF uploaded
def process_pdf(pdf_file):
    """Return the extracted text of an uploaded PDF, or "" when absent."""
    return "" if pdf_file is None else extract_text_from_pdf(pdf_file)
137
+
138
+
139
# Switch between input modes
def update_input_type(choice):
    """Toggle widget visibility for the selected input mode.

    Returns a 6-tuple of gr.update objects for (text box, image, audio,
    PDF upload, quiz slider, quiz checkbox); the checkbox value is forced
    on only for the PDF(QUIZ) mode. Unknown choices yield None, matching
    the original fall-through behavior.
    """
    # Visibility flags per mode: (text, image, audio, pdf, quiz slider).
    visibility = {
        "Text":      (True,  False, False, False, False),
        "Image":     (True,  True,  False, False, False),
        "Voice":     (False, False, True,  False, False),
        "PDF":       (True,  False, False, True,  False),
        "PDF(QUIZ)": (False, False, False, True,  True),
    }
    if choice not in visibility:
        return None
    widget_updates = tuple(gr.update(visible=flag) for flag in visibility[choice])
    return widget_updates + (gr.update(value=(choice == "PDF(QUIZ)")),)
151
+
152
+
153
# Build Gradio interface
def create_interface():
    """Assemble and return the Gradio Blocks UI for the multimodal chatbot."""
    with gr.Blocks() as demo:
        gr.Markdown("## 🧠 Multimodal Chatbot — Text | Image | Voice | PDF | Quiz")

        # Extracted PDF text, carried across turns as hidden state.
        pdf_content = gr.State("")

        openai_api_key = gr.Textbox(label="🔑 OpenAI API Key", type="password", placeholder="sk-...")

        input_type = gr.Radio(
            ["Text", "Image", "Voice", "PDF", "PDF(QUIZ)"],
            label="Choose Input Type",
            value="Text"
        )

        # One widget per input mode; visibility is driven by update_input_type.
        input_text = gr.Textbox(label="Enter your question or text", lines=2, visible=True)
        image_input = gr.Image(label="Upload Image", type="pil", visible=False)
        audio_input = gr.Audio(label="Upload/Record Audio", type="filepath", visible=False)
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], visible=False)
        quiz_questions_slider = gr.Slider(1, 20, value=5, step=1, label="Number of Quiz Questions", visible=False)
        quiz_mode = gr.Checkbox(label="Quiz Mode", visible=False, value=False)

        with gr.Row():
            reasoning_effort = gr.Dropdown(["low", "medium", "high"], value="medium", label="Reasoning Effort")
            model_choice = gr.Dropdown(["o1", "o3-mini"], value="o1", label="Model")

        submit_btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear Chat")

        chat_history = gr.Chatbot(label="Chat History")

        # Show/hide widgets when the input mode changes.
        input_type.change(
            fn=update_input_type,
            inputs=[input_type],
            outputs=[input_text, image_input, audio_input, pdf_input, quiz_questions_slider, quiz_mode]
        )

        # Extract document text as soon as a PDF is uploaded.
        pdf_input.change(fn=process_pdf, inputs=[pdf_input], outputs=[pdf_content])

        # Main submission: run the chatbot turn and clear the inputs.
        submit_btn.click(
            fn=chatbot,
            inputs=[input_text, image_input, audio_input, pdf_input, openai_api_key, reasoning_effort, model_choice, pdf_content, quiz_questions_slider, quiz_mode, chat_history],
            outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history]
        )

        # Wipe inputs, the stored PDF text, and the chat log.
        clear_btn.click(fn=clear_history, inputs=[], outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history])

    return demo
205
+
206
+
207
if __name__ == "__main__":
    # Build the UI and start the local Gradio server.
    create_interface().launch()