import gradio as gr


# Placeholder for the prediction function
def predict(mode, text, image_path):
    """
    Placeholder for the final prediction logic.

    Returns a hardcoded {category: confidence} dict, which is the format
    gr.Label expects; the component displays the highest-scoring entry as
    the predicted category. Only the top confidence varies by mode, to
    demonstrate the output. `text` and `image_path` are ignored until a
    real model is wired in.
    """
    if mode == "Multimodal":
        top_confidence = 0.95
    elif mode == "Text Only":
        top_confidence = 0.92
    elif mode == "Image Only":
        top_confidence = 0.88
    else:
        top_confidence = 0.45

    # Dummy scores; gr.Label does not require them to sum to 1.
    return {
        "abcat0100000": 0.05,
        "abcat0200000": 0.10,
        "abcat0300000": 0.20,
        "abcat0400000": top_confidence,
        "abcat0500000": 0.20,
    }

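
# --- Hypothetical model sketches (not used by the placeholder UI) ----------
# A minimal sketch of the MLP head described in the "About" tab: a small
# classifier over the concatenated text and image embeddings. The hidden
# width, dropout rate, and five-way output are illustrative assumptions,
# not the trained model's actual hyperparameters. 384 is the MiniLM-L6
# embedding size; 768 assumes the Tiny variant of ConvNeXtV2. torch is
# imported lazily so the placeholder app still runs without it installed.
def build_mlp_head(text_dim=384, image_dim=768, hidden_dim=512, num_classes=5):
    import torch.nn as nn

    return nn.Sequential(
        nn.Linear(text_dim + image_dim, hidden_dim),
        nn.ReLU(),
        nn.Dropout(0.2),
        nn.Linear(hidden_dim, num_classes),
    )
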

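# A sketch of how `predict` could look once real models are wired in. It
# assumes sentence-transformers and timm provide the two encoders named in
# the "About" tab, that missing modalities are zero-filled (one common
# convention; the real model may differ), and that trained head weights
# exist at "mlp_head.pt" (a hypothetical path). Imports stay inside the
# function so the app runs without these optional dependencies.
def predict_with_models(mode, text, image_path):
    """Hypothetical end-to-end prediction; not wired into the UI below."""
    import numpy as np
    import torch

    text_vec = np.zeros(384, dtype=np.float32)   # MiniLM-L6 embedding size
    image_vec = np.zeros(768, dtype=np.float32)  # ConvNeXtV2-Tiny feature size

    if mode in ("Multimodal", "Text Only"):
        from sentence_transformers import SentenceTransformer

        text_vec = SentenceTransformer("all-MiniLM-L6-v2").encode(text)

    if mode in ("Multimodal", "Image Only"):
        import timm
        from PIL import Image

        encoder = timm.create_model(
            "convnextv2_tiny", pretrained=True, num_classes=0
        ).eval()
        cfg = timm.data.resolve_model_data_config(encoder)
        transform = timm.data.create_transform(**cfg, is_training=False)
        batch = transform(Image.open(image_path).convert("RGB")).unsqueeze(0)
        with torch.no_grad():
            image_vec = encoder(batch).squeeze(0).numpy()

    head = build_mlp_head()
    head.load_state_dict(torch.load("mlp_head.pt"))  # hypothetical checkpoint
    head.eval()
    features = torch.from_numpy(np.concatenate([text_vec, image_vec])).unsqueeze(0)
    with torch.no_grad():
        probs = torch.softmax(head(features), dim=-1).squeeze(0)

    categories = [
        "abcat0100000",
        "abcat0200000",
        "abcat0300000",
        "abcat0400000",
        "abcat0500000",
    ]
    return {cat: float(p) for cat, p in zip(categories, probs)}

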
# Function to update input visibility based on mode selection
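# Note: in recent Gradio versions, returning a fresh component instance from
# an event handler patches that component's properties (here, `visible`)
# rather than recreating it; the generic `gr.update(visible=...)` form is
# equivalent.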
def update_inputs(mode):
    if mode == "Multimodal":
        return gr.Textbox(visible=True), gr.Image(visible=True)
    elif mode == "Text Only":
        return gr.Textbox(visible=True), gr.Image(visible=False)
    elif mode == "Image Only":
        return gr.Textbox(visible=False), gr.Image(visible=True)
    else:  # Default case
        return gr.Textbox(visible=True), gr.Image(visible=True)


# Gradio Interface using Blocks
with gr.Blocks(title="Multimodal Product Classification") as demo:
    with gr.Tabs():
        with gr.TabItem("App"):
            gr.Markdown("# Multimodal Product Classifier")
            gr.Markdown("Classify products using either text, images, or both.")

            with gr.Row():
                with gr.Column(scale=1):
                    with gr.Column(variant="panel"):
                        gr.Markdown("### βš™οΈ Classification Inputs")

                        mode_radio = gr.Radio(
                            choices=["Multimodal", "Text Only", "Image Only"],
                            value="Multimodal",
                            label="Choose Classification Mode",
                        )

                        text_input = gr.Textbox(
                            label="Product Description",
                            placeholder="e.g., Apple iPhone 15 Pro Max 256GB",
                        )
                        image_input = gr.Image(
                            label="Product Image", type="filepath", visible=True
                        )

                    classify_btn = gr.Button("πŸš€ Classify Product", variant="primary")

                with gr.Column(scale=1):
                    with gr.Column(variant="panel"):
                        gr.Markdown("### πŸ“Š Classification Results")

                        output_label = gr.Label(
                            label="Predicted Category", num_top_classes=5
                        )

                    with gr.Accordion("How to use this demo", open=False):
                        gr.Markdown(
                            """
                            This demo classifies a product based on its description and image.
                            - **Multimodal:** Uses both text and image for the most accurate prediction.
                            - **Text Only:** Uses only the product description.
                            - **Image Only:** Uses only the product image.
                            """
                        )

        with gr.TabItem("About"):
            gr.Markdown(
                """
                ### About the Project
                This project demonstrates a multimodal product classification system trained on data from Best Buy. It uses a multilayer perceptron (MLP) trained on pre-generated embeddings from a text encoder (MiniLM-L6) and an image encoder (ConvNeXtV2).
                """
            )

        with gr.TabItem("Architecture"):
            gr.Markdown(
                """
                ### Model Architecture
                This section would contain details about the MLP architecture, the embedding models used, and a diagram explaining the data flow.
                """
            )

    # Event listeners for conditional rendering
    mode_radio.change(
        fn=update_inputs, inputs=mode_radio, outputs=[text_input, image_input]
    )

    # Event listener for the classify button
    classify_btn.click(
        fn=predict, inputs=[mode_radio, text_input, image_input], outputs=output_label
    )

if __name__ == "__main__":
    demo.launch()