File size: 4,724 Bytes
d7c8166
 
 
0e07292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7c8166
0e07292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7c8166
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import gradio as gr


# Placeholder prediction backend; the real model will replace the sample scores.
def predict(mode, text, image_path):
    """Return category-confidence scores for the gr.Label component.

    Args:
        mode: One of "Multimodal", "Text Only", or "Image Only" (from the radio).
        text: Product description string (unused by this placeholder).
        image_path: Filepath of the uploaded image (unused by this placeholder).

    Returns:
        dict mapping Best Buy category id -> confidence, or an empty dict
        when *mode* is unrecognized (e.g. nothing selected).
    """
    # Hardcoded sample distributions, one per classification mode.
    # These stand in for real model output during UI development.
    sample_scores = {
        "Multimodal": {
            "abcat0100000": 0.05,
            "abcat0200000": 0.10,
            "abcat0300000": 0.20,
            "abcat0400000": 0.45,
            "abcat0500000": 0.20,
        },
        "Text Only": {
            "abcat0100000": 0.08,
            "abcat0200000": 0.15,
            "abcat0300000": 0.25,
            "abcat0400000": 0.35,
            "abcat0500000": 0.17,
        },
        "Image Only": {
            "abcat0100000": 0.10,
            "abcat0200000": 0.20,
            "abcat0300000": 0.30,
            "abcat0400000": 0.25,
            "abcat0500000": 0.15,
        },
    }
    # Dispatch on mode; unknown modes yield an empty dict (gr.Label shows nothing).
    return sample_scores.get(mode, {})


# Toggles which input widgets are shown when the classification mode changes.
def update_inputs(mode):
    """Return updated (Textbox, Image) components with visibility set for *mode*.

    "Text Only" hides the image input, "Image Only" hides the text input, and
    every other value (including "Multimodal" and the default case) shows both.
    """
    # Lookup table of mode -> (text visible, image visible); anything not
    # listed falls back to showing both inputs.
    visibility = {
        "Text Only": (True, False),
        "Image Only": (False, True),
    }
    show_text, show_image = visibility.get(mode, (True, True))
    return gr.Textbox(visible=show_text), gr.Image(visible=show_image)


# Gradio Interface using Blocks.
# Layout: a tabbed page ("App" / "About" / "Architecture"); the App tab has a
# two-column row — inputs on the left, results on the right.
with gr.Blocks(title="Multimodal Product Classification") as demo:
    with gr.Tabs():
        with gr.TabItem("App"):
            gr.Markdown("# Multimodal Product Classifier")
            gr.Markdown("Classify products using either text, images, or both.")

            with gr.Row():
                # Left column: mode selector, text/image inputs, and the
                # classify button.
                with gr.Column(scale=1):
                    with gr.Column(variant="panel"):
                        gr.Markdown("### βš™οΈ Classification Inputs")

                        # Drives both which inputs are visible (via
                        # update_inputs) and which sample output predict returns.
                        mode_radio = gr.Radio(
                            choices=["Multimodal", "Text Only", "Image Only"],
                            value="Multimodal",
                            label="Choose Classification Mode",
                        )

                        text_input = gr.Textbox(
                            label="Product Description",
                            placeholder="e.g., Apple iPhone 15 Pro Max 256GB",
                        )
                        # type="filepath" so predict receives a path string,
                        # not a numpy array.
                        image_input = gr.Image(
                            label="Product Image", type="filepath", visible=True
                        )

                    classify_btn = gr.Button("πŸš€ Classify Product", variant="primary")

                # Right column: prediction results plus collapsible usage help.
                with gr.Column(scale=1):
                    with gr.Column(variant="panel"):
                        gr.Markdown("### πŸ“Š Classification Results")

                        # Renders the dict returned by predict as a ranked
                        # confidence bar chart (top 5 categories).
                        output_label = gr.Label(
                            label="Predicted Category", num_top_classes=5
                        )

                    with gr.Accordion("How to use this demo", open=False):
                        gr.Markdown(
                            """
                            This demo classifies a product based on its description and image.
                            - **Multimodal:** Uses both text and image for the most accurate prediction.
                            - **Text Only:** Uses only the product description.
                            - **Image Only:** Uses only the product image.
                            """
                        )

        with gr.TabItem("About"):
            gr.Markdown(
                """
                ### About the Project
                This project demonstrates a multimodal classification system trained on data from Best Buy. It uses a Multilayer Perceptron (MLP) model trained on pre-generated embeddings from a Text-based model (MiniLM-L6) and an Image-based model (ConvNeXtV2).
                """
            )

        with gr.TabItem("Architecture"):
            gr.Markdown(
                """
                ### Model Architecture
                This section would contain details about the MLP architecture, the embedding models used, and a diagram explaining the data flow.
                """
            )

    # Event listeners for conditional rendering: changing the mode radio
    # shows/hides the text and image inputs.
    mode_radio.change(
        fn=update_inputs, inputs=mode_radio, outputs=[text_input, image_input]
    )

    # Event listener for the classify button: run predict and display the
    # returned score dict in the Label component.
    classify_btn.click(
        fn=predict, inputs=[mode_radio, text_input, image_input], outputs=output_label
    )

# Start the local Gradio server (blocking call).
demo.launch()