# Spaces: iBrokeTheCode / Multimodal_Product_Classification (status: Sleeping)
# File size: 7,658 Bytes
# 9470ff7 (presumably the revision hash of this file)

import os
import warnings
from io import BytesIO

import numpy as np
import pandas as pd
import requests
from PIL import Image
from sklearn.model_selection import train_test_split

# 💬 NOTE: Suppress all warnings. This runs at import time and installs an
# "ignore" filter with no category/module restriction, so warnings raised by
# any library used after this module is imported are hidden as well.
warnings.filterwarnings("ignore")


def process_embeddings(df, col_name):
    """
    Expand a column of embedding vectors into one column per dimension.

    Args:
    - df (pd.DataFrame): The DataFrame containing the embeddings column.
    - col_name (str): The name of the column containing the embeddings. Each
      value is either the string form of a list (eg. "[-0.123, 0.456, ...]")
      or an already-parsed sequence.

    Returns:
    pd.DataFrame: A new DataFrame that keeps the index of `df`, has the
    original embeddings column removed and the columns "text_1" ... "text_N"
    appended, where N is the length of the longest embedding. The input
    DataFrame is left untouched.

    Raises:
    ValueError or SyntaxError: If a string value is not a valid Python literal.

    Steps:
    1. Parse the values in the specified column into sequences.
    2. Build one new column per embedding element.
    3. Remove the original embeddings column.

    Example:
    df_processed = process_embeddings(df, 'embeddings')
    """
    # Local import so the function stays self-contained.
    import ast

    def _parse(value):
        # literal_eval only accepts Python literals, so a malicious or corrupt
        # cell cannot execute arbitrary code the way eval() would.
        return ast.literal_eval(value) if isinstance(value, str) else value

    vectors = [_parse(value) for value in df[col_name]]

    # default=0 keeps an empty DataFrame from crashing on max() of nothing.
    width = max((len(vector) for vector in vectors), default=0)

    # 🔎 Example
    #    text_1   text_2   text_3
    # 0  -0.123   0.456   0.789
    # 1   0.321  -0.654   0.987
    #
    # Reusing df.index is essential: with a default RangeIndex the concat below
    # would align on labels and scatter rows whenever df was filtered/shuffled.
    embeddings_df = pd.DataFrame(
        vectors,
        columns=[f"text_{i + 1}" for i in range(width)],
        index=df.index,
    )
    result = pd.concat([df, embeddings_df], axis=1)

    # Remove the original "embeddings" column
    return result.drop(columns=[col_name])


def rename_image_embeddings(df):
    """
    Rename purely numeric column labels to "image_<n>".

    Args:
    - df (pd.DataFrame): The DataFrame containing columns to be renamed. Its
      columns are renamed in place.

    Returns:
    pd.DataFrame: The same DataFrame object, with renamed columns.

    Numeric labels may be strings ("0", "1", ...) or integers (0, 1, ...);
    every other label (eg. "label") is kept unchanged.

    Example:
    df_renamed = rename_image_embeddings(df)
    """
    # From 0    1    2   label  ➡️ image_0  image_1  image_2  label
    # str(col) lets integer labels through (they have no string methods), and
    # isdecimal() only matches characters that int() can actually convert.
    df.columns = [
        f"image_{int(col)}" if str(col).isdecimal() else col for col in df.columns
    ]

    return df


def preprocess_data(
    text_data,
    image_data,
    text_id="image_id",
    image_id="ImageName",
    embeddings_col="embeddings",
):
    """
    Build a single DataFrame that joins text embeddings with image embeddings.

    Args:
    - text_data (pd.DataFrame): DataFrame containing text data.
    - image_data (pd.DataFrame): DataFrame containing image data.
    - text_id (str): Column in `text_data` holding the join identifier.
    - image_id (str): Column in `image_data` holding the join identifier.
    - embeddings_col (str): Column in `text_data` holding the embeddings.

    Returns:
    pd.DataFrame: Merged and preprocessed DataFrame, without the id columns.

    This function:
    Expands the text embeddings and renames the image embedding columns.
    Drops rows whose identifier is missing on either side.
    Reduces each text identifier to the part after its last "/".
    Merges both dataframes on the identifiers.
    Drops the identifier columns.

    Example:
    merged_df = preprocess_data(text_df, image_df)
    """

    def _last_path_component(value):
        # "data/images/123.jpg" -> "123.jpg"; values without "/" are unchanged.
        return value.split("/")[-1]

    # Shape both inputs with the helper functions defined earlier in this file.
    text_frame = process_embeddings(text_data, embeddings_col)
    image_frame = rename_image_embeddings(image_data)

    # Rows without an identifier cannot take part in the join.
    image_frame = image_frame.dropna(subset=[image_id])
    text_frame = text_frame.dropna(subset=[text_id])

    # Text identifiers may be file paths; keep only the file name.
    text_frame[text_id] = text_frame[text_id].apply(_last_path_component)

    # Join text rows with image rows that share the same identifier.
    merged = pd.merge(text_frame, image_frame, left_on=text_id, right_on=image_id)

    # The identifiers have served their purpose once the join is done.
    return merged.drop(columns=[image_id, text_id])


class ImageDownloader:
    """
    Image downloader class to download images from URLs.

    Args:
    - image_dir (str): Directory to save images. Created if it does not exist.
    - image_size (tuple): Size the images are resized to before being saved.
    - overwrite (bool): Whether to download again images that already exist.
    - timeout (float): Seconds to wait for each HTTP request before giving up.

    Methods:
    - download_images(df, print_every=1000): Download images from URLs in a DataFrame.
        Args:
        - df (pd.DataFrame): DataFrame with a "sku" and an "image" (URL) column.
        - print_every (int): Print progress every n images (0 disables it).
        Returns:
        pd.DataFrame: The same DataFrame with an "image_path" column added;
        rows whose download failed get NaN.

    Example:
    downloader = ImageDownloader()
    df = downloader.download_images(df)
    """

    # Modes that are saved without conversion; anything else (eg. RGBA, P)
    # is converted to RGB first because the files are written as ".jpg".
    _JPEG_READY_MODES = ("RGB", "L", "CMYK")

    def __init__(
        self,
        image_dir="data/images/",
        image_size=(224, 224),
        overwrite=False,
        timeout=30,
    ):
        self.image_dir = image_dir
        self.image_size = image_size
        self.overwrite = overwrite
        self.timeout = timeout

        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(self.image_dir, exist_ok=True)

    def download_images(self, df, print_every=1000):
        """
        Download, resize and save the image of every row in `df`.

        Images already on disk are reused unless `overwrite` is set. A failed
        download is reported and recorded as NaN instead of aborting the run.
        """
        image_paths = []
        total = len(df)

        # enumerate advances the counter on every row, so progress keeps printing
        for i, (_, row) in enumerate(df.iterrows()):
            if print_every and i % print_every == 0:
                print(f"Downloading image {i}/{total}")

            sku = row["sku"]
            image_url = row["image"]
            image_path = os.path.join(self.image_dir, f"{sku}.jpg")

            if os.path.exists(image_path) and not self.overwrite:
                print(f"Image {sku} is already in the path.")
                image_paths.append(image_path)
                continue

            try:
                # Without a timeout a stalled server would block the whole batch.
                response = requests.get(image_url, timeout=self.timeout)
                response.raise_for_status()
                with Image.open(BytesIO(response.content)) as source:
                    if source.mode in self._JPEG_READY_MODES:
                        img = source
                    else:
                        img = source.convert("RGB")
                    img = img.resize(self.image_size, Image.Resampling.LANCZOS)
                img.save(image_path)
                image_paths.append(image_path)
            except Exception as e:
                # Deliberate best-effort: one bad URL/image must not stop the batch.
                print(f"Could not download image for SKU: {sku}. Error: {e}")
                image_paths.append(np.nan)

        df["image_path"] = image_paths
        return df


def train_test_split_and_feature_extraction(df, test_size=0.3, random_state=42):
    """
    Split the data into train/test sets and list the feature and label columns.

    Args:
    - df (pd.DataFrame): DataFrame containing the data.

    Keyword Args:
    - test_size (float): Size of the test set.
    - random_state (int): Random state for reproducibility

    Returns:
    pd.DataFrame: Train DataFrame.
    pd.DataFrame: Test DataFrame.
    list: Names of the columns starting with "text_" (text embeddings).
    list: Names of the columns starting with "image_" (image embeddings).
    list: Names of the label columns, always ["class_id"].

    Example:
    train_df, test_df, text_columns, image_columns, label_columns = train_test_split_and_feature_extraction(df)
    """
    # Partition the rows first; the column lists below are taken from the full df.
    split_options = {"test_size": test_size, "random_state": random_state}
    train_df, test_df = train_test_split(df, **split_options)

    # Group the column names by prefix (each result is a list, even with one match).
    text_columns = []
    for name in df.columns:
        if name.startswith("text_"):
            text_columns.append(name)

    image_columns = []
    for name in df.columns:
        if name.startswith("image_"):
            image_columns.append(name)

    # The class label lives in a single, fixed column.
    label_columns = ["class_id"]

    return train_df, test_df, text_columns, image_columns, label_columns