import os import warnings from io import BytesIO import numpy as np import pandas as pd import requests from PIL import Image from sklearn.model_selection import train_test_split # 💬 NOTE: Suppress all warnings warnings.filterwarnings("ignore") def process_embeddings(df, col_name): """ Process embeddings in a DataFrame column. Args: - df (pd.DataFrame): The DataFrame containing the embeddings column. - col_name (str): The name of the column containing the embeddings. Returns: pd.DataFrame: The DataFrame with processed embeddings. Steps: 1. Convert the values in the specified column to lists. 2. Extract values from lists and create new columns for each element. 3. Remove the original embeddings column. Example: df_processed = process_embeddings(df, 'embeddings') """ # Convert the values (eg. "[-0.123, 0.456, ...]") in the column to lists df[col_name] = df[col_name].apply(eval) # Extract values from lists and create new columns """ 🔎 Example text_1 text_2 text_3 0 -0.123 0.456 0.789 1 0.321 -0.654 0.987 """ embeddings_df = pd.DataFrame( df[col_name].to_list(), columns=[f"text_{i + 1}" for i in range(df[col_name].str.len().max())], ) df = pd.concat([df, embeddings_df], axis=1) # Remove the original "embeddings" column df = df.drop(columns=[col_name]) return df def rename_image_embeddings(df): """ Rename columns in a DataFrame for image embeddings. Args: - df (pd.DataFrame): The DataFrame containing columns to be renamed. Returns: pd.DataFrame: The DataFrame with renamed columns. Example: df_renamed = rename_image_embeddings(df) """ # From 0 1 2 label ➡️ image_0 image_1 image_2 label df.columns = [f"image_{int(col)}" if col.isdigit() else col for col in df.columns] return df def preprocess_data( text_data, image_data, text_id="image_id", image_id="ImageName", embeddings_col="embeddings", ): """ Preprocess and merge text and image dataframes. Args: - text_data (pd.DataFrame): DataFrame containing text data. - image_data (pd.DataFrame): DataFrame containing image data. - text_id (str): Column name for text data identifier. - image_id (str): Column name for image data identifier. - embeddings_col (str): Column name for embeddings data. Returns: pd.DataFrame: Merged and preprocessed DataFrame. This function: Process text and image embeddings. Convert image_id and text_id values to integers. Merge dataframes using id. Drop unnecessary columns. Example: merged_df = preprocess_data(text_df, image_df) """ # Call previous functions to tune the text and image dataframes text_data = process_embeddings(text_data, embeddings_col) image_data = rename_image_embeddings(image_data) # Drop missing values in image id - Removes rows where the ID (used to join text ↔ image) is missing. image_data = image_data.dropna(subset=[image_id]) text_data = text_data.dropna(subset=[text_id]) # Cleans up text IDs: if the column contains file paths (like "data/images/123.jpg"), it extracts just the file name ("123.jpg"). text_data[text_id] = text_data[text_id].apply(lambda x: x.split("/")[-1]) # Merge dataframes using image_id - Joins text and image embeddings using the IDs (text_id vs image_id). df = pd.merge(text_data, image_data, left_on=text_id, right_on=image_id) # Drop unnecessary columns - Removes the original ID columns since they’re no longer needed after the merge. df.drop([image_id, text_id], axis=1, inplace=True) return df class ImageDownloader: """ Image downloader class to download images from URLs. Args: - image_dir (str): Directory to save images. - image_size (tuple): Size of the images to be saved. - override (bool): Whether to override existing images. Methods: - download_images(df, print_every=1000): Download images from URLs in a DataFrame. Args: - df (pd.DataFrame): DataFrame containing image URLs. - print_every (int): Print progress every n images. Returns: pd.DataFrame: DataFrame with image paths added. Example: downloader = ImageDownloader() df = downloader.download_images(df) """ def __init__( self, image_dir="data/images/", image_size=(224, 224), overwrite=False ): self.image_dir = image_dir self.image_size = image_size self.overwrite = overwrite # Create the directory if it doesn't exist if not os.path.exists(self.image_dir): os.makedirs(self.image_dir) def download_images(self, df, print_every=1000): # Bulk download images from a DataFrame of URLs, resize them to a standard format, and add their local paths back to the DataFrame. image_paths = [] i = 0 for index, row in df.iterrows(): if i % print_every == 0: print(f"Downloading image {i}/{len(df)}") i += 1 sku = row["sku"] image_url = row["image"] image_path = os.path.join(self.image_dir, f"{sku}.jpg") if os.path.exists(image_path) and not self.overwrite: print(f"Image {sku} is already in the path.") image_paths.append(image_path) continue try: response = requests.get(image_url) response.raise_for_status() img = Image.open(BytesIO(response.content)) img = img.resize(self.image_size, Image.Resampling.LANCZOS) img.save(image_path) # print(f"Downloaded image for SKU: {sku}") image_paths.append(image_path) except Exception as e: print(f"Could not download image for SKU: {sku}. Error: {e}") image_paths.append(np.nan) df["image_path"] = image_paths return df def train_test_split_and_feature_extraction(df, test_size=0.3, random_state=42): """ Split the data into train and test sets and extract features and labels. Args: - df (pd.DataFrame): DataFrame containing the data. Keyword Args: - test_size (float): Size of the test set. - random_state (int): Random state for reproducibility Returns: pd.DataFrame: Train DataFrame. pd.DataFrame: Test DataFrame. list: List of columns with text embeddings. list: List of columns with image embeddings. list: List of columns with class labels. Example: train_df, test_df, text_columns, image_columns, label_columns = train_test_split_and_feature_extraction(df) """ # Split the data into train and test sets setting using the test_size and random_state parameters train_df, test_df = train_test_split( df, test_size=test_size, random_state=random_state ) # Select the name of the columns with the text embeddings and return it as a list (Even if there is only one column) text_columns = [col for col in df.columns if col.startswith("text_")] # Select the name of the columns with the image embeddings and return it as a list (Even if there is only one column) image_columns = [col for col in df.columns if col.startswith("image_")] # Select the name of the column with the class labels and return it as a list (Even if there is only one column) label_columns = ["class_id"] return train_df, test_df, text_columns, image_columns, label_columns