File size: 7,658 Bytes
9470ff7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import os
import warnings
from io import BytesIO

import numpy as np
import pandas as pd
import requests
from PIL import Image
from sklearn.model_selection import train_test_split

# NOTE: Suppress all warnings. This installs a process-wide "ignore" filter as
# a side effect of importing this module, so warnings emitted by any library
# after the import are hidden too, not only those raised from this file.
warnings.filterwarnings("ignore")


def process_embeddings(df, col_name):
    """
    Expand a column of serialized embedding vectors into one column per element.

    Args:
    - df (pd.DataFrame): The DataFrame containing the embeddings column.
    - col_name (str): The name of the column containing the embeddings. Each
      value is either a string holding a Python list literal
      (e.g. "[-0.123, 0.456, ...]") or an already-parsed sequence.

    Returns:
    pd.DataFrame: A new DataFrame holding every original column except
    `col_name`, followed by `text_1` ... `text_n`, where n is the length of
    the longest vector. The row index of `df` is preserved and `df` itself is
    not modified.

    Raises:
    ValueError, SyntaxError: If a string value is not a valid Python literal.

    Steps:
    1. Parse the values in the specified column into sequences.
    2. Build one new column per vector element, aligned on the original index.
    3. Return the input columns (minus the embeddings column) plus the new ones.

    Example:
    df_processed = process_embeddings(df, 'embeddings')
    """
    # Local import keeps this function self-contained; `ast.literal_eval` only
    # accepts literals, so a crafted cell cannot execute code the way `eval` can.
    import ast

    def _parse(value):
        # Values that are already sequences are passed through unchanged so the
        # function can be applied to partially processed data.
        return ast.literal_eval(value) if isinstance(value, str) else value

    vectors = [_parse(value) for value in df[col_name]]

    # `default=0` keeps an empty DataFrame from failing on max() of nothing.
    width = max((len(vector) for vector in vectors), default=0)

    """ 🔎 Example
    text_1   text_2   text_3
    0  -0.123   0.456   0.789
    1   0.321  -0.654   0.987
    """
    # Reuse the caller's index: with a fresh RangeIndex, concat(axis=1) would
    # align on labels and scatter the embeddings onto the wrong (or new) rows
    # whenever `df` has been filtered or re-indexed.
    embeddings_df = pd.DataFrame(
        vectors,
        index=df.index,
        columns=[f"text_{i + 1}" for i in range(width)],
    )

    # Drop the raw embeddings column and append the expanded ones.
    return pd.concat([df.drop(columns=[col_name]), embeddings_df], axis=1)


def rename_image_embeddings(df):
    """
    Rename numeric column labels of a DataFrame to `image_<n>`.

    Args:
    - df (pd.DataFrame): The DataFrame containing columns to be renamed.
      Labels may be digit-only strings ("0", "1", ...) or non-negative
      integers (0, 1, ...); every other label is left as it is.

    Returns:
    pd.DataFrame: The same DataFrame object, with its columns renamed in place.

    Example:
    df_renamed = rename_image_embeddings(df)
    """

    def _is_numeric_label(label):
        # bool is a subclass of int but is never an embedding position.
        if isinstance(label, (bool, np.bool_)):
            return False
        # Integer labels appear when the frame was built in memory instead of
        # being read from CSV; calling .isdigit() on them would raise.
        if isinstance(label, (int, np.integer)):
            return label >= 0
        return isinstance(label, str) and label.isdigit()

    # From 0    1    2   label  ➡️ image_0  image_1  image_2  label
    df.columns = [
        f"image_{int(label)}" if _is_numeric_label(label) else label
        for label in df.columns
    ]

    return df


def preprocess_data(
    text_data,
    image_data,
    text_id="image_id",
    image_id="ImageName",
    embeddings_col="embeddings",
):
    """
    Preprocess and merge text and image dataframes.

    Args:
    - text_data (pd.DataFrame): DataFrame containing text data.
    - image_data (pd.DataFrame): DataFrame containing image data.
    - text_id (str): Column name for text data identifier.
    - image_id (str): Column name for image data identifier.
    - embeddings_col (str): Column name for embeddings data.

    Returns:
    pd.DataFrame: Merged and preprocessed DataFrame, without the two
    identifier columns.

    This function:
    Process text embeddings and rename image embedding columns.
    Drop rows whose identifier is missing.
    Reduce string text ids that look like paths to their last "/" component.
    Merge dataframes using the ids (pd.merge default join).
    Drop the identifier columns.

    Example:
    merged_df = preprocess_data(text_df, image_df)
    """
    # Call previous functions to tune the text and image dataframes
    text_data = process_embeddings(text_data, embeddings_col)
    image_data = rename_image_embeddings(image_data)

    # Rows without an id cannot be joined, so discard them on both sides.
    image_data = image_data.dropna(subset=[image_id])
    text_data = text_data.dropna(subset=[text_id])

    def _basename(value):
        # Only strings can hold a path such as "data/images/123.jpg"; other id
        # types (e.g. integers) are kept unchanged instead of raising on .split.
        return value.split("/")[-1] if isinstance(value, str) else value

    text_data[text_id] = text_data[text_id].apply(_basename)

    # Join text and image embeddings on their respective id columns.
    df = pd.merge(text_data, image_data, left_on=text_id, right_on=image_id)

    # The id columns are no longer needed once the rows are matched.
    df.drop([image_id, text_id], axis=1, inplace=True)

    return df


class ImageDownloader:
    """
    Image downloader class to download images from URLs.

    Args:
    - image_dir (str): Directory to save images. Created if it does not exist.
    - image_size (tuple): Size (width, height) of the images to be saved.
    - overwrite (bool): Whether to re-download images that already exist on disk.
    - timeout (float): Seconds to wait for the server before giving up on an
      image. Without a timeout a stalled server would block the run forever.

    Methods:
    - download_images(df, print_every=1000): Download images from URLs in a DataFrame.
        Args:
        - df (pd.DataFrame): DataFrame with a "sku" column (used as file name)
          and an "image" column (the URL).
        - print_every (int): Print progress every n images; 0 or None
          disables progress output.
        Returns:
        pd.DataFrame: The same DataFrame with an "image_path" column added;
        rows whose download failed get NaN.

    Example:
    downloader = ImageDownloader()
    df = downloader.download_images(df)
    """

    # Modes Pillow's JPEG writer accepts; anything else (RGBA, P, LA, ...)
    # must be converted before saving to a .jpg file.
    _JPEG_MODES = ("RGB", "L", "CMYK")

    def __init__(
        self,
        image_dir="data/images/",
        image_size=(224, 224),
        overwrite=False,
        timeout=30,
    ):
        self.image_dir = image_dir
        self.image_size = image_size
        self.overwrite = overwrite
        self.timeout = timeout

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.image_dir, exist_ok=True)

    def download_images(self, df, print_every=1000):
        # Bulk download images from a DataFrame of URLs, resize them to a
        # standard format, and add their local paths back to the DataFrame.
        image_paths = []
        total = len(df)

        # enumerate keeps the counter advancing on every row, so progress is
        # reported at each multiple of print_every rather than only once.
        for i, (sku, image_url) in enumerate(zip(df["sku"], df["image"])):
            if print_every and i % print_every == 0:
                print(f"Downloading image {i}/{total}")

            image_path = os.path.join(self.image_dir, f"{sku}.jpg")

            if os.path.exists(image_path) and not self.overwrite:
                print(f"Image {sku} is already in the path.")
                image_paths.append(image_path)
                continue

            # Best effort by design: one broken URL or corrupt image must not
            # abort the whole batch, so any failure is reported and recorded
            # as a missing path.
            try:
                response = requests.get(image_url, timeout=self.timeout)
                response.raise_for_status()
                with Image.open(BytesIO(response.content)) as img:
                    # Convert before resizing so images with alpha or a
                    # palette can be written as JPEG instead of failing.
                    if img.mode not in self._JPEG_MODES:
                        img = img.convert("RGB")
                    img = img.resize(self.image_size, Image.Resampling.LANCZOS)
                img.save(image_path)
                image_paths.append(image_path)
            except Exception as e:
                print(f"Could not download image for SKU: {sku}. Error: {e}")
                image_paths.append(np.nan)

        df["image_path"] = image_paths
        return df


def train_test_split_and_feature_extraction(df, test_size=0.3, random_state=42):
    """
    Partition a DataFrame into train/test subsets and name its feature columns.

    Args:
    - df (pd.DataFrame): DataFrame containing the data.

    Keyword Args:
    - test_size (float): Size of the test set, forwarded to train_test_split.
    - random_state (int): Random state forwarded to train_test_split.

    Returns:
    pd.DataFrame: Train DataFrame.
    pd.DataFrame: Test DataFrame.
    list: Names of the columns starting with "text_".
    list: Names of the columns starting with "image_".
    list: Names of the label columns (always ["class_id"]).

    Example:
    train_df, test_df, text_columns, image_columns, label_columns = train_test_split_and_feature_extraction(df)
    """

    def _columns_with_prefix(prefix):
        # Always a list, even when zero or one column matches.
        matches = []
        for name in df.columns:
            if name.startswith(prefix):
                matches.append(name)
        return matches

    # Split first, exactly as requested by test_size / random_state.
    splits = train_test_split(df, test_size=test_size, random_state=random_state)
    train_df, test_df = splits

    # Feature groups are identified purely by column-name prefix.
    text_columns = _columns_with_prefix("text_")
    image_columns = _columns_with_prefix("image_")

    # The label column name is fixed.
    label_columns = ["class_id"]

    return train_df, test_df, text_columns, image_columns, label_columns