iBrokeTheCode committed on
Commit 9470ff7 · 1 Parent(s): d7c8166

chore: Add source code for training

requirements_train.txt ADDED
@@ -0,0 +1,15 @@
+ pandas~=1.5.0
+ numpy~=1.23.3
+ pillow==10.4.0
+ requests==2.26.0
+ matplotlib==3.4.2
+ seaborn==0.13.2
+ plotly==5.23.0
+ pytest==8.3.3
+ scikit-learn==0.24.2
+ torch==2.0.0
+ tensorflow==2.10.0
+ transformers==4.44.2
+ openai==1.37.0
+ python-dotenv==1.0.1
+ tensorflow-gpu==2.10.0
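
One note on the pins: `tensorflow` and `tensorflow-gpu` pin the same release, and since TF 2.x the plain `tensorflow` wheel already ships GPU support, so the extra pin is redundant. A minimal setup sketch, assuming this file sits at the repository root:

```python
# Hypothetical setup sketch: install the pinned training dependencies.
import subprocess
import sys

subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-r", "requirements_train.txt"]
)
```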
src/__init__.py ADDED
File without changes
src/classifiers_classic_ml.py ADDED
@@ -0,0 +1,298 @@
+ import warnings
+ from itertools import cycle
+
+ import matplotlib
+
+ # 💬 NOTE: Handle plotting issues when running tests or displaying in notebooks
+ try:
+     get_ipython  # Only defined inside Jupyter/IPython
+     matplotlib.use("module://matplotlib_inline.backend_inline")
+ except Exception:
+     matplotlib.use("Agg")  # Headless backend so the test suite can run without a display
+
+ import matplotlib.pyplot as plt
+ import pandas as pd
+ import plotly.express as px
+ import seaborn as sns
+ from sklearn.decomposition import PCA
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.manifold import TSNE
+ from sklearn.metrics import (
+     accuracy_score,
+     auc,
+     classification_report,
+     confusion_matrix,
+     f1_score,
+     precision_score,
+     recall_score,
+     roc_curve,
+ )
+
+ warnings.filterwarnings("ignore")
+
+
+ def visualize_embeddings(
+     X_train, X_test, y_train, y_test, plot_type="2D", method="PCA"
+ ):
+     """
+     Visualizes high-dimensional embeddings (e.g., text or image embeddings) using
+     dimensionality reduction (PCA or t-SNE) and plots the result in 2D or 3D with
+     Plotly for interactive exploration.
+
+     Args:
+         X_train (np.ndarray): Training embeddings of shape (n_samples, n_features).
+         X_test (np.ndarray): Test embeddings of shape (n_samples, n_features).
+         y_train (np.ndarray): True labels for the training data.
+         y_test (np.ndarray): True labels for the test data.
+         plot_type (str, optional): Type of plot to generate, either '2D' or '3D'. Default is '2D'.
+         method (str, optional): Dimensionality reduction method, either 'PCA' or 't-SNE'. Default is 'PCA'.
+
+     Returns:
+         The fitted reducer (a PCA or TSNE instance), so it can be reused on new data.
+
+     Side Effects:
+         - Displays an interactive 2D or 3D scatter plot of the reduced embeddings,
+           with points colored by their class labels.
+
+     Notes:
+         - PCA is a linear method; t-SNE is non-linear and can capture more complex structure.
+         - PCA is fitted on the training data and applied to the test data. t-SNE has no
+           separate transform step, so it is fitted directly on the test data.
+         - Perplexity is set to 10 for t-SNE and can be tuned for better cluster separation.
+         - Raises a `ValueError` if an invalid method is specified.
+
+     Example:
+         visualize_embeddings(X_train, X_test, y_train, y_test, plot_type='3D', method='t-SNE')
+
+     Visualization Details:
+         - 3D plots use axes labeled 'col1', 'col2', and 'col3'; 2D plots use 'col1' and 'col2'.
+         - Class labels are represented by different colors in the scatter plots.
+     """
+     perplexity = 10
+
+     if plot_type == "3D":
+         if method == "PCA":
+             # Fit PCA for 3D visualization on the training data
+             red = PCA(n_components=3)
+             red.fit(X_train)
+
+             # Use the fitted reducer to transform the test data
+             reduced_embeddings = red.transform(X_test)
+         elif method == "t-SNE":
+             # t-SNE for 3D visualization
+             red = TSNE(
+                 n_components=3, perplexity=perplexity, random_state=42, init="pca"
+             )
+
+             # t-SNE has no transform(); fit and embed the test data in one step
+             reduced_embeddings = red.fit_transform(X_test)
+         else:
+             raise ValueError("Invalid method. Please choose either 'PCA' or 't-SNE'.")
+
+         df_reduced = pd.DataFrame(reduced_embeddings, columns=["col1", "col2", "col3"])
+         df_reduced["Class"] = y_test
+
+         # 3D scatter plot
+         fig = px.scatter_3d(
+             df_reduced, x="col1", y="col2", z="col3", color="Class", title="3D"
+         )
+
+     else:  # 2D
+         if method == "PCA":
+             # Fit PCA for 2D visualization on the training data
+             red = PCA(n_components=2)
+             red.fit(X_train)
+
+             # Use the fitted reducer to transform the test data
+             reduced_embeddings = red.transform(X_test)
+         elif method == "t-SNE":
+             # t-SNE for 2D visualization
+             red = TSNE(
+                 n_components=2, perplexity=perplexity, random_state=42, init="pca"
+             )
+
+             # t-SNE has no transform(); fit and embed the test data in one step
+             reduced_embeddings = red.fit_transform(X_test)
+         else:
+             raise ValueError("Invalid method. Please choose either 'PCA' or 't-SNE'.")
+
+         df_reduced = pd.DataFrame(reduced_embeddings, columns=["col1", "col2"])
+         df_reduced["Class"] = y_test
+
+         # 2D scatter plot
+         fig = px.scatter(df_reduced, x="col1", y="col2", color="Class", title="2D")
+
+     fig.update_layout(title=f"Embeddings - {method} {plot_type} Visualization")
+
+     fig.show()
+
+     return red
+
+
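
As an aside, the function above can be smoke-tested end to end on synthetic data; a minimal sketch, where the blob shapes and class count are illustrative assumptions rather than repository fixtures:

```python
# Hypothetical smoke test for visualize_embeddings on synthetic blobs.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

from src.classifiers_classic_ml import visualize_embeddings

X, y = make_blobs(n_samples=300, centers=3, n_features=64, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Returns the fitted PCA instance; swap method="t-SNE" for a non-linear view
pca = visualize_embeddings(X_train, X_test, y_train, y_test, plot_type="2D", method="PCA")
print(pca.explained_variance_ratio_)
```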
+ def test_model(X_test, y_test, model):
+     """
+     Evaluates a trained model on a test set by computing key performance metrics and visualizing the results.
+
+     The function generates a confusion matrix, plots ROC curves (for binary or multi-class classification),
+     and prints the classification report. It also computes overall accuracy, weighted precision, weighted recall,
+     and weighted F1-score on the test data.
+
+     Args:
+         X_test (np.ndarray): Test set feature data.
+         y_test (np.ndarray): True labels for the test set.
+         model (sklearn-like model): A trained machine learning model with `predict` and `predict_proba` methods.
+
+     Returns:
+         tuple:
+             - accuracy (float): Overall accuracy of the model on the test set.
+             - precision (float): Weighted precision score across all classes.
+             - recall (float): Weighted recall score across all classes.
+             - f1 (float): Weighted F1-score across all classes.
+
+     Side Effects:
+         - Displays a confusion matrix as a heatmap.
+         - Plots ROC curves for binary or multi-class classification.
+         - Prints the classification report with precision, recall, F1-score, and support for each class.
+
+     Example:
+         accuracy, precision, recall, f1 = test_model(X_test, y_test, trained_model)
+
+     Notes:
+         - If `y_test` is multi-dimensional (e.g., one-hot encoded), it is squeezed to 1D.
+         - For binary classification, a single ROC curve is plotted. For multi-class classification,
+           an ROC curve is plotted for each class with a unique color.
+         - Weighted precision, recall, and F1-score are computed to handle class imbalance.
+     """
+     y_pred = model.predict(X_test)
+     y_pred_proba = model.predict_proba(X_test)
+     y_test = y_test.squeeze() if y_test.ndim > 1 else y_test
+
+     # Confusion matrix
+     cm = confusion_matrix(y_test, y_pred)
+     plt.figure(figsize=(10, 5))
+     sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
+     plt.xlabel("Predicted")
+     plt.ylabel("True")
+     plt.title("Confusion Matrix")
+     plt.show()
+
+     # ROC curve
+     fig, ax = plt.subplots(figsize=(6, 6))
+
+     # Binary classification
+     if y_pred_proba.shape[1] == 2:
+         fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, 1])
+         ax.plot(
+             fpr,
+             tpr,
+             color="aqua",
+             lw=2,
+             label=f"ROC curve (area = {auc(fpr, tpr):.2f})",
+         )
+         ax.plot([0, 1], [0, 1], "k--", label="Chance level (AUC = 0.5)")
+     # Multiclass classification
+     else:
+         y_onehot_test = pd.get_dummies(y_test).values
+         colors = cycle(
+             [
+                 "aqua",
+                 "darkorange",
+                 "cornflowerblue",
+                 "red",
+                 "green",
+                 "yellow",
+                 "purple",
+                 "pink",
+                 "brown",
+                 "black",
+             ]
+         )
+
+         for class_id, color in zip(range(y_onehot_test.shape[1]), colors):
+             fpr, tpr, _ = roc_curve(
+                 y_onehot_test[:, class_id], y_pred_proba[:, class_id]
+             )
+             ax.plot(
+                 fpr,
+                 tpr,
+                 color=color,
+                 lw=2,
+                 label=f"ROC curve for class {class_id} (area = {auc(fpr, tpr):.2f})",
+             )
+
+         ax.plot([0, 1], [0, 1], "k--", label="Chance level (AUC = 0.5)")
+
+     ax.set_axisbelow(True)
+     ax.set_xlabel("False Positive Rate")
+     ax.set_ylabel("True Positive Rate")
+     ax.set_title("ROC Curve")
+     ax.legend(loc="lower right")
+     plt.show()
+
+     cr = classification_report(y_test, y_pred)
+     print(cr)
+
+     accuracy = accuracy_score(y_test, y_pred)
+     precision = precision_score(y_test, y_pred, average="weighted")
+     recall = recall_score(y_test, y_pred, average="weighted")
+     f1 = f1_score(y_test, y_pred, average="weighted")
+
+     return accuracy, precision, recall, f1
+
+
+ def train_and_evaluate_model(X_train, X_test, y_train, y_test, models=None, test=True):
+     """
+     Trains and evaluates multiple machine learning models on a given dataset, visualizing the data
+     embeddings with PCA before training. Each model is fitted on the training data, evaluated on the
+     test data, and scored with accuracy, precision, recall, and F1.
+
+     Args:
+         X_train (np.ndarray): Feature matrix for the training data.
+         X_test (np.ndarray): Feature matrix for the test data.
+         y_train (np.ndarray): True labels for the training data.
+         y_test (np.ndarray): True labels for the test data.
+         models (list of tuples, optional): A list of (name, estimator) tuples, where each estimator is
+             a scikit-learn model instance. If None, defaults to Random Forest and Logistic Regression.
+         test (bool, optional): Whether to evaluate each model on the test set. Default is True.
+
+     Returns:
+         list: A list of (name, trained model) tuples.
+
+     Side Effects:
+         - Displays a PCA 2D visualization of the embeddings via `visualize_embeddings`.
+         - Trains each model on the training set.
+         - Prints evaluation metrics (accuracy, precision, recall, F1-score) for each model on the test set.
+         - Displays the confusion matrix and ROC curve for each model via `test_model`.
+
+     Example:
+         models = train_and_evaluate_model(X_train, X_test, y_train, y_test)
+
+     Notes:
+         - The `models` argument can be customized with any scikit-learn classifiers.
+         - PCA is used for the embedding visualization; adjust the `visualize_embeddings` call for
+           other visualization methods or dimensionality reduction techniques.
+     """
+
+     visualize_embeddings(X_train, X_test, y_train, y_test, plot_type="2D", method="PCA")
+
+     if not models:
+         # Default ML models
+         models = [
+             (
+                 "Random Forest",
+                 RandomForestClassifier(n_estimators=100, random_state=42),
+             ),
+             ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
+         ]
+
+     for name, model in models:
+         print("#" * 20, f" {name} ", "#" * 20)
+         # Train the model on the training set
+         model.fit(X_train, y_train)
+
+         # Evaluate the model on the test set using the test_model function
+         if test:
+             accuracy, precision, recall, f1 = test_model(X_test, y_test, model)
+
+     return models
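
A quick end-to-end run of the classic-ML pipeline; the dataset, shapes, and class count below are illustrative assumptions, not repository fixtures:

```python
# Hypothetical end-to-end run on synthetic embeddings.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from src.classifiers_classic_ml import train_and_evaluate_model

X, y = make_classification(
    n_samples=500, n_features=32, n_informative=10, n_classes=3, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

trained = train_and_evaluate_model(X_train, X_test, y_train, y_test)
for name, model in trained:
    print(name, model.score(X_test, y_test))
```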
src/classifiers_mlp.py ADDED
@@ -0,0 +1,522 @@
+ import os
+ from itertools import cycle
+
+ import matplotlib
+ import tensorflow as tf
+
+ # 💬 NOTE: Handle plotting issues when running tests or displaying in notebooks
+ try:
+     get_ipython  # Only defined inside Jupyter/IPython
+     matplotlib.use("module://matplotlib_inline.backend_inline")
+ except Exception:
+     matplotlib.use("Agg")  # Headless backend so the test suite can run without a display
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import seaborn as sns
+ from sklearn.metrics import (
+     accuracy_score,
+     classification_report,
+     confusion_matrix,
+     f1_score,
+     precision_score,
+     recall_score,
+     roc_auc_score,
+     roc_curve,
+ )
+ from sklearn.preprocessing import LabelEncoder
+ from sklearn.utils.class_weight import compute_class_weight
+ from tensorflow.keras import Input, Model
+ from tensorflow.keras.callbacks import EarlyStopping
+ from tensorflow.keras.layers import BatchNormalization, Concatenate, Dense, Dropout
+ from tensorflow.keras.losses import CategoricalCrossentropy
+ from tensorflow.keras.optimizers import SGD, Adam
+ from tensorflow.keras.utils import Sequence
+
+
+ class MultimodalDataset(Sequence):
+     """
+     Custom Keras dataset class for multimodal data, designed for models that take
+     both text and image data as inputs. It handles batching and shuffling of data
+     for efficient training of Keras models.
+
+     This class supports loading and batching multimodal data (text and images), as well as
+     label encoding. It is compatible with Keras and can be used to train models that require
+     both text and image inputs. It also supports optional shuffling at the end of each epoch.
+
+     Args:
+         df (pd.DataFrame): The DataFrame containing the dataset with text, image, and label columns.
+         text_cols (list): Column names for text data. Can be a single column or multiple columns.
+         image_cols (list): Column names for image data (usually file paths or image pixel data).
+         label_col (str): Column name of the target labels.
+         encoder (LabelEncoder, optional): A pre-fitted LabelEncoder for encoding the labels.
+             If None, a new LabelEncoder is fitted on the provided data.
+         batch_size (int, optional): Number of samples per batch. Default is 32.
+         shuffle (bool, optional): Whether to shuffle the dataset at the end of each epoch. Default is True.
+
+     Attributes:
+         text_data (np.ndarray): Text data from the DataFrame. None if `text_cols` is not provided.
+         image_data (np.ndarray): Image data from the DataFrame. None if `image_cols` is not provided.
+         labels (np.ndarray): One-hot encoded labels for the dataset's classes.
+         encoder (LabelEncoder): Fitted LabelEncoder used to encode target labels.
+         batch_size (int): Number of samples per batch.
+         shuffle (bool): Whether to shuffle the data after each epoch.
+         indices (np.ndarray): Index array over the dataset, used for shuffling batches.
+
+     Methods:
+         __len__():
+             Returns the number of batches per epoch based on the dataset size and batch size.
+         __getitem__(idx):
+             Retrieves a single batch of data, including text and/or image inputs and labels.
+             Returns a tuple ({'text': text_batch, 'image': image_batch}, label_batch), where
+             'text' and 'image' are only included if their respective columns were provided.
+         on_epoch_end():
+             Updates the index order after each epoch, shuffling if needed.
+     """
+
+     def __init__(
+         self,
+         df,
+         text_cols,
+         image_cols,
+         label_col,
+         encoder=None,
+         batch_size=32,
+         shuffle=True,
+     ):
+         """
+         Initializes the MultimodalDataset object.
+
+         Args:
+             df (pd.DataFrame): The dataset as a DataFrame, containing text, image, and label data.
+             text_cols (list): Column names of the text features.
+             image_cols (list): Column names of the image features (e.g., file paths or pixel data).
+             label_col (str): Column name of the target labels.
+             encoder (LabelEncoder, optional): LabelEncoder for the target labels. If None, a new one is fitted.
+             batch_size (int, optional): Batch size for loading data. Default is 32.
+             shuffle (bool, optional): Whether to shuffle the data at the end of each epoch. Default is True.
+
+         Raises:
+             ValueError: If both text_cols and image_cols are None or empty.
+         """
+         if text_cols:
+             # Get the text data from the DataFrame as a NumPy array
+             self.text_data = df[text_cols].astype(np.float32).values
+         else:
+             self.text_data = None
+
+         if image_cols:
+             # Get the image data from the DataFrame as a NumPy array
+             self.image_data = df[image_cols].astype(np.float32).values
+         else:
+             self.image_data = None
+
+         if not text_cols and not image_cols:
+             raise ValueError(
+                 "At least one of text_cols or image_cols must be provided."
+             )
+
+         # Get the labels from the DataFrame and encode them
+         self.labels = df[label_col].values
+
+         # Use the provided encoder or fit a new one
+         if encoder is None:
+             self.encoder = LabelEncoder()
+             self.labels = self.encoder.fit_transform(self.labels)
+         else:
+             self.encoder = encoder
+             self.labels = self.encoder.transform(self.labels)
+
+         # One-hot encode labels for multi-class classification
+         num_classes = len(self.encoder.classes_)
+         self.labels = np.eye(num_classes)[self.labels]
+
+         self.batch_size = batch_size
+         self.shuffle = shuffle
+         self.on_epoch_end()
+
+     def __len__(self):
+         """
+         Returns the number of batches per epoch based on the dataset size and batch size
+         (the trailing partial batch is dropped).
+         """
+         return int(np.floor(len(self.labels) / self.batch_size))
+
+     def __getitem__(self, idx):
+         """
+         Retrieves a single batch of data (text and/or image) and the corresponding labels.
+
+         Args:
+             idx (int): Index of the batch to retrieve.
+
+         Returns:
+             tuple: The batch inputs as a dictionary with keys 'text' and/or 'image'
+             (depending on the provided columns) and the corresponding one-hot labels.
+         """
+         indices = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
+
+         label_batch = self.labels[indices]
+
+         if self.text_data is None:
+             return {"image": self.image_data[indices]}, label_batch
+         if self.image_data is None:
+             return {"text": self.text_data[indices]}, label_batch
+         return {
+             "text": self.text_data[indices],
+             "image": self.image_data[indices],
+         }, label_batch
+
+     def on_epoch_end(self):
+         """
+         Updates the index order after each epoch, shuffling the data if `shuffle` is True.
+         """
+         self.indices = np.arange(len(self.labels))
+         if self.shuffle:
+             np.random.shuffle(self.indices)
+
+
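
A minimal sketch of wiring a DataFrame into this loader; the column names follow the repo's `text_`/`image_` convention, but the shapes and class count are illustrative:

```python
# Hypothetical construction of a MultimodalDataset from precomputed embeddings.
import numpy as np
import pandas as pd

from src.classifiers_mlp import MultimodalDataset

rng = np.random.default_rng(0)
text_cols = [f"text_{i}" for i in range(1, 4)]
image_cols = [f"image_{i}" for i in range(3)]
df = pd.DataFrame(rng.normal(size=(128, 6)), columns=text_cols + image_cols)
df["class_id"] = rng.integers(0, 3, size=128)

loader = MultimodalDataset(df, text_cols, image_cols, "class_id", batch_size=32)
features, labels = loader[0]
print(features["text"].shape, features["image"].shape, labels.shape)  # (32, 3) (32, 3) (32, 3)
```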
+ # Early Fusion Model
+ def create_early_fusion_model(
+     text_input_size, image_input_size, output_size, hidden=[128], p=0.2
+ ):
+     """
+     Creates a multimodal early fusion model combining text and image inputs. The model concatenates
+     the text and image features, passes them through fully connected layers with dropout and batch
+     normalization, and produces a multi-class classification output.
+
+     Args:
+         text_input_size (int): Size of the input vector for the text data, or None for image-only models.
+         image_input_size (int): Size of the input vector for the image data, or None for text-only models.
+         output_size (int): Number of classes for the output layer (i.e., size of the softmax output).
+         hidden (int or list, optional): Number of hidden units in the dense layers.
+             An integer creates a single dense layer with that many units;
+             a list creates one dense layer per entry. Default is [128].
+         p (float, optional): Dropout rate applied after each dense layer. Default is 0.2.
+
+     Returns:
+         Model (keras.Model): An uncompiled Keras model with text and/or image inputs and a softmax
+         output for classification; compile it before training.
+
+     Model Architecture:
+         - The model accepts up to two inputs: one for text features and one for image features.
+         - The features are concatenated into a single vector.
+         - Dense layers with ReLU activation are applied, each followed by dropout and batch normalization.
+         - The output layer uses a softmax activation for multi-class classification.
+
+     Example:
+         model = create_early_fusion_model(text_input_size=300, image_input_size=2048, output_size=10, hidden=[128, 64], p=0.3)
+         model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
+     """
+
+     if text_input_size is None and image_input_size is None:
+         raise ValueError(
+             "At least one of text_input_size and image_input_size must be provided."
+         )
+
+     # Define inputs
+     if text_input_size is not None:
+         # Input layer for the text features
+         text_input = Input(shape=(text_input_size,), name="text")
+     if image_input_size is not None:
+         # Input layer for the image features
+         image_input = Input(shape=(image_input_size,), name="image")
+
+     # Merge or select inputs
+     if text_input_size is not None and image_input_size is not None:
+         # Concatenate text and image inputs if both are provided
+         x = Concatenate(name="fusion_layer")([text_input, image_input])
+     elif text_input_size is not None:
+         x = text_input
+     else:
+         x = image_input
+
+     # Hidden layers
+     if isinstance(hidden, int):
+         # A single dense layer with activation, dropout, and normalization
+         x = Dense(hidden, activation="relu")(x)
+         x = Dropout(p)(x)
+         x = BatchNormalization()(x)
+     elif isinstance(hidden, list):
+         for h in hidden:
+             # One dense layer per entry in `hidden`, each with activation, dropout, and normalization
+             x = Dense(h, activation="relu")(x)
+             x = Dropout(p)(x)
+             x = BatchNormalization()(x)
+
+     # Output layer with softmax activation
+     output = Dense(output_size, activation="softmax", name="output")(x)
+
+     # Create the model
+     if text_input_size is not None and image_input_size is not None:
+         # Model with both text and image inputs
+         model = Model(inputs=[text_input, image_input], outputs=output)
+     elif text_input_size is not None:
+         # Model with only the text input
+         model = Model(inputs=text_input, outputs=output)
+     else:
+         # Model with only the image input
+         model = Model(inputs=image_input, outputs=output)
+
+     return model
+
+
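
A quick shape check of the fusion graph; the input sizes below are illustrative:

```python
# Hypothetical smoke test: build and compile a small early-fusion model.
from src.classifiers_mlp import create_early_fusion_model

model = create_early_fusion_model(
    text_input_size=384, image_input_size=1024, output_size=3, hidden=[128, 64], p=0.3
)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()  # two inputs -> fusion_layer -> Dense(128) -> Dense(64) -> softmax(3)
```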
+ def test_model(y_test, y_pred, y_prob=None, encoder=None):
+     """
+     Evaluates a trained model's performance using accuracy, precision, recall, and F1-score,
+     plus visualizations: a confusion matrix and (optionally) per-class ROC curves.
+
+     Args:
+         y_test (np.ndarray): Ground-truth one-hot encoded labels for the test data.
+         y_pred (np.ndarray): Predicted class indices for the test data (after argmax).
+         y_prob (np.ndarray, optional): Predicted per-class probabilities from the model.
+             Required for ROC curves. Default is None.
+         encoder (LabelEncoder, optional): A fitted LabelEncoder used to inverse-transform the
+             one-hot encoded and predicted labels to their original categorical form.
+
+     Returns:
+         accuracy (float): Accuracy score of the model on the test data.
+         precision (float): Weighted precision score of the model on the test data.
+         recall (float): Weighted recall score of the model on the test data.
+         f1 (float): Weighted F1 score of the model on the test data.
+
+     This function performs the following steps:
+         - Inverse-transforms the one-hot encoded `y_test` and predicted `y_pred` values to their original labels.
+         - Computes the confusion matrix and plots it as a heatmap using Seaborn.
+         - If `y_prob` is provided, computes and plots the ROC curve for each class.
+         - Prints the classification report (precision, recall, F1-score, and support per class).
+         - Returns the overall accuracy and the weighted precision, recall, and F1-score.
+
+     Example:
+         accuracy, precision, recall, f1 = test_model(y_test, y_pred, y_prob, encoder)
+     """
+     # Handle label decoding
+     y_test_binarized = y_test
+     y_test = encoder.inverse_transform(np.argmax(y_test, axis=1))
+     y_pred = encoder.inverse_transform(y_pred)
+
+     cm = confusion_matrix(y_test, y_pred)
+     fig, ax = plt.subplots(figsize=(15, 15))
+     sns.heatmap(cm, annot=True, cmap="Blues", fmt="g", ax=ax)
+     plt.xlabel("Predicted")
+     plt.ylabel("True")
+     plt.title("Confusion Matrix")
+     plt.show()
+
+     if y_prob is not None:
+         fig, ax = plt.subplots(figsize=(15, 15))
+
+         colors = cycle(["aqua", "darkorange", "cornflowerblue"])
+
+         for i, color in zip(range(y_prob.shape[1]), colors):
+             fpr, tpr, _ = roc_curve(y_test_binarized[:, i], y_prob[:, i])
+             ax.plot(fpr, tpr, color=color, lw=2, label=f"Class {i}")
+
+         ax.plot([0, 1], [0, 1], "k--")
+         plt.title("ROC Curve")
+         plt.ylabel("True Positive Rate")
+         plt.xlabel("False Positive Rate")
+         plt.legend()
+         plt.show()
+
+     cr = classification_report(y_test, y_pred)
+     print(cr)
+
+     accuracy = accuracy_score(y_test, y_pred)
+     precision = precision_score(y_test, y_pred, average="weighted")
+     recall = recall_score(y_test, y_pred, average="weighted")
+     f1 = f1_score(y_test, y_pred, average="weighted")
+
+     return accuracy, precision, recall, f1
+
+
+ def train_mlp(
+     train_loader,
+     test_loader,
+     text_input_size,
+     image_input_size,
+     output_size,
+     num_epochs=50,
+     report=False,
+     lr=0.001,
+     set_weights=True,
+     adam=False,
+     p=0.0,
+     seed=1,
+     patience=40,
+     save_results=True,
+     train_model=True,
+     test_mlp_model=True,
+ ):
+     """
+     Trains a multimodal early fusion model using text and/or image data.
+
+     The function builds the fusion model, computes class weights if requested, applies an optimizer
+     (SGD or Adam), and uses early stopping to prevent overfitting. The model is then evaluated on
+     the test set and key performance metrics are computed.
+
+     Args:
+         train_loader (MultimodalDataset): Keras-compatible data loader for the training set.
+         test_loader (MultimodalDataset): Keras-compatible data loader for the test set.
+         text_input_size (int): Size of the input vector for the text data (None for image-only).
+         image_input_size (int): Size of the input vector for the image data (None for text-only).
+         output_size (int): Number of output classes for the softmax layer.
+         num_epochs (int, optional): Number of training epochs. Default is 50.
+         report (bool, optional): Whether to generate a detailed classification report and plots. Default is False.
+         lr (float, optional): Learning rate for the optimizer. Default is 0.001.
+         set_weights (bool, optional): Whether to compute and apply class weights for imbalanced datasets. Default is True.
+         adam (bool, optional): Whether to use the Adam optimizer instead of SGD. Default is False.
+         p (float, optional): Dropout rate for regularization in the model. Default is 0.0.
+         seed (int, optional): Seed for the random number generators, for reproducibility. Default is 1.
+         patience (int, optional): Epochs without validation-loss improvement before early stopping. Default is 40.
+         save_results (bool, optional): Whether to save test predictions to the `results/` folder. Default is True.
+         train_model (bool, optional): Whether to fit the model. Default is True.
+         test_mlp_model (bool, optional): Whether to evaluate the model on the test set. Default is True.
+
+     Returns:
+         tuple: (model, test_accuracy, f1, macro_auc). The metrics are None if `test_mlp_model` is False.
+
+     Side Effects:
+         - Trains the early fusion model, restoring the best weights based on validation loss.
+         - Plots training and validation accuracy over epochs.
+         - If `report` is True, calls `test_model` to print detailed evaluation metrics and plots.
+
+     Training Process:
+         - The function creates a fusion model combining text and image inputs.
+         - Class weights are computed to balance the dataset if `set_weights` is True.
+         - The model is trained with categorical cross-entropy loss and the chosen optimizer (Adam or SGD).
+         - Early stopping is applied based on validation loss to prevent overfitting.
+         - After training, the model is evaluated on the test set; accuracy, F1-score, and AUC are calculated.
+
+     Example:
+         train_mlp(train_loader, test_loader, text_input_size=300, image_input_size=2048, output_size=10, num_epochs=30, lr=0.001, adam=True, report=True)
+
+     Notes:
+         - `train_loader` and `test_loader` should be instances of `MultimodalDataset` or compatible Keras data loaders.
+         - If the dataset is imbalanced, `set_weights=True` is recommended for better performance on minority classes.
+     """
+
+     if seed is not None:
+         np.random.seed(seed)
+         tf.random.set_seed(seed)
+
+     # Create an early fusion model with the provided input sizes, output size, and dropout rate
+     model = create_early_fusion_model(text_input_size, image_input_size, output_size, p=p)
+
+     # Compute class weights for imbalanced datasets
+     class_weights = None
+     if set_weights:
+         class_indices = np.argmax(train_loader.labels, axis=1)
+         # Compute balanced class weights from the training labels
+         weights = compute_class_weight(
+             class_weight="balanced",
+             classes=np.unique(class_indices),
+             y=class_indices,
+         )
+         class_weights = {i: w for i, w in enumerate(weights)}
+
+     # Loss function for multi-class classification
+     loss = CategoricalCrossentropy()
+
+     # Choose the optimizer
+     if adam:
+         # Adam optimizer with the specified learning rate
+         optimizer = Adam(learning_rate=lr)
+     else:
+         # SGD optimizer with the specified learning rate
+         optimizer = SGD(learning_rate=lr)
+
+     # Compile the model with the chosen optimizer and loss function
+     model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
+
+     # Early stopping callback with the specified patience
+     early_stopping = EarlyStopping(
+         monitor="val_loss",
+         patience=patience,
+         restore_best_weights=True,
+     )
+
+     # Train the model on the training data, validating on the test data
+     history = None
+     if train_model:
+         history = model.fit(
+             train_loader,
+             validation_data=test_loader,
+             epochs=num_epochs,
+             class_weight=class_weights,
+             callbacks=[early_stopping],
+             verbose=1,
+         )
+
+     if test_mlp_model:
+         # Evaluate the model on the test set
+         y_true, y_pred, y_prob = [], [], []
+         for batch in test_loader:
+             features, labels = batch
+             if len(features) == 1:
+                 # Single-modality batch: pick whichever input is present
+                 inputs = features["text"] if "text" in features else features["image"]
+                 preds = model.predict(inputs)
+             else:
+                 text, image = features["text"], features["image"]
+                 preds = model.predict([text, image])
+             y_true.extend(labels)
+             y_pred.extend(np.argmax(preds, axis=1))
+             y_prob.extend(preds)
+
+         y_true, y_pred, y_prob = np.array(y_true), np.array(y_pred), np.array(y_prob)
+
+         test_accuracy = accuracy_score(np.argmax(y_true, axis=1), y_pred)
+         f1 = f1_score(np.argmax(y_true, axis=1), y_pred, average="macro")
+
+         # Macro-averaged one-vs-rest AUC
+         macro_auc = roc_auc_score(y_true, y_prob, average="macro", multi_class="ovr")
+
+         # Plot training history only when the model was actually trained
+         if history is not None:
+             plt.plot(history.history["accuracy"], label="Train Accuracy")
+             plt.plot(history.history["val_accuracy"], label="Validation Accuracy")
+             plt.xlabel("Epoch")
+             plt.ylabel("Accuracy")
+             plt.legend()
+             plt.show()
+
+         if report:
+             test_model(y_true, y_pred, y_prob, encoder=train_loader.encoder)
+
+         # Store results in a DataFrame and save them in the results folder
+         if text_input_size is not None and image_input_size is not None:
+             model_type = "multimodal"
+         elif text_input_size is not None:
+             model_type = "text"
+         else:
+             model_type = "image"
+
+         if save_results:
+             results = pd.DataFrame(
+                 {"Predictions": y_pred, "True Labels": np.argmax(y_true, axis=1)}
+             )
+             # Create the results folder if it does not exist
+             os.makedirs("results", exist_ok=True)
+             results.to_csv(f"results/{model_type}_results.csv", index=False)
+     else:
+         test_accuracy, f1, macro_auc = None, None, None
+
+     return model, test_accuracy, f1, macro_auc
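
End to end, the loaders plus `train_mlp` can be exercised on the same synthetic frame as the loader sketch above; shapes and epoch counts are illustrative:

```python
# Hypothetical training run on synthetic embeddings.
import numpy as np
import pandas as pd

from src.classifiers_mlp import MultimodalDataset, train_mlp

rng = np.random.default_rng(0)
text_cols = [f"text_{i}" for i in range(1, 4)]
image_cols = [f"image_{i}" for i in range(3)]
df = pd.DataFrame(rng.normal(size=(128, 6)), columns=text_cols + image_cols)
df["class_id"] = rng.integers(0, 3, size=128)

train_loader = MultimodalDataset(df, text_cols, image_cols, "class_id", batch_size=32)
test_loader = MultimodalDataset(
    df, text_cols, image_cols, "class_id",
    encoder=train_loader.encoder, batch_size=32, shuffle=False,
)

model, acc, f1, macro_auc = train_mlp(
    train_loader, test_loader,
    text_input_size=len(text_cols), image_input_size=len(image_cols),
    output_size=3, num_epochs=3, adam=True, save_results=False,
)
print(acc, f1, macro_auc)
```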
src/nlp_models.py ADDED
@@ -0,0 +1,242 @@
+ import json
+ import os
+
+ import numpy as np
+ import pandas as pd
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+
+
+ class HuggingFaceEmbeddings:
+     """
+     A class to generate text embeddings with a Hugging Face pre-trained transformer model.
+     It loads the model, tokenizes the input text, generates embeddings, and can save the
+     embeddings to a CSV file.
+
+     Args:
+         model_name (str, optional): Name of the Hugging Face pre-trained model to use for generating
+             embeddings. Default is 'sentence-transformers/all-MiniLM-L6-v2'.
+         path (str, optional): Path to the CSV file containing the text data. Default is 'data/file.csv'.
+         save_path (str, optional): Directory where the embeddings will be saved. Defaults to 'Models' when None.
+         device (str, optional): Device to run the model on ('cpu' or 'cuda'). If None, a GPU is used
+             automatically when available; otherwise the CPU.
+
+     Attributes:
+         model_name (str): Name of the Hugging Face model used for embedding generation.
+         tokenizer (transformers.AutoTokenizer): Tokenizer corresponding to the chosen model.
+         model (transformers.AutoModel): Pre-trained model loaded for embedding generation.
+         path (str): Path to the input CSV file.
+         save_path (str): Directory where the embeddings CSV will be saved.
+         device (torch.device): Device on which the model and data are processed (CPU or GPU).
+
+     Methods:
+         get_embedding(text):
+             Generates an embedding for a given text input using the pre-trained model.
+
+         get_embedding_df(column, directory, file):
+             Reads a CSV file, computes embeddings for a specified text column, and saves the resulting
+             DataFrame with embeddings to a new CSV file in the specified directory.
+
+     Example:
+         embedding_instance = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
+                                                    path='data/products.csv', save_path='output')
+         text_embedding = embedding_instance.get_embedding("Sample product description.")
+         embedding_instance.get_embedding_df(column='description', directory='output', file='product_embeddings.csv')
+
+     Notes:
+         - The Hugging Face model and tokenizer are downloaded from the Hugging Face hub.
+         - The class supports large models and can run on either GPU or CPU, depending on availability.
+         - Input text is truncated and padded to a maximum length of 512 tokens to fit the model.
+     """
+
+     def __init__(
+         self,
+         model_name="sentence-transformers/all-MiniLM-L6-v2",
+         path="data/file.csv",
+         save_path=None,
+         device=None,
+     ):
+         """
+         Initializes the HuggingFaceEmbeddings class with the specified model and paths.
+
+         Args:
+             model_name (str, optional): Name of the Hugging Face pre-trained model. Default is 'sentence-transformers/all-MiniLM-L6-v2'.
+             path (str, optional): Path to the CSV file containing text data. Default is 'data/file.csv'.
+             save_path (str, optional): Directory where the embeddings will be saved. Defaults to 'Models' when None.
+             device (str, optional): Device for model processing. Defaults to 'cuda' if available, otherwise 'cpu'.
+         """
+         self.model_name = model_name
+         # Load the Hugging Face tokenizer for the pre-trained model
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+         # Load the model from the Hugging Face model hub
+         self.model = AutoModel.from_pretrained(model_name)
+         self.path = path
+         self.save_path = save_path or "Models"
+
+         # Define the device
+         if device is None:
+             # Note: On a Mac you may want to change 'cuda' to 'mps' to use the GPU
+             self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         else:
+             self.device = torch.device(device)
+         print(f"Using device: {self.device}")
+
+         # Move the model to the selected device
+         self.model.to(self.device)
+         print(f"Model moved to device: {self.device}")
+         print(f"Model: {model_name}")
+
+     def get_embedding(self, text):
+         """
+         Generates an embedding for the given text using the Hugging Face model.
+
+         Args:
+             text (str): The input text to embed.
+
+         Returns:
+             np.ndarray: The embedding vector for the input text (mean-pooled over tokens).
+         """
+         # Tokenize the input text with the Hugging Face tokenizer
+         inputs = self.tokenizer(
+             text, return_tensors="pt", truncation=True, padding=True, max_length=512
+         )
+
+         # Move the inputs to the device
+         inputs = {key: value.to(self.device) for key, value in inputs.items()}
+
+         with torch.no_grad():
+             # Forward pass through the model on the tokenized input
+             outputs = self.model(**inputs)
+
+         # Mean-pool the last hidden state over the token dimension,
+         # move the result to the CPU, and return it as a NumPy array
+         last_hidden_state = outputs.last_hidden_state
+
+         embeddings = last_hidden_state.mean(dim=1)
+         embeddings = embeddings.cpu().numpy()
+
+         return embeddings[0]
+
+     def get_embedding_df(self, column, directory, file):
+         """
+         Reads the CSV at `self.path`, computes embeddings for `column`, and saves the result
+         (with a new 'embeddings' column) to `directory/file`.
+         """
+         # Load the CSV file
+         df = pd.read_csv(self.path)
+         # Generate embeddings for the specified column using `get_embedding`
+         df["embeddings"] = df[column].apply(
+             lambda x: self.get_embedding(str(x)).tolist() if pd.notnull(x) else None
+         )
+
+         os.makedirs(directory, exist_ok=True)
+
+         # Save the DataFrame with the embeddings to a new CSV file in the specified directory
+         output_path = os.path.join(directory, file)
+         df.to_csv(output_path, index=False)
+
+         print(f"✅ Embeddings saved to {output_path}")
+
+
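
A minimal usage sketch; the model name is the class default, and the input string is illustrative:

```python
# Hypothetical usage of HuggingFaceEmbeddings for a single string.
from src.nlp_models import HuggingFaceEmbeddings

embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vec = embedder.get_embedding("Cordless drill with two batteries")
print(vec.shape)  # (384,) for all-MiniLM-L6-v2
```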
+ class GPT:
+     """
+     A class to interact with the OpenAI API for generating text embeddings from a given dataset.
+     It provides methods to retrieve embeddings for text data and save them to a CSV file.
+
+     Args:
+         path (str, optional): Path to the CSV file containing the text data. Default is 'data/file.csv'.
+         embedding_model (str, optional): The embedding model to use for generating text embeddings.
+             Default is 'text-embedding-3-small'.
+
+     Attributes:
+         path (str): Path to the CSV file.
+         embedding_model (str): The embedding model used for generating text embeddings.
+
+     Methods:
+         get_embedding(text):
+             Generates and returns the embedding vector for the given text using the OpenAI API.
+
+         get_embedding_df(column, directory, file):
+             Reads a CSV file, computes the embeddings for a specified text column, and saves the
+             embeddings to a new CSV file in the specified directory.
+
+     Example:
+         gpt_instance = GPT(path='data/products.csv', embedding_model='text-embedding-ada-002')
+         text_embedding = gpt_instance.get_embedding("Sample product description.")
+         gpt_instance.get_embedding_df(column='description', directory='output', file='product_embeddings.csv')
+
+     Notes:
+         - The OpenAI API key must be stored in a `.env` file under the variable name `OPENAI_API_KEY`.
+         - The OpenAI Python package must be installed (`pip install openai`), and an active API key is required.
+     """
+
+     def __init__(self, path="data/file.csv", embedding_model="text-embedding-3-small"):
+         """
+         Initializes the GPT class with the provided CSV file path and embedding model.
+
+         Args:
+             path (str, optional): Path to the CSV file containing the text data. Default is 'data/file.csv'.
+             embedding_model (str, optional): The embedding model to use for generating text embeddings.
+                 Default is 'text-embedding-3-small'.
+         """
+         import openai
+         from dotenv import find_dotenv, load_dotenv
+
+         # Load the OpenAI API key from the local .env file
+         _ = load_dotenv(find_dotenv())
+         # Set the OpenAI API key
+         openai.api_key = os.getenv("OPENAI_API_KEY")
+
+         self.path = path
+         self.embedding_model = embedding_model
+
+     def get_embedding(self, text):
+         """
+         Generates and returns the embedding vector for the given text using the OpenAI API.
+
+         Args:
+             text (str): The input text to generate the embedding for.
+
+         Returns:
+             np.ndarray: The embedding vector for the input text as a float32 array.
+         """
+         from openai import OpenAI
+
+         # Instantiate the OpenAI client
+         client = OpenAI()
+
+         # Optional text preprocessing (e.g., removing newlines)
+         text = text.replace("\n", " ").strip()
+
+         # Call the OpenAI API and keep only the embedding data
+         response = client.embeddings.create(model=self.embedding_model, input=text)
+
+         embeddings_np = np.array(response.data[0].embedding, dtype=np.float32)
+         return embeddings_np
+
+     def get_embedding_df(self, column, directory, file):
+         """
+         Reads a CSV file, computes embeddings for a specified text column, and saves the results in a new CSV file.
+
+         Args:
+             column (str): Name of the column in the CSV file that contains the text data.
+             directory (str): Directory where the output CSV file will be saved.
+             file (str): Name of the output CSV file.
+
+         Side Effects:
+             - Saves a new CSV file containing the original data along with the computed embeddings.
+         """
+         # Load the CSV file
+         df = pd.read_csv(self.path)
+
+         if column not in df.columns:
+             raise ValueError(f"Column '{column}' not found in CSV")
+
+         # Generate embeddings into a new 'embeddings' column using `get_embedding`
+         df["embeddings"] = df[column].apply(
+             lambda x: json.dumps(self.get_embedding(str(x)).tolist())
+         )
+
+         os.makedirs(directory, exist_ok=True)
+
+         # Save the DataFrame with the embeddings to a new CSV file in the specified directory
+         output_path = os.path.join(directory, file)
+         df.to_csv(output_path, index=False)
+
+         print(f"✅ Embeddings saved to {output_path}")
src/utils.py ADDED
@@ -0,0 +1,227 @@
+ import ast
+ import os
+ import warnings
+ from io import BytesIO
+
+ import numpy as np
+ import pandas as pd
+ import requests
+ from PIL import Image
+ from sklearn.model_selection import train_test_split
+
+ # 💬 NOTE: Suppress all warnings
+ warnings.filterwarnings("ignore")
+
+
+ def process_embeddings(df, col_name):
+     """
+     Process embeddings stored as strings in a DataFrame column.
+
+     Args:
+     - df (pd.DataFrame): The DataFrame containing the embeddings column.
+     - col_name (str): The name of the column containing the embeddings.
+
+     Returns:
+         pd.DataFrame: The DataFrame with the embeddings expanded into one column per element.
+
+     Steps:
+     1. Parse the string values in the specified column into lists.
+     2. Expand the lists into new columns, one per element.
+     3. Remove the original embeddings column.
+
+     Example:
+         df_processed = process_embeddings(df, 'embeddings')
+     """
+     # Parse the string values (e.g. "[-0.123, 0.456, ...]") into lists.
+     # ast.literal_eval is a safe replacement for eval() on list literals.
+     df[col_name] = df[col_name].apply(ast.literal_eval)
+
+     # Expand the lists into new columns, e.g.:
+     #    text_1 text_2 text_3
+     # 0  -0.123  0.456  0.789
+     # 1   0.321 -0.654  0.987
+     embeddings_df = pd.DataFrame(
+         df[col_name].to_list(),
+         columns=[f"text_{i + 1}" for i in range(df[col_name].str.len().max())],
+     )
+     df = pd.concat([df, embeddings_df], axis=1)
+
+     # Remove the original embeddings column
+     df = df.drop(columns=[col_name])
+
+     return df
+
+
+ def rename_image_embeddings(df):
+     """
+     Rename the numeric columns of an image-embedding DataFrame.
+
+     Args:
+     - df (pd.DataFrame): The DataFrame whose columns will be renamed.
+
+     Returns:
+         pd.DataFrame: The DataFrame with renamed columns.
+
+     Example:
+         df_renamed = rename_image_embeddings(df)
+     """
+     # From: 0 1 2 label ➡️ image_0 image_1 image_2 label
+     df.columns = [f"image_{int(col)}" if col.isdigit() else col for col in df.columns]
+
+     return df
+
+
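
A tiny worked example of the two helpers above; the column names follow the repo's `text_`/`image_` convention, and the values are illustrative:

```python
# Hypothetical demonstration of process_embeddings and rename_image_embeddings.
import pandas as pd

from src.utils import process_embeddings, rename_image_embeddings

text_df = pd.DataFrame({"id": [1, 2], "embeddings": ["[-0.1, 0.2]", "[0.3, -0.4]"]})
print(process_embeddings(text_df, "embeddings"))
#    id  text_1  text_2
# 0   1    -0.1     0.2
# 1   2     0.3    -0.4

image_df = pd.DataFrame({"0": [0.5], "1": [0.6], "label": ["a"]})
print(rename_image_embeddings(image_df).columns.tolist())  # ['image_0', 'image_1', 'label']
```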
+ def preprocess_data(
+     text_data,
+     image_data,
+     text_id="image_id",
+     image_id="ImageName",
+     embeddings_col="embeddings",
+ ):
+     """
+     Preprocess and merge the text and image dataframes.
+
+     Args:
+     - text_data (pd.DataFrame): DataFrame containing text data.
+     - image_data (pd.DataFrame): DataFrame containing image data.
+     - text_id (str): Column name of the text data identifier.
+     - image_id (str): Column name of the image data identifier.
+     - embeddings_col (str): Column name of the embeddings data.
+
+     Returns:
+         pd.DataFrame: Merged and preprocessed DataFrame.
+
+     This function:
+     - Processes the text and image embeddings.
+     - Drops rows with missing identifiers.
+     - Merges the dataframes on their identifiers.
+     - Drops the identifier columns after the merge.
+
+     Example:
+         merged_df = preprocess_data(text_df, image_df)
+     """
+     # Use the helpers above to tune the text and image dataframes
+     text_data = process_embeddings(text_data, embeddings_col)
+     image_data = rename_image_embeddings(image_data)
+
+     # Drop rows where the ID used to join text ↔ image is missing
+     image_data = image_data.dropna(subset=[image_id])
+     text_data = text_data.dropna(subset=[text_id])
+
+     # Clean up text IDs: if the column contains file paths (like "data/images/123.jpg"),
+     # keep just the file name ("123.jpg")
+     text_data[text_id] = text_data[text_id].apply(lambda x: x.split("/")[-1])
+
+     # Join the text and image embeddings on their IDs (text_id vs image_id)
+     df = pd.merge(text_data, image_data, left_on=text_id, right_on=image_id)
+
+     # Drop the original ID columns; they are no longer needed after the merge
+     df.drop([image_id, text_id], axis=1, inplace=True)
+
+     return df
+
+
+ class ImageDownloader:
+     """
+     Image downloader class to download images from URLs.
+
+     Args:
+     - image_dir (str): Directory to save images.
+     - image_size (tuple): Size to which downloaded images are resized.
+     - overwrite (bool): Whether to overwrite existing images.
+
+     Methods:
+     - download_images(df, print_every=1000): Download images from URLs in a DataFrame.
+         Args:
+         - df (pd.DataFrame): DataFrame containing image URLs.
+         - print_every (int): Print progress every n images.
+         Returns:
+             pd.DataFrame: DataFrame with image paths added.
+
+     Example:
+         downloader = ImageDownloader()
+         df = downloader.download_images(df)
+     """
+
+     def __init__(
+         self, image_dir="data/images/", image_size=(224, 224), overwrite=False
+     ):
+         self.image_dir = image_dir
+         self.image_size = image_size
+         self.overwrite = overwrite
+
+         # Create the directory if it doesn't exist
+         if not os.path.exists(self.image_dir):
+             os.makedirs(self.image_dir)
+
+     def download_images(self, df, print_every=1000):
+         """
+         Bulk-download the images referenced in a DataFrame of URLs, resize them to a
+         standard size, and add their local paths back to the DataFrame.
+         """
+         image_paths = []
+
+         i = 0
+         for index, row in df.iterrows():
+             if i % print_every == 0:
+                 print(f"Downloading image {i}/{len(df)}")
+             i += 1
+
+             sku = row["sku"]
+             image_url = row["image"]
+             image_path = os.path.join(self.image_dir, f"{sku}.jpg")
+
+             if os.path.exists(image_path) and not self.overwrite:
+                 print(f"Image {sku} is already in the path.")
+                 image_paths.append(image_path)
+                 continue
+
+             try:
+                 # Time out rather than hang indefinitely on unresponsive hosts
+                 response = requests.get(image_url, timeout=30)
+                 response.raise_for_status()
+                 img = Image.open(BytesIO(response.content))
+                 img = img.resize(self.image_size, Image.Resampling.LANCZOS)
+                 img.save(image_path)
+                 # print(f"Downloaded image for SKU: {sku}")
+                 image_paths.append(image_path)
+             except Exception as e:
+                 print(f"Could not download image for SKU: {sku}. Error: {e}")
+                 image_paths.append(np.nan)
+
+         df["image_path"] = image_paths
+         return df
+
+
+ def train_test_split_and_feature_extraction(df, test_size=0.3, random_state=42):
+     """
+     Split the data into train and test sets and extract feature and label columns.
+
+     Args:
+     - df (pd.DataFrame): DataFrame containing the data.
+
+     Keyword Args:
+     - test_size (float): Size of the test set.
+     - random_state (int): Random state for reproducibility.
+
+     Returns:
+         pd.DataFrame: Train DataFrame.
+         pd.DataFrame: Test DataFrame.
+         list: Columns with text embeddings.
+         list: Columns with image embeddings.
+         list: Columns with class labels.
+
+     Example:
+         train_df, test_df, text_columns, image_columns, label_columns = train_test_split_and_feature_extraction(df)
+     """
+
+     # Split the data into train and test sets using the test_size and random_state parameters
+     train_df, test_df = train_test_split(
+         df, test_size=test_size, random_state=random_state
+     )
+
+     # Names of the columns with text embeddings, as a list (even if there is only one column)
+     text_columns = [col for col in df.columns if col.startswith("text_")]
+
+     # Names of the columns with image embeddings, as a list (even if there is only one column)
+     image_columns = [col for col in df.columns if col.startswith("image_")]
+
+     # Name of the column with the class labels, as a list (even if there is only one column)
+     label_columns = ["class_id"]
+
+     return train_df, test_df, text_columns, image_columns, label_columns
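
Putting the helpers together; the file paths are illustrative, while the identifier columns and the `class_id` label follow the defaults in this module:

```python
# Hypothetical pipeline: merge modalities, then split and select feature columns.
import pandas as pd

from src.utils import preprocess_data, train_test_split_and_feature_extraction

text_df = pd.read_csv("data/text_embeddings.csv")    # assumed to have image_id + embeddings
image_df = pd.read_csv("data/image_embeddings.csv")  # assumed to have ImageName + numeric columns

df = preprocess_data(text_df, image_df)
train_df, test_df, text_cols, image_cols, label_cols = train_test_split_and_feature_extraction(df)
print(len(train_df), len(test_df), len(text_cols), len(image_cols), label_cols)
```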
src/vision_embeddings_tf.py ADDED
@@ -0,0 +1,470 @@
+ import os
+ import warnings
+
+ import numpy as np
+ import pandas as pd
+ import tensorflow as tf
+ from PIL import Image
+ from tensorflow.keras.applications import (
+     DenseNet121,
+     DenseNet169,
+     InceptionV3,
+     ResNet50,
+     ResNet101,
+ )
+ from tensorflow.keras.layers import GlobalAveragePooling2D, Input
+ from tensorflow.keras.models import Model
+ from transformers import TFConvNextV2Model, TFSwinModel, TFViTModel
+
+ # 💬 NOTE: Suppress TensorFlow warnings
+ warnings.filterwarnings("ignore")
+ tf.get_logger().setLevel("ERROR")
+
+
+ def load_and_preprocess_image(image_path, target_size=(224, 224)):
+     """
+     Load and preprocess an image.
+
+     Args:
+     - image_path (str): Path to the image file.
+     - target_size (tuple): Desired image size.
+
+     Returns:
+     - np.array: Preprocessed image.
+     """
+     # Open the image using PIL Image.open and convert it to RGB format
+     img = Image.open(image_path).convert("RGB")
+
+     # Resize the image to the target size
+     img = img.resize(target_size)
+
+     # Convert the image to a numpy array and scale the pixel values to [0, 1]
+     img = np.array(img, dtype=np.float32) / 255.0
+
+     return img
+
+
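As a quick sanity check of the preprocessing contract (the image path below is illustrative, not a file in this repo): with the default target size, the function returns a float32 array of shape (224, 224, 3) scaled to [0, 1], and a batch is simply a stack of such arrays:

    import numpy as np

    # Hypothetical path; any RGB-readable image works
    img = load_and_preprocess_image("data/images/example.jpg")
    assert img.shape == (224, 224, 3) and img.dtype == np.float32
    assert 0.0 <= img.min() and img.max() <= 1.0

    # A batch ready for the models below: shape (2, 224, 224, 3)
    batch = np.stack([img, img])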
+ class FoundationalCVModel:
+     """
+     A Keras module for loading and using foundational computer vision models.
+
+     This class allows you to load and use various foundational computer vision models for tasks like image classification
+     or feature extraction. The user can choose between evaluation mode (non-trainable model) and fine-tuning mode (trainable model).
+
+     Attributes:
+     ----------
+     backbone_name : str
+         The name of the foundational CV model to load (e.g., 'resnet50', 'vit_base').
+     model : keras.Model
+         The compiled Keras model with the selected backbone.
+
+     Parameters:
+     ----------
+     backbone : str
+         The name of the foundational CV model to load. The available backbones include:
+         - ResNet variants: 'resnet50', 'resnet101'
+         - DenseNet variants: 'densenet121', 'densenet169'
+         - InceptionV3: 'inception_v3'
+         - ConvNextV2 variants: 'convnextv2_tiny', 'convnextv2_base', 'convnextv2_large'
+         - Swin Transformer variants: 'swin_tiny', 'swin_small', 'swin_base'
+         - Vision Transformer (ViT) variants: 'vit_base', 'vit_large'
+
+     mode : str, optional
+         The mode of the model, either 'eval' for evaluation or 'fine_tune' for fine-tuning. Default is 'eval'.
+
+     input_shape : tuple, optional
+         The expected input shape as (height, width, channels). Default is (224, 224, 3).
+
+     Methods:
+     -------
+     __init__(self, backbone, mode='eval', input_shape=(224, 224, 3)):
+         Initializes the model with the specified backbone and mode.
+
+     predict(self, images):
+         Given a batch of images, performs a forward pass through the model and returns predictions.
+         Parameters:
+         ----------
+         images : numpy.ndarray
+             A batch of images to perform prediction on, with shape (batch_size, 224, 224, 3).
+
+         Returns:
+         -------
+         numpy.ndarray
+             Model predictions or extracted features for the provided images.
+     """
+
+     def __init__(self, backbone, mode="eval", input_shape=(224, 224, 3)):
+         self.backbone_name = backbone
+
+         # Select the backbone from the possible foundational models
+         input_layer = Input(shape=input_shape)
+
+         if backbone == "resnet50":
+             # Load the ResNet50 model from tensorflow.keras.applications
+             self.base_model = ResNet50(
+                 include_top=False, weights="imagenet", input_tensor=input_layer
+             )
+         elif backbone == "resnet101":
+             # Load the ResNet101 model from tensorflow.keras.applications
+             self.base_model = ResNet101(
+                 include_top=False, weights="imagenet", input_tensor=input_layer
+             )
+         elif backbone == "densenet121":
+             # Load the DenseNet121 model from tensorflow.keras.applications
+             self.base_model = DenseNet121(
+                 include_top=False, weights="imagenet", input_tensor=input_layer
+             )
+         elif backbone == "densenet169":
+             # Load the DenseNet169 model from tensorflow.keras.applications
+             self.base_model = DenseNet169(
+                 include_top=False, weights="imagenet", input_tensor=input_layer
+             )
+         elif backbone == "inception_v3":
+             # Load the InceptionV3 model from tensorflow.keras.applications
+             self.base_model = InceptionV3(
+                 include_top=False, weights="imagenet", input_tensor=input_layer
+             )
+         elif backbone == "convnextv2_tiny":
+             # Load the ConvNeXtV2 Tiny model from transformers
+             self.base_model = TFConvNextV2Model.from_pretrained(
+                 "facebook/convnextv2-tiny-22k-224"
+             )
+         elif backbone == "convnextv2_base":
+             # Load the ConvNeXtV2 Base model from transformers
+             self.base_model = TFConvNextV2Model.from_pretrained(
+                 "facebook/convnextv2-base-22k-224"
+             )
+         elif backbone == "convnextv2_large":
+             # Load the ConvNeXtV2 Large model from transformers
+             self.base_model = TFConvNextV2Model.from_pretrained(
+                 "facebook/convnextv2-large-22k-224"
+             )
+         elif backbone == "swin_tiny":
+             # Load the Swin Transformer Tiny model from transformers
+             self.base_model = TFSwinModel.from_pretrained(
+                 "microsoft/swin-tiny-patch4-window7-224"
+             )
+         elif backbone == "swin_small":
+             # Load the Swin Transformer Small model from transformers
+             self.base_model = TFSwinModel.from_pretrained(
+                 "microsoft/swin-small-patch4-window7-224"
+             )
+         elif backbone == "swin_base":
+             # Load the Swin Transformer Base model from transformers
+             self.base_model = TFSwinModel.from_pretrained(
+                 "microsoft/swin-base-patch4-window7-224"
+             )
+         elif backbone in ["vit_base", "vit_large"]:
+             # Load the Vision Transformer (ViT) model from transformers
+             backbone_path = {
+                 "vit_base": "google/vit-base-patch16-224",
+                 "vit_large": "google/vit-large-patch16-224",
+             }
+             self.base_model = TFViTModel.from_pretrained(backbone_path[backbone])
+         else:
+             raise ValueError(f"Unsupported backbone model: {backbone}")
+
+         if mode == "eval":
+             # Set the model to evaluation mode (non-trainable)
+             self.base_model.trainable = False
+         elif mode == "fine_tune":
+             # Keep the backbone weights trainable for fine-tuning
+             self.base_model.trainable = True
+
+         # 💬 NOTE: Take the model's input requirements into account. Models from transformers expect
+         # channels-first input, while models from keras.applications expect channels-last input.
+         # The outputs also differ, so in both cases we need to obtain a pooled output vector.
+
+         # If it is a model from transformers:
+         if backbone in [
+             "vit_base",
+             "vit_large",
+             "convnextv2_tiny",
+             "convnextv2_base",
+             "convnextv2_large",
+             "swin_tiny",
+             "swin_small",
+             "swin_base",
+         ]:
+             # Adjust the input for channels-first models within the graph: (B, H, W, C) -> (B, C, H, W)
+             input_layer_transposed = tf.transpose(input_layer, perm=[0, 3, 1, 2])
+             hf_outputs = self.base_model(input_layer_transposed)
+
+             # Get the pooled output of the model ("pooler_output")
+             outputs = hf_outputs.pooler_output  # shape (batch_size, hidden_size)
+         # If it is a model from keras.applications:
+         else:
+             # The pooling layer is not included in the backbone, so apply GlobalAveragePooling2D
+             # to turn the (batch_size, H', W', C') feature map into a (batch_size, C') vector
+             x = self.base_model.output
+             outputs = GlobalAveragePooling2D()(x)
+
+         # Create the final model with the input layer and the pooled output
+         self.model = Model(inputs=input_layer, outputs=outputs)
+
+     def get_output_shape(self):
+         """
+         Get the output shape of the model.
+
+         Returns:
+         -------
+         tuple
+             The shape of the model's output tensor.
+         """
+         return self.model.output_shape
+
+     def predict(self, images):
+         """
+         Predict on a batch of images.
+
+         Parameters:
+         ----------
+         images : numpy.ndarray
+             A batch of images of shape (batch_size, 224, 224, 3).
+
+         Returns:
+         -------
+         numpy.ndarray
+             Predictions or features from the model for the given images.
+         """
+         # Convert the batch to a float32 tensor before the forward pass
+         images = tf.convert_to_tensor(images, dtype=tf.float32)
+
+         # Forward pass (no training)
+         predictions = self.model(images, training=False)
+
+         # Convert back to numpy for usability
+         return predictions.numpy()
+
+
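A hedged usage sketch of the wrapper on random inputs; the expected (4, 2048) output shape assumes the ResNet50 backbone, whose global-average-pooled feature vector has 2048 dimensions (other backbones yield other widths, which `get_output_shape` reports):

    import numpy as np

    # Instantiate a frozen feature extractor and embed a random batch
    model = FoundationalCVModel(backbone="resnet50", mode="eval")
    dummy_batch = np.random.rand(4, 224, 224, 3).astype(np.float32)
    features = model.predict(dummy_batch)
    print(features.shape)  # (4, 2048) for resnet50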
+ class ImageFolderDataset:
+     """
+     A custom dataset class for loading and preprocessing images from a folder.
+
+     This class helps in loading images from a given folder, automatically filtering valid image files and
+     preprocessing them to a specified shape. It also handles any unreadable or corrupted images by excluding them.
+
+     Attributes:
+     ----------
+     folder_path : str
+         The path to the folder containing the images.
+     shape : tuple
+         The desired shape (width, height) to which the images will be resized.
+     image_files : list
+         A list of valid image file names that can be processed.
+
+     Parameters:
+     ----------
+     folder_path : str
+         The path to the folder containing image files.
+     shape : tuple, optional
+         The target shape to resize the images to. The default value is (224, 224).
+     image_files : list, optional
+         A pre-provided list of image file names. If not provided, it will automatically detect valid image files
+         (with extensions '.jpg', '.jpeg', '.png', '.gif') in the specified folder.
+
+     Methods:
+     -------
+     clean_unidentified_images():
+         Cleans the dataset by removing images that cause an `UnidentifiedImageError` during loading. This helps ensure
+         that only valid, readable images are kept in the dataset.
+
+     __len__():
+         Returns the number of valid images in the dataset after cleaning.
+
+     __getitem__(idx):
+         Given an index `idx`, retrieves the image file at that index, loads and preprocesses it, and returns the image
+         along with its filename.
+     """
+
+     def __init__(self, folder_path, shape=(224, 224), image_files=None):
+         """
+         Initializes the dataset object by setting the folder path and target image shape.
+         It optionally accepts a list of image files to be processed; otherwise it detects valid images in the folder.
+
+         Parameters:
+         ----------
+         folder_path : str
+             The directory containing the images.
+         shape : tuple, optional
+             The target shape to resize the images to. Default is (224, 224).
+         image_files : list, optional
+             A list of image files to load. If not provided, it will auto-detect valid images from the folder.
+         """
+         self.folder_path = folder_path
+         self.shape = shape
+
+         # If image files are provided, use them; otherwise, detect image files in the folder
+         if image_files:
+             self.image_files = image_files
+         else:
+             # List all files in the folder and keep only image files
+             self.image_files = [
+                 f
+                 for f in os.listdir(folder_path)
+                 if f.lower().endswith(("jpg", "jpeg", "png", "gif"))
+             ]
+
+         # Clean the dataset by removing images that cause errors during loading
+         self.clean_unidentified_images()
+
+     def clean_unidentified_images(self):
+         """
+         Clean the dataset by removing images that cannot be opened due to errors (e.g., `UnidentifiedImageError`).
+
+         This method iterates over the list of detected image files and attempts to open and convert each image to RGB.
+         If an image cannot be opened (e.g., due to corruption or an unsupported format), it is excluded from the dataset.
+
+         Any image that causes an error is skipped, and a message is printed to indicate which file was skipped.
+         """
+         cleaned_files = []
+         # Iterate over the image files and check whether they can be opened
+         for img_name in self.image_files:
+             img_path = os.path.join(self.folder_path, img_name)
+             try:
+                 # Try to open the image and convert it to RGB format
+                 Image.open(img_path).convert("RGB")
+                 # If successful, add the image to the cleaned list
+                 cleaned_files.append(img_name)
+             except Exception as e:
+                 print(f"Skipping {img_name} due to error: {e}")
+
+         # Update the list of image files with only the cleaned files
+         self.image_files = cleaned_files
+
+     def __len__(self):
+         """
+         Returns the number of valid images in the dataset after cleaning.
+
+         Returns:
+         -------
+         int
+             The number of images in the cleaned dataset.
+         """
+         return len(self.image_files)
+
+     def __getitem__(self, idx):
+         """
+         Retrieves the image and its filename at the specified index.
+
+         Parameters:
+         ----------
+         idx : int
+             The index of the image to retrieve.
+
+         Returns:
+         -------
+         tuple
+             A tuple containing the image filename and the preprocessed image as a NumPy array or Tensor.
+
+         Raises:
+         ------
+         IndexError
+             If the index is out of bounds for the dataset.
+         """
+         # Get an item from the list of image files
+         img_name = self.image_files[idx]
+         # Load and preprocess the image
+         img_path = os.path.join(self.folder_path, img_name)
+         img = load_and_preprocess_image(img_path, self.shape)
+         # Return the image filename and the preprocessed image
+         return img_name, img
+
+
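A brief, hypothetical usage sketch (the folder path is illustrative): each item is a (filename, preprocessed image) pair, which is exactly the contract `get_embeddings_df` below relies on:

    # Load a folder of images, dropping any that PIL cannot open
    dataset = ImageFolderDataset(folder_path="data/images")
    print(len(dataset), "valid images")

    name, img = dataset[0]
    print(name, img.shape)  # e.g. 'photo_001.jpg' (224, 224, 3)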
+ def get_embeddings_df(
+     batch_size=32,
+     path="data/images",
+     dataset_name="",
+     backbone="resnet50",
+     directory="Embeddings",
+     image_files=None,
+ ):
+     """
+     Generates embeddings for images in a dataset using a specified backbone model and saves them to a CSV file.
+
+     This function processes images from a given folder in batches, extracts features (embeddings) using a specified
+     pre-trained computer vision model, and stores the results in a CSV file. The embeddings can be used for
+     downstream tasks such as image retrieval or clustering.
+
+     Parameters:
+     ----------
+     batch_size : int, optional
+         The number of images to process in each batch. Default is 32.
+     path : str, optional
+         The folder path containing the images. Default is "data/images".
+     dataset_name : str, optional
+         The name of the dataset, used to create subdirectories for saving embeddings. Default is an empty string.
+     backbone : str, optional
+         The name of the backbone model to use for generating embeddings. The default is 'resnet50'.
+         Other possible options include models like 'convnextv2_tiny', 'vit_base', etc.
+     directory : str, optional
+         The root directory where the embeddings CSV file will be saved. Default is 'Embeddings'.
+     image_files : list, optional
+         A pre-defined list of image file names to process. If not provided, the function will automatically detect
+         image files in the `path` directory.
+
+     Returns:
+     -------
+     None
+         The function does not return any value. It saves a CSV file containing image names and their embeddings.
+
+     Side Effects:
+     ------------
+     - Saves a CSV file in the specified directory containing image file names and their corresponding embeddings.
+
+     Notes:
+     ------
+     - The images are loaded and preprocessed using the `ImageFolderDataset` class.
+     - The embeddings are generated using a pre-trained model from the `FoundationalCVModel` class.
+     - The embeddings are saved as a CSV file with the following structure:
+         - `ImageName`: The name of the image file.
+         - Columns corresponding to the embedding vector (one column per feature).
+
+     Example:
+     --------
+     >>> get_embeddings_df(batch_size=16, path="data/images", dataset_name='sample_dataset', backbone="resnet50")
+
+     This would generate a CSV file with image embeddings from the 'resnet50' backbone model for images in the "data/images" directory.
+     """
+
+     # Create an instance of the ImageFolderDataset class
+     dataset = ImageFolderDataset(folder_path=path, image_files=image_files)
+     # Create an instance of the FoundationalCVModel class
+     model = FoundationalCVModel(backbone)
+
+     img_names = []
+     features = []
+     # Calculate the number of batches based on the dataset size and batch size
+     num_batches = len(dataset) // batch_size + (
+         1 if len(dataset) % batch_size != 0 else 0
+     )
+
+     # Process images in batches and extract features
+     for i in range(0, len(dataset), batch_size):
+         # Get the image files and images for the current batch
+         batch_files = dataset.image_files[i : i + batch_size]
+         batch_imgs = np.array(
+             [dataset[j][1] for j in range(i, min(i + batch_size, len(dataset)))]
+         )
+
+         # Generate embeddings for the batch of images
+         batch_features = model.predict(batch_imgs)
+
+         # Append the image names and features to the lists
+         img_names.extend(batch_files)
+         features.extend(batch_features)
+
+         if (i // batch_size + 1) % 10 == 0:
+             print(f"Batch {i // batch_size + 1}/{num_batches} done")
+
+     # Create a DataFrame with the image names and embeddings
+     df = pd.DataFrame({"ImageName": img_names, "Embeddings": features})
+
+     # Split the embeddings into separate columns (one column per feature)
+     df_aux = pd.DataFrame(df["Embeddings"].tolist())
+     df = pd.concat([df["ImageName"], df_aux], axis=1)
+
+     # Save the DataFrame to a CSV file, creating the output directories if needed
+     os.makedirs(f"{directory}/{dataset_name}", exist_ok=True)
+
+     df.to_csv(f"{directory}/{dataset_name}/Embeddings_{backbone}.csv", index=False)
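Because the CSV stores `ImageName` plus one column per embedding dimension, reading it back into a feature matrix is straightforward; a minimal sketch under the default `directory` and the docstring's sample arguments:

    import pandas as pd

    # Generate the embeddings file, then load it back as an (n_images, embedding_dim) matrix
    get_embeddings_df(
        batch_size=16, path="data/images", dataset_name="sample_dataset", backbone="resnet50"
    )
    emb_df = pd.read_csv("Embeddings/sample_dataset/Embeddings_resnet50.csv")
    X = emb_df.drop(columns=["ImageName"]).to_numpy()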