iBrokeTheCode's picture
chore: Add test cases
43fe501
import numpy as np
import pandas as pd
import pytest
from transformers import AutoModel, AutoTokenizer
from src.nlp_models import HuggingFaceEmbeddings
# import torch
# import os
####################################################################################################
################################## Test the Text Embeddings Model ##################################
####################################################################################################
@pytest.fixture
def mock_text_data(tmp_path):
    """
    Write a small CSV of product descriptions and hand back its location.

    The file is created inside the ``tmp_path`` directory supplied by pytest
    and contains a single ``description`` column with two rows.

    Returns:
    -------
    str
        Path to the generated CSV file, converted to a plain string.
    """
    csv_path = tmp_path / "test_text_data.csv"
    descriptions = ["Product 1 description", "Product 2 description"]
    # index=False keeps the file to the single "description" column.
    pd.DataFrame({"description": descriptions}).to_csv(csv_path, index=False)
    return str(csv_path)
@pytest.mark.parametrize(
    "model_name, expected_hidden_size",
    [
        ("sentence-transformers/all-MiniLM-L6-v2", 384),  # MiniLM with 384 hidden units
        # ('bert-base-uncased', 768),  # BERT base with 768 hidden units
    ],
)
def test_huggingface_embeddings_generic(
    model_name, expected_hidden_size, mock_text_data
):
    """
    Generic test for loading a Hugging Face model and generating a text embedding.

    This test ensures that:
    - ``HuggingFaceEmbeddings`` exposes a tokenizer and a model of the same
      concrete types that ``AutoTokenizer`` / ``AutoModel`` produce for
      ``model_name``.
    - ``get_embedding`` returns a 1-D NumPy array whose length equals the
      expected hidden size.

    Parameters:
    ----------
    model_name : str
        The name of the Hugging Face model to test.
    expected_hidden_size : int
        The expected hidden size (dimensionality) of the embeddings generated by the model.
    mock_text_data : str
        Path to the mock CSV file containing text descriptions; it is passed to
        ``HuggingFaceEmbeddings`` as ``path``.
    """
    # Initialize the HuggingFaceEmbeddings model with the provided model name
    model = HuggingFaceEmbeddings(
        model_name=model_name, path=mock_text_data, device="cpu"
    )

    # Resolve each reference type exactly once so a failing assertion does not
    # trigger a second tokenizer/model load just to format its message.
    expected_tokenizer_type = type(AutoTokenizer.from_pretrained(model_name))
    assert isinstance(model.tokenizer, expected_tokenizer_type), (
        f"Tokenizer should be an instance of {expected_tokenizer_type}"
    )

    expected_model_type = type(AutoModel.from_pretrained(model_name))
    assert isinstance(model.model, expected_model_type), (
        f"Model should be an instance of {expected_model_type}"
    )

    # Generate embeddings for a sample text
    sample_text = "This is a test description."
    embeddings = model.get_embedding(sample_text)

    # Check that the embeddings are a NumPy array with the expected shape
    assert isinstance(embeddings, np.ndarray), "Embeddings should be a NumPy array"
    assert embeddings.shape == (expected_hidden_size,), (
        f"Embeddings shape should be ({expected_hidden_size},), got {embeddings.shape}"
    )
if __name__ == "__main__":
    # pytest.main() returns the session's exit code. Raising SystemExit with it
    # makes a direct run of this file report test failures through the process
    # exit status instead of discarding the result.
    raise SystemExit(pytest.main())