import numpy as np
import pandas as pd
import pytest
from transformers import AutoModel, AutoTokenizer

from src.nlp_models import HuggingFaceEmbeddings

# import torch
# import os

####################################################################################################
################################## Test the Text Embeddings Model ##################################
####################################################################################################


@pytest.fixture
def mock_text_data(tmp_path):
    """
    Fixture to create a mock CSV file with text data for testing.
    """
    data = {"description": ["Product 1 description", "Product 2 description"]}
    df = pd.DataFrame(data)
    file_path = tmp_path / "test_text_data.csv"
    df.to_csv(file_path, index=False)
    return str(file_path)


@pytest.mark.parametrize(
    "model_name, expected_hidden_size",
    [
        ("sentence-transformers/all-MiniLM-L6-v2", 384),  # MiniLM with 384 hidden units
        # ("bert-base-uncased", 768),  # BERT base with 768 hidden units
    ],
)
def test_huggingface_embeddings_generic(
    model_name, expected_hidden_size, mock_text_data
):
    """
    Generic test for loading a Hugging Face model and generating text embeddings.

    This test ensures that:
    - The model and tokenizer are properly loaded from Hugging Face.
    - Embeddings are correctly generated for a sample text description.
    - The embeddings have the expected dimensionality.

    Parameters
    ----------
    model_name : str
        The name of the Hugging Face model to test.
    expected_hidden_size : int
        The expected hidden size (dimensionality) of the embeddings generated by the model.
    mock_text_data : str
        Path to the mock CSV file containing text descriptions.
    """
    # Initialize the HuggingFaceEmbeddings model with the provided model name
    model = HuggingFaceEmbeddings(
        model_name=model_name, path=mock_text_data, device="cpu"
    )

    # Check that the tokenizer and model were loaded correctly
    assert isinstance(
        model.tokenizer, type(AutoTokenizer.from_pretrained(model_name))
    ), f"Tokenizer should be an instance of {type(AutoTokenizer.from_pretrained(model_name))}"
    assert isinstance(
        model.model, type(AutoModel.from_pretrained(model_name))
    ), f"Model should be an instance of {type(AutoModel.from_pretrained(model_name))}"

    # Generate embeddings for a sample text
    sample_text = "This is a test description."
    embeddings = model.get_embedding(sample_text)

    # Check that the embeddings are a NumPy array with the expected shape
    assert isinstance(embeddings, np.ndarray), "Embeddings should be a NumPy array"
    assert embeddings.shape == (expected_hidden_size,), (
        f"Embeddings shape should be ({expected_hidden_size},), got {embeddings.shape}"
    )


if __name__ == "__main__":
    pytest.main()
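

# Sketch of an additional check that exercises every row of the mock CSV fixture.
# It relies only on the HuggingFaceEmbeddings constructor and get_embedding(text)
# API used in the test above; the test name and the finite-value assertion are
# illustrative additions, not part of the original suite.
def test_huggingface_embeddings_on_mock_csv(mock_text_data):
    """
    Sketch test: embeddings generated for each description in the mock CSV
    should be NumPy arrays of the same dimensionality with finite values.
    """
    model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        path=mock_text_data,
        device="cpu",
    )

    # Read the descriptions back from the fixture CSV and embed each one
    df = pd.read_csv(mock_text_data)
    embeddings = [model.get_embedding(text) for text in df["description"]]

    for emb in embeddings:
        assert isinstance(emb, np.ndarray), "Each embedding should be a NumPy array"
        assert emb.shape == (384,), f"Expected shape (384,), got {emb.shape}"
        assert np.isfinite(emb).all(), "Embeddings should not contain NaN or inf"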