|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import pytest |
|
|
from transformers import AutoModel, AutoTokenizer |
|
|
|
|
|
from src.nlp_models import HuggingFaceEmbeddings |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def mock_text_data(tmp_path):
    """
    Write a small CSV of product descriptions and return its location.

    The CSV has a single ``description`` column with two rows and is written
    without the DataFrame index, inside the ``tmp_path`` directory handed to
    this fixture.

    Parameters:
    ----------
    tmp_path : pathlib.Path
        Temporary directory supplied by pytest for the requesting test.

    Returns:
    -------
    str
        The path of the written CSV file, converted to a plain string.
    """
    csv_path = tmp_path / "test_text_data.csv"
    pd.DataFrame(
        {"description": ["Product 1 description", "Product 2 description"]}
    ).to_csv(csv_path, index=False)
    return str(csv_path)
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "model_name, expected_hidden_size",
    [
        ("sentence-transformers/all-MiniLM-L6-v2", 384),
    ],
)
def test_huggingface_embeddings_generic(
    model_name, expected_hidden_size, mock_text_data
):
    """
    Generic test for loading a Hugging Face model and generating a text embedding.

    This test ensures that:
    - ``HuggingFaceEmbeddings`` exposes a tokenizer and a model of the same
      types that ``AutoTokenizer`` / ``AutoModel`` produce for ``model_name``.
    - ``get_embedding`` returns a NumPy array for a single sample sentence.
    - That array is one-dimensional with ``expected_hidden_size`` entries.

    Note: the mock CSV path is only passed to the constructor; this test does
    not save embeddings to disk or inspect any CSV output.

    Parameters:
    ----------
    model_name : str
        The name of the Hugging Face model to test.
    expected_hidden_size : int
        The expected hidden size (dimensionality) of the embeddings generated by the model.
    mock_text_data : str
        Path to the mock CSV file containing text descriptions.
    """
    model = HuggingFaceEmbeddings(
        model_name=model_name, path=mock_text_data, device="cpu"
    )

    # Resolve each reference type once so that a failing assertion does not
    # reload the checkpoint a second time just to build its error message.
    expected_tokenizer_type = type(AutoTokenizer.from_pretrained(model_name))
    assert isinstance(model.tokenizer, expected_tokenizer_type), (
        f"Tokenizer should be an instance of {expected_tokenizer_type}"
    )

    expected_model_type = type(AutoModel.from_pretrained(model_name))
    assert isinstance(model.model, expected_model_type), (
        f"Model should be an instance of {expected_model_type}"
    )

    # Embed a single sentence and check both the container type and the size.
    sample_text = "This is a test description."
    embeddings = model.get_embedding(sample_text)

    assert isinstance(embeddings, np.ndarray), "Embeddings should be a NumPy array"
    assert embeddings.shape == (expected_hidden_size,), (
        f"Embeddings shape should be ({expected_hidden_size},), got {embeddings.shape}"
    )
|
|
|
|
|
|
|
|
# Allow running this module directly as a script. pytest.main() returns the
# session's exit code; re-raise it through SystemExit so that a failing run
# produces a non-zero process status instead of always exiting with 0.
if __name__ == "__main__":
    raise SystemExit(pytest.main())
|
|
|