import numpy as np
import pandas as pd
import pytest
from transformers import AutoModel, AutoTokenizer

from src.nlp_models import HuggingFaceEmbeddings

# import torch
# import os

####################################################################################################
################################## Test the Text Embeddings Model ##################################
####################################################################################################


@pytest.fixture
def mock_text_data(tmp_path):
    """
    Fixture to create a mock CSV file with text data for testing.
    """
    data = {"description": ["Product 1 description", "Product 2 description"]}
    df = pd.DataFrame(data)
    file_path = tmp_path / "test_text_data.csv"
    df.to_csv(file_path, index=False)
    return str(file_path)


@pytest.mark.parametrize(
    "model_name, expected_hidden_size",
    [
        ("sentence-transformers/all-MiniLM-L6-v2", 384),  # MiniLM with 384 hidden units
        # ('bert-base-uncased', 768),  # BERT base with 768 hidden units
    ],
)
def test_huggingface_embeddings_generic(
    model_name, expected_hidden_size, mock_text_data
):
    """
    Generic test for loading a Hugging Face model, generating text embeddings, and saving them to a CSV file.

    This test ensures that:
    - The model and tokenizer are properly loaded from Hugging Face.
    - Embeddings are correctly generated for text descriptions.
    - Embeddings are saved in the correct format to a CSV file.

    Parameters:
    ----------
    model_name : str
        The name of the Hugging Face model to test.
    expected_hidden_size : int
        The expected hidden size (dimensionality) of the embeddings generated by the model.
    mock_text_data : str
        Path to the mock CSV file containing text descriptions.
    """
    # Initialize the HuggingFaceEmbeddings model with the provided model name
    model = HuggingFaceEmbeddings(
        model_name=model_name, path=mock_text_data, device="cpu"
    )

    # Check that the tokenizer and model were loaded correctly by comparing
    # them against reference instances loaded directly from Hugging Face
    reference_tokenizer = AutoTokenizer.from_pretrained(model_name)
    reference_model = AutoModel.from_pretrained(model_name)
    assert isinstance(model.tokenizer, type(reference_tokenizer)), (
        f"Tokenizer should be an instance of {type(reference_tokenizer)}"
    )
    assert isinstance(model.model, type(reference_model)), (
        f"Model should be an instance of {type(reference_model)}"
    )

    # Generate embeddings for a sample text
    sample_text = "This is a test description."
    embeddings = model.get_embedding(sample_text)

    # Check that the embeddings are a NumPy array with the expected shape
    assert isinstance(embeddings, np.ndarray), "Embeddings should be a NumPy array"
    assert embeddings.shape == (expected_hidden_size,), (
        f"Embeddings shape should be ({expected_hidden_size},), got {embeddings.shape}"
    )


if __name__ == "__main__":
    pytest.main()