File size: 5,627 Bytes
6c982a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import torch
import numpy as np
from PIL import Image
from transformers import AutoModel
from typing import Union, List
import io


class JinaClipEmbeddingService:
    """
    Jina CLIP v2 embedding service with Vietnamese support.

    Loads the model through ``AutoModel`` with ``trust_remote_code=True`` and
    exposes text, image and combined (text + image) encoders. Every encoder
    returns a 2-D numpy array with one row per input.
    """

    def __init__(self, model_path: str = "jinaai/jina-clip-v2"):
        """
        Initialize the Jina CLIP v2 model.

        Args:
            model_path: Local path to the model or a HuggingFace model name.
        """
        print(f"Loading Jina CLIP v2 model from {model_path}...")

        # NOTE(security): trust_remote_code=True executes Python code shipped
        # with the model repository; only pass a model_path you trust.
        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

        # Inference only: switch to eval mode.
        self.model.eval()

        # Use the GPU when one is available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

        print(f"✓ Loaded Jina CLIP v2 model on: {self.device}")

    @staticmethod
    def _as_2d_numpy(embeddings) -> np.ndarray:
        """Convert model output to a numpy array with at least two dimensions."""
        if isinstance(embeddings, torch.Tensor):
            embeddings = embeddings.cpu().detach().numpy()
        # A single embedding may come back as a 1-D vector; promote it to
        # (1, dim) so that axis=1 operations downstream are always valid.
        return np.atleast_2d(np.asarray(embeddings))

    @staticmethod
    def _l2_normalize(embeddings: np.ndarray) -> np.ndarray:
        """L2-normalize each row; all-zero rows are returned unchanged."""
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        # Dividing by a zero norm would yield NaN rows, so divide by 1 instead.
        safe_norms = np.where(norms == 0, 1.0, norms)
        return embeddings / safe_norms

    @staticmethod
    def _load_if_bytes(item):
        """Decode raw image bytes into an RGB PIL image; pass anything else through."""
        if isinstance(item, (bytes, bytearray)):
            return Image.open(io.BytesIO(item)).convert('RGB')
        return item

    def encode_text(
        self,
        text: Union[str, List[str]],
        truncate_dim: Union[int, None] = None,
        normalize: bool = True
    ) -> np.ndarray:
        """
        Encode text into embedding vectors (Vietnamese is supported).

        Args:
            text: A single text or a list of texts.
            truncate_dim: Matryoshka dimension (64-1024, None = full 1024).
            normalize: Whether to L2-normalize the embeddings.

        Returns:
            2-D numpy array of embeddings, one row per input text.
        """
        if isinstance(text, str):
            text = [text]

        # The model's encode_text handles tokenization internally.
        embeddings = self.model.encode_text(
            text,
            truncate_dim=truncate_dim  # Optional: 64, 128, 256, 512, 1024
        )

        embeddings = self._as_2d_numpy(embeddings)
        if normalize:
            embeddings = self._l2_normalize(embeddings)
        return embeddings

    def encode_image(
        self,
        image: Union["Image.Image", bytes, List, str],
        truncate_dim: Union[int, None] = None,
        normalize: bool = True
    ) -> np.ndarray:
        """
        Encode image(s) into embedding vectors.

        Args:
            image: PIL Image, raw image bytes, URL string, or a list of these.
            truncate_dim: Matryoshka dimension (64-1024, None = full 1024).
            normalize: Whether to L2-normalize the embeddings.

        Returns:
            2-D numpy array of embeddings, one row per input image.
        """
        # Always hand the model a list so that single inputs (PIL image,
        # bytes or URL string) are treated uniformly. Raw bytes are decoded
        # here; strings and PIL images are passed to the model unchanged.
        items = image if isinstance(image, list) else [image]
        images = [self._load_if_bytes(item) for item in items]

        embeddings = self.model.encode_image(
            images,
            truncate_dim=truncate_dim  # Optional: 64, 128, 256, 512, 1024
        )

        embeddings = self._as_2d_numpy(embeddings)
        if normalize:
            embeddings = self._l2_normalize(embeddings)
        return embeddings

    def encode_multimodal(
        self,
        text: Union[str, List[str]] = None,
        image: Union["Image.Image", bytes, List] = None,
        truncate_dim: Union[int, None] = None,
        normalize: bool = True
    ) -> np.ndarray:
        """
        Encode text and/or image and return the combined embeddings.

        When both are given, the result is the element-wise average of the
        text and image embeddings; when only one is given, its embeddings
        are returned as-is.

        Args:
            text: A single text or a list of texts.
            image: PIL Image, raw image bytes, or a list of images.
            truncate_dim: Matryoshka dimension (64-1024, None = full 1024).
            normalize: Whether to L2-normalize the combined embeddings.

        Returns:
            2-D numpy array of embeddings.

        Raises:
            ValueError: If neither text nor image is provided, or if the
                text and image embeddings do not have the same shape.
        """
        if text is None and image is None:
            raise ValueError("Phải cung cấp ít nhất text hoặc image")

        parts = []
        if text is not None:
            parts.append(
                self.encode_text(text, truncate_dim=truncate_dim, normalize=False)
            )
        if image is not None:
            parts.append(
                self.encode_image(image, truncate_dim=truncate_dim, normalize=False)
            )

        if len(parts) == 2:
            text_emb, image_emb = parts
            # Averaging pairs row i of the text batch with row i of the image
            # batch, so both batches must line up exactly.
            if text_emb.shape != image_emb.shape:
                raise ValueError(
                    "text and image embeddings must have the same shape to be "
                    f"averaged, got {text_emb.shape} and {image_emb.shape}"
                )
            combined = np.mean(np.stack(parts), axis=0)
        else:
            combined = parts[0]

        if normalize:
            combined = self._l2_normalize(combined)
        return combined

    def get_embedding_dimension(self) -> int:
        """
        Return the embedding dimension (1024 for Jina CLIP v2).
        """
        return 1024