import torch
import numpy as np
from PIL import Image
from transformers import AutoModel
from typing import Union, List, Optional
import io


class JinaClipEmbeddingService:
    """
    Jina CLIP v2 embedding service with Vietnamese support.
    Uses AutoModel with trust_remote_code.
    """

    def __init__(self, model_path: str = "jinaai/jina-clip-v2"):
        """
        Initialize the Jina CLIP v2 model.
        Args:
            model_path: Local path or HuggingFace model name
        """
        print(f"Loading Jina CLIP v2 model from {model_path}...")
        # Load the model with trust_remote_code (Jina CLIP v2 ships custom modeling code)
        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
        # Switch to eval mode (disables dropout, etc.)
        self.model.eval()
        # Use the GPU if one is available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        print(f"✓ Loaded Jina CLIP v2 model on: {self.device}")

    def encode_text(
        self,
        text: Union[str, List[str]],
        truncate_dim: Optional[int] = None,
        normalize: bool = True
    ) -> np.ndarray:
        """
        Encode text into embedding vectors (supports Vietnamese).
        Args:
            text: A text or list of texts (Vietnamese supported)
            truncate_dim: Matryoshka dimension (64-1024, None = full 1024)
            normalize: Whether to L2-normalize the embeddings
        Returns:
            numpy array of embeddings
        """
        if isinstance(text, str):
            text = [text]
        # Jina CLIP v2's encode_text method handles tokenization internally
        embeddings = self.model.encode_text(
            text,
            truncate_dim=truncate_dim  # Optional: 64, 128, 256, 512, 1024
        )
        # Convert to numpy
        if isinstance(embeddings, torch.Tensor):
            embeddings = embeddings.cpu().detach().numpy()
        # L2-normalize if requested
        if normalize:
            embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
        return embeddings

    def encode_image(
        self,
        image: Union[Image.Image, bytes, List, str],
        truncate_dim: Optional[int] = None,
        normalize: bool = True
    ) -> np.ndarray:
        """
        Encode images into embedding vectors.
        Args:
            image: PIL Image, bytes, URL string, or list of images
            truncate_dim: Matryoshka dimension (64-1024, None = full 1024)
            normalize: Whether to L2-normalize the embeddings
        Returns:
            numpy array of embeddings
        """
        # Convert bytes to a PIL Image if needed
        if isinstance(image, bytes):
            image = Image.open(io.BytesIO(image)).convert('RGB')
        if isinstance(image, list):
            processed_images = []
            for img in image:
                if isinstance(img, bytes):
                    processed_images.append(Image.open(io.BytesIO(img)).convert('RGB'))
                elif isinstance(img, str):
                    # URL string - pass through, Jina CLIP can fetch URLs itself
                    processed_images.append(img)
                else:
                    processed_images.append(img)
            image = processed_images
        elif not isinstance(image, str):
            # Single PIL Image (including one decoded from bytes above) ->
            # wrap in a list so the output is always a 2D batch
            image = [image]
        # Jina CLIP v2's encode_image accepts PIL Images, file paths, or URLs
        embeddings = self.model.encode_image(
            image,
            truncate_dim=truncate_dim  # Optional: 64, 128, 256, 512, 1024
        )
        # Convert to numpy
        if isinstance(embeddings, torch.Tensor):
            embeddings = embeddings.cpu().detach().numpy()
        # L2-normalize if requested
        if normalize:
            embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
        return embeddings

    def encode_multimodal(
        self,
        text: Optional[Union[str, List[str]]] = None,
        image: Optional[Union[Image.Image, bytes, List]] = None,
        truncate_dim: Optional[int] = None,
        normalize: bool = True
    ) -> np.ndarray:
        """
        Encode both text and image and return a combined embedding.
        Args:
            text: A text or list of texts (Vietnamese supported)
            image: PIL Image, bytes, or list of images
            truncate_dim: Matryoshka dimension (64-1024, None = full 1024)
            normalize: Whether to L2-normalize the embeddings
        Returns:
            numpy array of embeddings
        """
        embeddings = []
        if text is not None:
            text_emb = self.encode_text(text, truncate_dim=truncate_dim, normalize=False)
            embeddings.append(text_emb)
        if image is not None:
            image_emb = self.encode_image(image, truncate_dim=truncate_dim, normalize=False)
            embeddings.append(image_emb)
        if len(embeddings) == 2:
            # Average the text and image embeddings; both batches must have
            # the same shape for the element-wise mean to be valid
            combined = np.mean(embeddings, axis=0)
        elif len(embeddings) == 1:
            combined = embeddings[0]
        else:
            raise ValueError("At least one of text or image must be provided")
        # L2-normalize if requested
        if normalize:
            combined = combined / np.linalg.norm(combined, axis=1, keepdims=True)
        return combined

    def get_embedding_dimension(self) -> int:
        """
        Return the full (untruncated) embedding dimension: 1024 for Jina CLIP v2.
        """
        return 1024
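

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original service. Assumes network
    # access to download the model; "example.jpg" is a hypothetical local file,
    # replace it with a real image path. Vietnamese strings are kept as sample
    # inputs since Vietnamese support is the point of this service.
    service = JinaClipEmbeddingService()
    print("dim:", service.get_embedding_dimension())

    # Vietnamese text encoding; output shape is (batch, 1024) by default
    text_emb = service.encode_text(["một bãi biển đẹp lúc hoàng hôn"])
    print("text:", text_emb.shape)

    # Matryoshka truncation to a smaller dimension
    short_emb = service.encode_text("một bãi biển đẹp", truncate_dim=256)
    print("truncated:", short_emb.shape)  # (1, 256)

    # Image and combined text+image embeddings
    img = Image.open("example.jpg").convert("RGB")
    image_emb = service.encode_image(img)
    combined = service.encode_multimodal(text="bãi biển", image=img)
    # Embeddings are L2-normalized, so the dot product is cosine similarity
    print("cos(text, image):", (text_emb @ image_emb.T).item())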