text-embeddings

Sleeping

rrg92 committed on Dec 2, 2024

Commit

84c4fac

verified ·

1 Parent(s): 645dc98

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,12 +4,14 @@ from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
 import gradio as gr
 import spaces
-processor = AutoImageProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5")
-vision_model = AutoModel.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1.5')
-text_model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True)
-text_model.eval()
 def mean_pooling(model_output, attention_mask):
     token_embeddings = model_output[0]
@@ -18,18 +20,28 @@ def mean_pooling(model_output, attention_mask):
 @spaces.GPU
 def TxtEmbed(text):
-    sentences = [text]
-    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
     with torch.no_grad():
-        model_output = text_model(**encoded_input)
-    text_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
-    text_embeddings = F.layer_norm(text_embeddings, normalized_shape=(text_embeddings.shape[1],))
-    text_embeddings = F.normalize(text_embeddings, p=2, dim=1)
-    return (text_embeddings.tolist())[0];

 import gradio as gr
 import spaces
+# Encoder that TxtEmbed runs; it is created once, when the module is imported.
+model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
+# Earlier nomic-ai vision/text setup, left disabled:
+# processor = AutoImageProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5")
+# vision_model = AutoModel.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)
+# tokenizer = AutoTokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1.5')
+# text_model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True)
+# text_model.eval()
 def mean_pooling(model_output, attention_mask):
     token_embeddings = model_output[0]
 @spaces.GPU
 def TxtEmbed(text):
+    import torch
+    input_ids = tokenizer.encode(text, return_tensors='pt')
     with torch.no_grad():
+        outs = model(input_ids)
+        encoded = outs[0][0, 1:-1]  # Ignore [CLS] and [SEP] special tokens
+   # sentences = [text]
+   # encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+   #
+   # with torch.no_grad():
+   #     model_output = text_model(**encoded_input)
+   #
+   # text_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+   # text_embeddings = F.layer_norm(text_embeddings, normalized_shape=(text_embeddings.shape[1],))
+   # text_embeddings = F.normalize(text_embeddings, p=2, dim=1)
+    return (encoded.tolist())[0];