Commit bdef5c4 · Parent: bbb7e28

Make working embedding view

Files changed:
- app.py (+1 −1)
- resources.py (+21 −2)
- views.py (+34 −5)
app.py CHANGED

```diff
@@ -44,4 +44,4 @@ with tab1:
     )
 
 with tab2:
-    views.diffs(embeddings, corrector)
+    views.diffs(embeddings, corrector, encoder, tokenizer)
```
resources.py CHANGED

```diff
@@ -1,10 +1,11 @@
 import streamlit as st
 import pandas as pd
+import torch
 import vec2text
 from transformers import AutoModel, AutoTokenizer
 from sklearn.decomposition import PCA
 from utils import file_cache
-
+from transformers import PreTrainedModel, PreTrainedTokenizer
 
 # Caching the vec2text corrector
 @st.cache_resource
@@ -35,4 +36,22 @@ def reduce_embeddings(embeddings):
 def load_model_and_tokenizer(device="cpu"):
     encoder = AutoModel.from_pretrained("sentence-transformers/gtr-t5-base").encoder.to(device)
     tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/gtr-t5-base")
-    return encoder, tokenizer
+    return encoder, tokenizer
+
+
+def get_gtr_embeddings(text_list,
+                       encoder: PreTrainedModel,
+                       tokenizer: PreTrainedTokenizer) -> torch.Tensor:
+
+    inputs = tokenizer(text_list,
+                       return_tensors="pt",
+                       max_length=128,
+                       truncation=True,
+                       padding="max_length",).to("cuda")
+
+    with torch.no_grad():
+        model_output = encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
+        hidden_state = model_output.last_hidden_state
+        embeddings = vec2text.models.model_utils.mean_pool(hidden_state, inputs['attention_mask'])
+
+    return embeddings
```
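For orientation, a minimal usage sketch of the new helper; the sentences and the `device` choice are illustrative, not part of the commit. One caveat worth noting: `get_gtr_embeddings` moves its inputs to `"cuda"` unconditionally, so the encoder must be loaded onto the GPU even though `load_model_and_tokenizer` defaults to `device="cpu"`.

```python
# Illustrative only: exercises the helpers added above, not code from the commit.
from resources import load_model_and_tokenizer, get_gtr_embeddings

# get_gtr_embeddings hard-codes .to("cuda"), so load the encoder on the GPU
# even though load_model_and_tokenizer defaults to device="cpu".
encoder, tokenizer = load_model_and_tokenizer(device="cuda")

embeddings = get_gtr_embeddings(
    ["A man is to king", "A woman is to queen"],  # any list of strings
    encoder,
    tokenizer,
)
print(embeddings.shape)  # torch.Size([2, 768]) for gtr-t5-base
```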
views.py CHANGED

```diff
@@ -5,16 +5,45 @@ from umap import UMAP
 import plotly.express as px
 import numpy as np
 from streamlit_plotly_events import plotly_events
-from resources import reduce_embeddings
 import utils
 import pandas as pd
 from scipy.spatial import distance
 
 dimensionality_reduction_model_name = "PCA"
 
-def diffs(embeddings: np.ndarray, corrector):
-    st.
-    st.
+def diffs(embeddings: np.ndarray, corrector, encoder, tokenizer):
+    st.title('"A man is to king, what woman is to queen"')
+    st.markdown("A well-known phenomenon in semantic vectors is the way we can use vector operations like addition and subtraction to find spatial relations in the vector space.")
+    st.markdown(
+        'In word embedding models, we have found that the relationship between words can be captured mathematically, '
+        'such that "king" is to "man" as "queen" is to "woman," demonstrating that vector arithmetic can encode analogies and semantic relationships in high-dimensional space ([Mikolov et al., 2013](https://arxiv.org/abs/1301.3781)).'
+    )
+    st.markdown("This application lets you freely explore to what extent that property applies to embedding inversion models, given the other sources of inaccuracy.")
+
+    generated_sentence = ""
+
+
+    with st.form(key="foo") as form:
+        submit_button = st.form_submit_button("Synthesize")
+
+        sent1 = st.text_input("Sentence 1")
+        st.latex("-")
+        sent2 = st.text_input("Sentence 2")
+        st.latex("+")
+        sent3 = st.text_input("Sentence 3")
+        st.latex("=")
+
+        if submit_button:
+            generated_sentence = "HI"
+
+        sent4 = st.text_input("Sentence 4", value=generated_sentence, disabled=True)
+
+
+
+    if submit_button:
+        generated_sentence = "HI!"
+
+    # st.html('<a href="https://www.flaticon.com/free-icons/array" title="array icons">Array icons created by Voysla - Flaticon</a>')
 
 def plot(df: pd.DataFrame, embeddings: np.ndarray, vectors_2d, reducer, corrector):
 
@@ -88,7 +117,7 @@ def plot(df: pd.DataFrame, embeddings: np.ndarray, vectors_2d, reducer, corrector):
 
     if inferred_embedding is not None and (closest_sentence_index != -1):
         couple = selected_sentence_embedding.squeeze(), inferred_embedding.squeeze()
-        st.markdown(
+        st.markdown("### Inferred embedding distance:")
         st.number_input("Euclidean", value=distance.euclidean(
             *couple
         ), disabled=True)
```
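The "Synthesize" button is still a stub (`generated_sentence = "HI"`), but the parameters this commit threads through (`encoder`, `tokenizer`, `corrector`) point at the intended flow: embed the three sentences, form `e1 − e2 + e3` in the spirit of Mikolov et al.'s `v("king") − v("man") + v("woman") ≈ v("queen")`, and invert the result back to text. A hedged sketch of that step, assuming the `get_gtr_embeddings` helper above and vec2text's `invert_embeddings` API; the `synthesize` function itself is hypothetical, not code from this repo:

```python
# Hypothetical completion of the "Synthesize" stub; not part of this commit.
import vec2text
from resources import get_gtr_embeddings

def synthesize(sent1, sent2, sent3, encoder, tokenizer, corrector) -> str:
    # Embed all three sentences in one batch (helper from resources.py above).
    e1, e2, e3 = get_gtr_embeddings([sent1, sent2, sent3], encoder, tokenizer)
    # "king" - "man" + "woman" style arithmetic on the pooled embeddings.
    target = (e1 - e2 + e3).unsqueeze(0)
    # Invert the synthetic embedding back into a sentence.
    return vec2text.invert_embeddings(
        embeddings=target, corrector=corrector, num_steps=20
    )[0]
```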