Commit 6b30d5d
Parent: 4b61117

Add explanation

- app.py +5 -1
- resources.py +2 -1
- views.py +3 -2
app.py CHANGED

@@ -5,7 +5,8 @@ import views
 from resources import load_corrector, load_data, load_model_and_tokenizer, reduce_embeddings
 
 use_cpu = not torch.cuda.is_available()
-device = "cpu" if use_cpu else "cuda"
+# device = "cpu" if use_cpu else "cuda"
+device = "cpu"
 
 df = load_data()
 
@@ -29,6 +30,9 @@ def sidebar():
         "We explore both sequence embedding inversion using the method described in [Morris et al., 2023](https://arxiv.org/abs/2310.06816), as well as"
         " dimensionality rediction transforms and inverse transforms, and its effect on embedding inversion."
     )
+    st.sidebar.markdown(
+        "### The Dataset\nThe dataset in use is the Reddit SYAC dataset train split ([Heiervang, 2022](https://www.duo.uio.no/handle/10852/96578)), which contains the title of different clickbait articles."
+    )
 
 sidebar()
 
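Note on the hunk above: with device hard-coded to "cpu", the use_cpu check becomes dead code. A minimal annotated sketch of the resulting runtime behavior (the comments are editorial, not part of the commit):

import torch

use_cpu = not torch.cuda.is_available()  # still computed, but no longer read anywhere
# device = "cpu" if use_cpu else "cuda"  # old hardware-aware selection
device = "cpu"                           # everything now runs on CPU, even when a GPU is present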
resources.py CHANGED

@@ -42,13 +42,14 @@ def load_model_and_tokenizer(device="cpu"):
 def get_gtr_embeddings(text_list: list[str],
                        encoder: PreTrainedModel,
                        tokenizer: PreTrainedTokenizer,
+                       device: str,
                        ) -> torch.Tensor:
 
     inputs = tokenizer(text_list,
                        return_tensors="pt",
                        max_length=128,
                        truncation=True,
-                       padding="max_length",).to(
+                       padding="max_length",).to(device)
 
     with torch.no_grad():
         model_output = encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
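With the new device parameter, the caller now decides where the tokenized batch is placed before encoding. A minimal usage sketch, assuming load_model_and_tokenizer returns an (encoder, tokenizer) pair as the app.py import suggests (the example title is made up):

from resources import get_gtr_embeddings, load_model_and_tokenizer

device = "cpu"  # matches the pinning in app.py above
encoder, tokenizer = load_model_and_tokenizer(device=device)

# Tokenize on `device`, encode, and get one fixed-size vector per input text;
# for a GTR-style encoder this is a (batch, hidden_size) float tensor.
embeddings = get_gtr_embeddings(["You Won't Believe What Happened Next"],
                                encoder, tokenizer, device)
print(embeddings.shape)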
views.py CHANGED

@@ -9,9 +9,10 @@ import utils
 import pandas as pd
 from scipy.spatial import distance
 from resources import get_gtr_embeddings
+from transformers import PreTrainedModel, PreTrainedTokenizer
 dimensionality_reduction_model_name = "PCA"
 
-def diffs(embeddings: np.ndarray, corrector, encoder, tokenizer):
+def diffs(embeddings: np.ndarray, corrector, encoder: PreTrainedModel, tokenizer: PreTrainedTokenizer):
     st.title('"A man is to king, what woman is to queen"')
     st.markdown("A well known pehnomenon in semantic vectors is the way we can do vector operations like addition and subtraction to find spacial relations in the vector space.")
     st.markdown(
@@ -34,7 +35,7 @@ def diffs(embeddings: np.ndarray, corrector, encoder, tokenizer):
     st.latex("=")
 
     if submit_button:
-        v1, v2, v3 = get_gtr_embeddings([sent1, sent2, sent3], encoder, tokenizer).to("cpu")
+        v1, v2, v3 = get_gtr_embeddings([sent1, sent2, sent3], encoder, tokenizer, device=encoder.device).to("cpu")
         v4 = v1 - v2 + v3
         generated_sentence, = vec2text.invert_embeddings(
             embeddings=v4.unsqueeze(0).cuda(),
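One caveat with the final hunk: the call now threads encoder.device into get_gtr_embeddings, but the inversion just below still moves the analogy vector with a hard-coded .cuda(), which will raise on the CPU-only setup app.py now enforces. A device-agnostic sketch of the same flow (corrector is assumed to come from resources.load_corrector; num_steps is a regular vec2text.invert_embeddings argument):

import vec2text

# Embed the three prompt sentences, then do the analogy arithmetic on CPU.
v1, v2, v3 = get_gtr_embeddings([sent1, sent2, sent3], encoder, tokenizer,
                                device=encoder.device).to("cpu")
v4 = v1 - v2 + v3  # "man is to king, what woman is to ..."

# Invert the composed embedding back to text on whatever device the encoder uses.
generated_sentence, = vec2text.invert_embeddings(
    embeddings=v4.unsqueeze(0).to(encoder.device),  # avoids the hard-coded .cuda()
    corrector=corrector,
    num_steps=20,
)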