Commit
·
644a030
1
Parent(s):
93f5069
It works now
Browse files
app.py
CHANGED
|
@@ -2,20 +2,23 @@ import streamlit as st
|
|
| 2 |
import pandas as pd
|
| 3 |
import vec2text
|
| 4 |
import torch
|
| 5 |
-
from transformers import AutoModel, AutoTokenizer
|
| 6 |
from umap import UMAP
|
| 7 |
from tqdm import tqdm
|
| 8 |
import plotly.express as px
|
| 9 |
import numpy as np
|
| 10 |
from sklearn.decomposition import PCA
|
|
|
|
|
|
|
|
|
|
| 11 |
# Activate tqdm with pandas
|
| 12 |
tqdm.pandas()
|
| 13 |
|
| 14 |
@st.cache_resource
|
| 15 |
def vector_compressor_from_config():
|
| 16 |
'TODO'
|
| 17 |
-
# return PCA()
|
| 18 |
-
return UMAP()
|
| 19 |
|
| 20 |
# Caching the dataframe since loading from external source can be time-consuming
|
| 21 |
@st.cache_data
|
|
@@ -48,7 +51,6 @@ def load_embeddings():
|
|
| 48 |
embeddings = load_embeddings()
|
| 49 |
|
| 50 |
# Caching UMAP reduction as it's a heavy computation
|
| 51 |
-
|
| 52 |
@st.cache_resource
|
| 53 |
def reduce_embeddings(embeddings):
|
| 54 |
reducer = vector_compressor_from_config()
|
|
@@ -60,29 +62,51 @@ vectors_2d, reducer = reduce_embeddings(embeddings)
|
|
| 60 |
fig = px.scatter(
|
| 61 |
x=vectors_2d[:, 0],
|
| 62 |
y=vectors_2d[:, 1],
|
| 63 |
-
opacity=0.
|
| 64 |
hover_data={"Title": df["title"]},
|
| 65 |
labels={'x': 'UMAP Dimension 1', 'y': 'UMAP Dimension 2'},
|
| 66 |
-
title="UMAP Scatter Plot of Reddit Titles"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
)
|
| 68 |
|
| 69 |
-
# Display
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import vec2text
|
| 4 |
import torch
|
| 5 |
+
from transformers import AutoModel, AutoTokenizer
|
| 6 |
from umap import UMAP
|
| 7 |
from tqdm import tqdm
|
| 8 |
import plotly.express as px
|
| 9 |
import numpy as np
|
| 10 |
from sklearn.decomposition import PCA
|
| 11 |
+
from streamlit_plotly_events import plotly_events
|
| 12 |
+
import plotly.graph_objects as go
|
| 13 |
+
import logging
|
| 14 |
# Activate tqdm with pandas
|
| 15 |
tqdm.pandas()
|
| 16 |
|
| 17 |
@st.cache_resource
|
| 18 |
def vector_compressor_from_config():
|
| 19 |
'TODO'
|
| 20 |
+
# return PCA(2)
|
| 21 |
+
return UMAP(2)
|
| 22 |
|
| 23 |
# Caching the dataframe since loading from external source can be time-consuming
|
| 24 |
@st.cache_data
|
|
|
|
| 51 |
embeddings = load_embeddings()
|
| 52 |
|
| 53 |
# Caching UMAP reduction as it's a heavy computation
|
|
|
|
| 54 |
@st.cache_resource
|
| 55 |
def reduce_embeddings(embeddings):
|
| 56 |
reducer = vector_compressor_from_config()
|
|
|
|
| 62 |
fig = px.scatter(
|
| 63 |
x=vectors_2d[:, 0],
|
| 64 |
y=vectors_2d[:, 1],
|
| 65 |
+
opacity=0.6,
|
| 66 |
hover_data={"Title": df["title"]},
|
| 67 |
labels={'x': 'UMAP Dimension 1', 'y': 'UMAP Dimension 2'},
|
| 68 |
+
title="UMAP Scatter Plot of Reddit Titles",
|
| 69 |
+
color_discrete_sequence=["#01a8d3"] # Set default blue color for points
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
# Customize the layout to adapt to browser settings (light/dark mode)
|
| 73 |
+
fig.update_layout(
|
| 74 |
+
template=None, # Let Plotly adapt automatically based on user settings
|
| 75 |
+
plot_bgcolor="rgba(0, 0, 0, 0)",
|
| 76 |
+
paper_bgcolor="rgba(0, 0, 0, 0)"
|
| 77 |
)
|
| 78 |
|
| 79 |
+
# Display the scatterplot and capture click events
|
| 80 |
+
selected_points = plotly_events(fig, click_event=True, hover_event=False, override_height=600, override_width="100%")
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# If a point is clicked, handle the embedding inversion
|
| 84 |
+
if selected_points:
|
| 85 |
+
|
| 86 |
+
clicked_point = selected_points[0]
|
| 87 |
+
x_coord = x = clicked_point['x']
|
| 88 |
+
y_coord = y = clicked_point['y']
|
| 89 |
+
st.text(f"Embeddings shape: {embeddings.shape}")
|
| 90 |
+
st.text(f"2dvector shapes shape: {vectors_2d.shape}")
|
| 91 |
+
st.text(f"Clicked point coordinates: x = {x_coord}, y = {y_coord}")
|
| 92 |
+
st.text("fOO")
|
| 93 |
+
logging.info("Foo")
|
| 94 |
+
inferred_embedding = reducer.inverse_transform(np.array([[x, y]]) if not isinstance(reducer, UMAP) else np.array([[x, y]]))
|
| 95 |
+
logging.info("Bar")
|
| 96 |
+
|
| 97 |
+
st.text("Bar")
|
| 98 |
+
|
| 99 |
+
inferred_embedding = inferred_embedding.astype("float32")
|
| 100 |
+
st.text("Bar")
|
| 101 |
+
|
| 102 |
+
output = vec2text.invert_embeddings(
|
| 103 |
+
embeddings=torch.tensor(inferred_embedding).cuda(),
|
| 104 |
+
corrector=corrector,
|
| 105 |
+
num_steps=20,
|
| 106 |
+
)
|
| 107 |
+
st.text("Bar")
|
| 108 |
+
|
| 109 |
+
st.text(str(output))
|
| 110 |
+
st.text(str(inferred_embedding))
|
| 111 |
+
else:
|
| 112 |
+
st.text("Click on a point in the scatterplot to see its coordinates.")
|