Skip to content

Instantly share code, notes, and snippets.

@martenc
Last active March 20, 2025 15:12
Show Gist options
  • Save martenc/1a09122246451227c2b6b4577383a870 to your computer and use it in GitHub Desktop.
Save martenc/1a09122246451227c2b6b4577383a870 to your computer and use it in GitHub Desktop.
Visualize the embeddings with UMAP for dimensionality reduction.
from langchain_core.embeddings import Embeddings
import matplotlib.pyplot as plt
import pandas as pd
from typing import Dict, List, Tuple
import numpy as np
import plotly.express as px
from sklearn.manifold import Isomap
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import umap.umap_ as umap
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def plot_chunk_and_query_embeddings(
    doc_to_embeddings_and_chunks: Dict[str, Tuple[np.ndarray, List[str]]],
    title: str,
    query: str = None,
    query_embedder: Embeddings = None,
):
    """Plot a 2-D UMAP projection of per-document chunk embeddings.

    Each document's chunks are standardized, reduced to 2-D with UMAP, and
    drawn as a colored scatter (one color per document) with the chunk text
    available on hover. If ``query`` is given, it is embedded with
    ``query_embedder``, projected through the *same* fitted scaler/reducer,
    and overlaid as a black cross.

    Args:
        doc_to_embeddings_and_chunks: Maps a document name to a tuple of
            (embedding matrix of shape (n_chunks, dim), list of chunk texts).
            The list length must match the matrix's first dimension.
        title: Base plot title.
        query: Optional query string to overlay on the plot.
        query_embedder: Embeddings model used for the query; required
            (asserted) when ``query`` is provided.

    Returns:
        The plotly Figure.
    """
    scaler = StandardScaler()
    reducer = umap.UMAP(random_state=1, n_jobs=1)

    all_embeddings, all_labels, all_texts, all_papers = [], [], [], []
    # enumerate() replaces the hand-rolled counter; one integer label per doc.
    for label, (doc, (embeddings, chunks)) in enumerate(
            doc_to_embeddings_and_chunks.items()):
        all_embeddings.append(embeddings)
        all_labels.extend([label] * embeddings.shape[0])
        all_texts.extend(chunks)
        all_papers.extend([doc] * embeddings.shape[0])

    all_embeddings = np.vstack(all_embeddings)
    all_embeddings = scaler.fit_transform(all_embeddings)
    all_embeddings_2d = reducer.fit_transform(all_embeddings)

    # Truncate hover text to 100 chars; only add an ellipsis when text was
    # actually cut (the original appended "..." unconditionally).
    all_texts = [text[:100] + "..." if len(text) > 100 else text
                 for text in all_texts]

    df = pd.DataFrame({
        "x": all_embeddings_2d[:, 0],
        "y": all_embeddings_2d[:, 1],
        "color": all_labels,
        "text": all_texts,
        "paper": all_papers,
    })

    if query is not None:
        title = f"{title} + ({query})"

    fig = px.scatter(data_frame=df, x="x", y="y", color="paper",
                     hover_data="text", hover_name="paper",
                     title=title, opacity=0.5)
    fig.update_layout(
        hoverlabel=dict(
            bgcolor="white",  # Set the background color
            font=dict(
                color="black",  # Set the font color
                size=12,
            )
        ),
        coloraxis_showscale=False,
        showlegend=False,
    )

    if query is not None:
        assert query_embedder is not None
        query_embedding = np.array(query_embedder.embed_query(query))
        query_embedding = np.expand_dims(query_embedding, 0)
        # Reuse the fitted scaler/reducer so the query lands in the same space.
        query_embedding = scaler.transform(query_embedding)
        query_2d_embedding = reducer.transform(query_embedding)
        fig.add_scatter(x=query_2d_embedding[:, 0], y=query_2d_embedding[:, 1],
                        hovertext=[query], mode="markers",
                        marker=dict(size=15, color='black', symbol='cross'),
                        name='query')
    return fig
# Visualize the chunks in 2-D embedding space.
doc_data = {
    "Chipnemo": (np.array(chipnemo_embeddings), chipnemo_chunks),
    "Nemotron": (np.array(nemotron_embeddings), nemotron_chunks),
    "gb200": (np.array(gb200_embeddings), gb200_chunks),
}
fig = plot_chunk_and_query_embeddings(doc_data, title="nv-embedqa-e5-v5")
fig.show()

# Try a query to see where it lands relative to the chunks.
query = "What is the memory of gb200?"  # NVIDIA industrial GPUs I can't afford right now
fig = plot_chunk_and_query_embeddings(doc_data, title="nv-embedqa-e5-v5",
                                      query_embedder=embedder, query=query)
fig.show()
import matplotlib.pyplot as plt
import pandas as pd
from typing import Dict, List, Tuple
import numpy as np
import plotly.express as px
from sklearn.manifold import Isomap
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import umap.umap_ as umap
from sklearn.manifold import TSNE
def plot_space(dataset, query, nearest):
    """Scatter-plot a 2-D UMAP projection of an embedding dataset.

    Standardizes ``dataset``, reduces it to 2-D with UMAP, and shows every
    row as a point (its row index on hover). Two markers are overlaid via
    the same fitted scaler/reducer: the row at index ``nearest`` (red cross)
    and the raw ``query`` embedding vector (black cross). Displays the
    figure; returns None.
    """
    scaler = StandardScaler()
    reducer = umap.UMAP(random_state=1, n_jobs=1)

    scaled = scaler.fit_transform(np.vstack(dataset))
    points_2d = reducer.fit_transform(scaled)

    df = pd.DataFrame({
        "x": points_2d[:, 0],
        "y": points_2d[:, 1],
        "id": np.arange(0, dataset.shape[0]),
    })
    fig = px.scatter(data_frame=df, x="x", y="y", hover_data="id", opacity=0.5)
    fig.update_layout(
        hoverlabel=dict(
            bgcolor="white",  # Set the background color
            font=dict(
                color="black",  # Set the font color
                size=12,
            ),
        ),
        coloraxis_showscale=False,
        showlegend=False,
    )

    def _project(vec):
        # Scale with the already-fitted scaler, then map into the fitted
        # 2-D UMAP space (fit_transform would build a different embedding).
        return reducer.transform(scaler.transform(np.expand_dims(vec, 0)))

    # Overlay the nearest-neighbor row and the query vector as crosses.
    for vec, label, color in ((dataset[nearest], 'nearest', 'red'),
                              (query, 'query', 'black')):
        pt = _project(vec)
        fig.add_scatter(x=pt[:, 0], y=pt[:, 1], hovertext=[label],
                        mode="markers",
                        marker=dict(size=10, color=color, symbol='cross'),
                        name=label)
    fig.show()
@martenc
Copy link
Author

martenc commented Mar 20, 2025

Check the shape of embeddings. Each vector should have a size of 1x1024.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment