Last active
March 20, 2025 15:12
-
-
Save martenc/1a09122246451227c2b6b4577383a870 to your computer and use it in GitHub Desktop.
Visualize the embeddings with UMAP for dimensionality reduction.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from langchain_core.embeddings import Embeddings | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
from typing import Dict, List, Tuple | |
import numpy as np | |
import plotly.express as px | |
from sklearn.manifold import Isomap | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.decomposition import PCA | |
import umap.umap_ as umap | |
import warnings | |
warnings.simplefilter(action='ignore', category=FutureWarning) | |
def plot_chunk_and_query_embeddings(doc_to_embeddings_and_chunks: Dict[str, Tuple[np.ndarray, List[str]]],
                                    title: str, query: str = None, query_embedder: Embeddings = None):
    """Project document-chunk embeddings into 2D with UMAP and plot them as a scatter.

    Each document's chunks are drawn in a distinct color; hovering shows the
    (truncated) chunk text and the document name. If ``query`` is given, it is
    embedded with ``query_embedder``, pushed through the *same* fitted scaler
    and UMAP reducer, and overlaid as a black cross.

    Args:
        doc_to_embeddings_and_chunks: Maps a document name to a tuple of
            (embedding matrix of shape (n_chunks, dim), list of n_chunks texts).
        title: Plot title; the query string is appended when one is supplied.
        query: Optional query to embed and overlay on the chunk space.
        query_embedder: Embeddings model used for the query; required when
            ``query`` is not None.

    Returns:
        The plotly Figure.

    Raises:
        ValueError: If ``query`` is given without a ``query_embedder``.
    """
    scaler = StandardScaler()
    # Fixed seed + single-threaded so the layout is reproducible run-to-run.
    reducer = umap.UMAP(random_state=1, n_jobs=1)

    all_embeddings, all_labels, all_texts, all_papers = [], [], [], []
    for curr_label, (doc, (embeddings, chunks)) in enumerate(doc_to_embeddings_and_chunks.items()):
        all_embeddings.append(embeddings)
        all_labels.extend([curr_label] * embeddings.shape[0])
        all_texts.extend(chunks)
        all_papers.extend([doc] * embeddings.shape[0])

    # Standardize features before UMAP so no dimension dominates the distances.
    all_embeddings = np.vstack(all_embeddings)
    all_embeddings = scaler.fit_transform(all_embeddings)
    all_embeddings_2d = reducer.fit_transform(all_embeddings)

    # Truncate hover text so tooltips stay readable.
    all_texts = [text[:100] + "..." for text in all_texts]
    df = pd.DataFrame({
        "x": all_embeddings_2d[:, 0],
        "y": all_embeddings_2d[:, 1],
        "color": all_labels,
        "text": all_texts,
        "paper": all_papers,
    })

    if query is not None:
        title = f"{title} + ({query})"
    # hover_data expects a list of column names, not a bare string.
    fig = px.scatter(data_frame=df, x="x", y="y", color="paper", hover_data=["text"], hover_name="paper",
                     title=title, opacity=0.5)
    fig.update_layout(
        hoverlabel=dict(
            bgcolor="white",  # Set the background color
            font=dict(
                color="black",  # Set the font color
                size=12
            )
        ),
        coloraxis_showscale=False,
        showlegend=False
    )

    if query is not None:
        if query_embedder is None:
            # Explicit validation instead of `assert`, which is stripped under -O.
            raise ValueError("query_embedder is required when query is provided")
        # Transform (not fit) the query with the already-fitted scaler/reducer
        # so it lands in the same 2D space as the chunks.
        query_embedding = np.array(query_embedder.embed_query(query))
        query_embedding = np.expand_dims(query_embedding, 0)
        query_embedding = scaler.transform(query_embedding)
        query_2d_embedding = reducer.transform(query_embedding)
        fig.add_scatter(x=query_2d_embedding[:, 0], y=query_2d_embedding[:, 1], hovertext=[query],
                        mode="markers", marker=dict(size=15, color='black', symbol='cross'), name='query')
    return fig
# NOTE(review): chipnemo/nemotron/gb200 *_embeddings and *_chunks, plus the
# `embedder` model, come from earlier notebook cells — not defined here.

# Build the doc -> (embeddings, chunks) mapping once instead of repeating the
# same literal for both plots.
doc_data = {"Chipnemo": (np.array(chipnemo_embeddings), chipnemo_chunks),
            "Nemotron": (np.array(nemotron_embeddings), nemotron_chunks),
            "gb200": (np.array(gb200_embeddings), gb200_chunks)}

# visualize the chunks
fig = plot_chunk_and_query_embeddings(doc_data, title="nv-embedqa-e5-v5")
fig.show()

# try another one to get closer
query = "What is the memory of gb200?"  # i was looking at nvidia industrial gpu's i can't afford rn
fig = plot_chunk_and_query_embeddings(doc_data, title="nv-embedqa-e5-v5",
                                      query_embedder=embedder, query=query)
fig.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import pandas as pd | |
from typing import Dict, List, Tuple | |
import numpy as np | |
import plotly.express as px | |
from sklearn.manifold import Isomap | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.decomposition import PCA | |
import umap.umap_ as umap | |
from sklearn.manifold import TSNE | |
def _project_single(vector, scaler, reducer):
    """Scale and UMAP-transform one embedding vector into the fitted 2D space."""
    return reducer.transform(scaler.transform(np.expand_dims(vector, 0)))


def plot_space(dataset, query, nearest):
    """Plot a 2D UMAP projection of a dataset, highlighting a query and its nearest hit.

    Args:
        dataset: 2D array of embeddings, one row per item.
        query: A single embedding vector in the same space as ``dataset``.
            NOTE(review): assumed to have the same dimensionality as the rows
            of ``dataset`` — confirm against the caller.
        nearest: Row index into ``dataset`` of the retrieved nearest neighbor.

    Returns:
        The plotly Figure (also shown as a side effect).
    """
    scaler = StandardScaler()
    # Fixed seed + single-threaded so the layout is reproducible run-to-run.
    reducer = umap.UMAP(random_state=1, n_jobs=1)

    # Standardize before UMAP so no dimension dominates the distances.
    all_embeddings = np.vstack(dataset)
    all_embeddings = scaler.fit_transform(all_embeddings)
    all_embeddings_2d = reducer.fit_transform(all_embeddings)

    df = pd.DataFrame({
        "x": all_embeddings_2d[:, 0],
        "y": all_embeddings_2d[:, 1],
        "id": np.arange(dataset.shape[0]),  # row index, exposed in the tooltip
    })
    # hover_data expects a list of column names, not a bare string.
    fig = px.scatter(data_frame=df, x="x", y="y", hover_data=["id"], opacity=0.5)
    fig.update_layout(
        hoverlabel=dict(
            bgcolor="white",  # Set the background color
            font=dict(
                color="black",  # Set the font color
                size=12
            )
        ),
        coloraxis_showscale=False,
        showlegend=False
    )

    # Overlay both markers via the same transform-only pipeline so they land
    # in the space the chunks were fitted into.
    nearest_2d = _project_single(dataset[nearest], scaler, reducer)
    fig.add_scatter(x=nearest_2d[:, 0], y=nearest_2d[:, 1], hovertext=['nearest'],
                    mode="markers", marker=dict(size=10, color='red', symbol='cross'), name='nearest')
    query_2d = _project_single(query, scaler, reducer)
    fig.add_scatter(x=query_2d[:, 0], y=query_2d[:, 1], hovertext=['query'],
                    mode="markers", marker=dict(size=10, color='black', symbol='cross'), name='query')

    fig.show()
    return fig
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Check the shape of the embeddings: each vector should have a size of 1x1024.