import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Load model and tokenizer
model_name = "baidu/ERNIE-4.5-0.3B-PT"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float32
).to(device)
embedding_layer = model.get_input_embeddings()
# Get sentence embedding by averaging token embeddings
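# Note: this pools the static input-embedding table (model.get_input_embeddings()),
# not contextual hidden states, so the resulting similarity mainly reflects
# token-level overlap rather than full sentence meaning.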
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(device)
    with torch.no_grad():
        embeddings = embedding_layer(inputs["input_ids"])
        sentence_embedding = embeddings.mean(dim=1)
    return sentence_embedding

# Show token list and token IDs
def tokenize_sentence(sentence):
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return list(zip(tokens, token_ids))

# PCA plot of two sentence embeddings
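# With only two samples, the second principal component carries no variance,
# so the two points effectively spread along a single axis of the plot.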
def plot_embeddings(sentence1, sentence2):
    emb1 = get_sentence_embedding(sentence1).cpu().numpy()
    emb2 = get_sentence_embedding(sentence2).cpu().numpy()
    embeddings = np.vstack([emb1, emb2])  # Shape: (2, hidden_size)

    # PCA to reduce to 2D
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(embeddings)

    # Plot
    fig, ax = plt.subplots()
    ax.scatter(reduced[:, 0], reduced[:, 1], color=["red", "blue"])
    ax.annotate("Sentence 1", (reduced[0, 0], reduced[0, 1]), color="red")
    ax.annotate("Sentence 2", (reduced[1, 0], reduced[1, 1]), color="blue")
    ax.set_title("2D PCA of Sentence Embeddings")
    ax.set_xlabel("PCA 1")
    ax.set_ylabel("PCA 2")
    ax.grid(True)
    return fig

# Main function to run all outputs
def analyze_sentences(sentence1, sentence2):
    # Cosine similarity
    emb1 = get_sentence_embedding(sentence1)
    emb2 = get_sentence_embedding(sentence2)
    similarity = F.cosine_similarity(emb1, emb2).item()

    # Token info
    tokens1 = tokenize_sentence(sentence1)
    tokens2 = tokenize_sentence(sentence2)

    # Plot
    fig = plot_embeddings(sentence1, sentence2)

    return f"Similarity: {similarity:.4f}", tokens1, tokens2, fig

# Build Gradio interface
demo = gr.Interface(
    fn=analyze_sentences,
    inputs=[
        gr.Textbox(label="Sentence 1", placeholder="I love cats."),
        gr.Textbox(label="Sentence 2", placeholder="I love dogs."),
    ],
    outputs=[
        gr.Textbox(label="Cosine Similarity Score"),
        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 1 Tokens"),
        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 2 Tokens"),
        gr.Plot(label="2D PCA Plot of Embeddings"),
    ],
    title="ERNIE 4.5 Embedding Visualization",
    description="Compare two sentences using ERNIE 4.5-0.3B's embedding layer. Outputs cosine similarity, token info, and a PCA plot.",
)

if __name__ == "__main__":
    demo.launch()