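"""Gradio demo: compare two sentences using the input embedding layer of ERNIE-4.5-0.3B.

Each sentence is tokenized, its token embeddings are mean-pooled into a single
vector, and the app reports the cosine similarity between the two vectors, the
token/ID breakdown of each sentence, and a 2D PCA plot of the two embeddings.
"""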
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Load model and tokenizer
model_name = "baidu/ERNIE-4.5-0.3B-PT"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float32
).to(device)

embedding_layer = model.get_input_embeddings()

# Get sentence embedding by averaging token embeddings
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(device)
    with torch.no_grad():
        embeddings = embedding_layer(inputs["input_ids"])
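        # embeddings has shape (1, seq_len, hidden_size); averaging over dim=1
        # pools the token embeddings into one sentence-level vector.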
        sentence_embedding = embeddings.mean(dim=1)
    return sentence_embedding

# Show token list and token IDs
def tokenize_sentence(sentence):
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return list(zip(tokens, token_ids))

# PCA plot of two sentence embeddings
def plot_embeddings(sentence1, sentence2):
    emb1 = get_sentence_embedding(sentence1).cpu().numpy()
    emb2 = get_sentence_embedding(sentence2).cpu().numpy()
    embeddings = np.vstack([emb1, emb2])  # Shape: (2, hidden_size)
    
    # PCA to reduce to 2D
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(embeddings)
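    # Note: with only two samples the centered data is rank-1, so the second
    # principal component carries (near) zero variance; the plot mainly shows
    # separation along PCA 1.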

    # Plot
    fig, ax = plt.subplots()
    ax.scatter(reduced[:, 0], reduced[:, 1], color=["red", "blue"])
    ax.annotate("Sentence 1", (reduced[0, 0], reduced[0, 1]), color="red")
    ax.annotate("Sentence 2", (reduced[1, 0], reduced[1, 1]), color="blue")
    ax.set_title("2D PCA of Sentence Embeddings")
    ax.set_xlabel("PCA 1")
    ax.set_ylabel("PCA 2")
    ax.grid(True)
    return fig

# Main function to run all outputs
def analyze_sentences(sentence1, sentence2):
    # Cosine similarity
    emb1 = get_sentence_embedding(sentence1)
    emb2 = get_sentence_embedding(sentence2)
    similarity = F.cosine_similarity(emb1, emb2).item()

    # Token info
    tokens1 = tokenize_sentence(sentence1)
    tokens2 = tokenize_sentence(sentence2)

    # Plot
    fig = plot_embeddings(sentence1, sentence2)

    return f"Similarity: {similarity:.4f}", tokens1, tokens2, fig

# Build Gradio interface
demo = gr.Interface(
    fn=analyze_sentences,
    inputs=[
        gr.Textbox(label="Sentence 1", placeholder="I love cat."),
        gr.Textbox(label="Sentence 2", placeholder="I love dog."),
    ],
    outputs=[
        gr.Textbox(label="Cosine Similarity Score"),
        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 1 Tokens"),
        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 2 Tokens"),
        gr.Plot(label="2D PCA Plot of Embeddings"),
    ],
    title="ERNIE 4.5 Embedding Visualization",
    description="Compare two sentences using ERNIE 4.5-0.3B's embedding layer. Outputs cosine similarity, token info, and PCA plot.",
)

if __name__ == "__main__":
    demo.launch()
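    # demo.launch(share=True) would additionally expose a temporary public URL.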