import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Load model and tokenizer
model_name = "baidu/ERNIE-4.5-0.3B-PT"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float32,
).to(device)
embedding_layer = model.get_input_embeddings()

# Get a sentence embedding by averaging the input-layer token embeddings
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(device)
    with torch.no_grad():
        embeddings = embedding_layer(inputs["input_ids"])  # (1, seq_len, hidden_size)
    sentence_embedding = embeddings.mean(dim=1)  # (1, hidden_size)
    return sentence_embedding

# Show the token list and token IDs for a sentence
def tokenize_sentence(sentence):
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return list(zip(tokens, token_ids))

# PCA plot of two sentence embeddings
def plot_embeddings(sentence1, sentence2):
    emb1 = get_sentence_embedding(sentence1).cpu().numpy()
    emb2 = get_sentence_embedding(sentence2).cpu().numpy()
    embeddings = np.vstack([emb1, emb2])  # Shape: (2, hidden_size)

    # PCA to reduce to 2D
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(embeddings)

    # Plot
    fig, ax = plt.subplots()
    ax.scatter(reduced[:, 0], reduced[:, 1], color=["red", "blue"])
    ax.annotate("Sentence 1", (reduced[0, 0], reduced[0, 1]), color="red")
    ax.annotate("Sentence 2", (reduced[1, 0], reduced[1, 1]), color="blue")
    ax.set_title("2D PCA of Sentence Embeddings")
    ax.set_xlabel("PCA 1")
    ax.set_ylabel("PCA 2")
    ax.grid(True)
    return fig

# Main function that produces all outputs for the interface
def analyze_sentences(sentence1, sentence2):
    # Cosine similarity
    emb1 = get_sentence_embedding(sentence1)
    emb2 = get_sentence_embedding(sentence2)
    similarity = F.cosine_similarity(emb1, emb2).item()

    # Token info
    tokens1 = tokenize_sentence(sentence1)
    tokens2 = tokenize_sentence(sentence2)

    # Plot
    fig = plot_embeddings(sentence1, sentence2)

    return f"Similarity: {similarity:.4f}", tokens1, tokens2, fig

# Build Gradio interface
demo = gr.Interface(
    fn=analyze_sentences,
    inputs=[
        gr.Textbox(label="Sentence 1", placeholder="I love cats."),
        gr.Textbox(label="Sentence 2", placeholder="I love dogs."),
    ],
    outputs=[
        gr.Textbox(label="Cosine Similarity Score"),
        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 1 Tokens"),
        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 2 Tokens"),
        gr.Plot(label="2D PCA Plot of Embeddings"),
    ],
    title="ERNIE 4.5 Embedding Visualization",
    description="Compare two sentences using ERNIE 4.5-0.3B's embedding layer. Outputs cosine similarity, token info, and a PCA plot.",
)

if __name__ == "__main__":
    demo.launch()
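
# Optional smoke test (a sketch, not part of the app): uncomment to exercise the
# pipeline directly without launching the Gradio UI. It assumes the model weights
# have been downloaded successfully and reuses analyze_sentences() defined above;
# the output file name "embedding_pca.png" is an arbitrary example.
#
#     text, toks1, toks2, fig = analyze_sentences("I love cats.", "I love dogs.")
#     print(text)                      # cosine similarity string, e.g. "Similarity: ..."
#     print(toks1, toks2, sep="\n")    # (token, token_id) pairs for each sentence
#     fig.savefig("embedding_pca.png") # save the PCA scatter plot to disk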