|
|
|
"""
Gradio demo that compares the token-level GraphCodeBERT embeddings of two code snippets using PCA.

Reference:
Martinez-Gil, J. (2025). Augmenting the Interpretability of GraphCodeBERT for Code Similarity Tasks.
International Journal of Software Engineering and Knowledge Engineering, 35(05), 657–678.
"""
|
|
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
from sklearn.decomposition import PCA |
|
from transformers import RobertaTokenizer, RobertaModel |
|
import torch |
|
import gradio as gr |
|
from io import BytesIO |
|
from PIL import Image |
|
|
|
|
|
# Identifier of the pretrained GraphCodeBERT checkpoint and the local
# directory passed as ``cache_dir`` when loading it.
_CHECKPOINT = "microsoft/graphcodebert-base"
_CACHE_DIR = "models/"

# Both objects are created once at import time and reused by
# ``get_token_embeddings`` on every call.
tokenizer = RobertaTokenizer.from_pretrained(_CHECKPOINT, cache_dir=_CACHE_DIR)
model = RobertaModel.from_pretrained(_CHECKPOINT, cache_dir=_CACHE_DIR)
|
|
|
|
|
# Sample shown in the "Code 1" editor when the app starts. The snippet keeps
# its block indentation so that it is valid, runnable Python on its own.
default_code_1 = """def bubble_sort(arr):
    n = len(arr)
    for i in range(n):
        for j in range(0, n-i-1):
            if arr[j] > arr[j+1]:
                arr[j], arr[j+1] = arr[j+1], arr[j]
    return arr"""
|
|
|
# Sample shown in the "Code 2" editor when the app starts. The snippet keeps
# its block indentation so that it is valid, runnable Python on its own.
default_code_2 = """def quick_sort(arr, low, high):
    if low < high:
        pi = partition(arr, low, high)
        quick_sort(arr, low, pi - 1)
        quick_sort(arr, pi + 1, high)

def partition(arr, low, high):
    i = (low - 1)
    pivot = arr[high]
    for j in range(low, high):
        if arr[j] <= pivot:
            i += 1
            arr[i], arr[j] = arr[j], arr[i]
    arr[i+1], arr[high] = arr[high], arr[i+1]
    return (i + 1)"""
|
|
|
|
|
def get_token_embeddings(code):
    """Return the model's per-token hidden states for ``code``.

    The text is tokenized (truncated to at most 512 tokens) and run through
    the module-level ``model`` with gradient tracking disabled.

    Args:
        code: Source text to embed.

    Returns:
        A ``(token_embeddings, tokens)`` tuple: the last hidden state as a
        NumPy array with its leading axis squeezed, and the token strings
        obtained by mapping the input ids back through the tokenizer.
    """
    encoded = tokenizer(
        code,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        result = model(**encoded)

    hidden = result.last_hidden_state.squeeze(0)
    token_embeddings = hidden.cpu().numpy()

    input_ids = encoded['input_ids'].squeeze()
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    return token_embeddings, tokens
|
|
|
|
|
def compare_algorithms(code1, code2):
    """Render a joint 2-D PCA scatter plot of two snippets' token embeddings.

    Args:
        code1: Source text of the first snippet (plotted in red, "Code 1").
        code2: Source text of the second snippet (plotted in blue, "Code 2").

    Returns:
        A PIL image opened from the PNG rendering of the scatter plot.
    """
    # Local import: a standalone Figure is not registered with pyplot, so the
    # handler neither depends on pyplot's process-wide "current figure" (which
    # concurrent requests would share) nor needs an explicit close to avoid
    # leaking the figure if rendering raises.
    from matplotlib.figure import Figure

    emb1, tokens1 = get_token_embeddings(code1)
    emb2, _ = get_token_embeddings(code2)

    # One PCA is fitted on both snippets together so that their points live
    # in the same 2-D coordinate system.
    combined = np.concatenate([emb1, emb2], axis=0)
    coords = PCA(n_components=2).fit_transform(combined)

    # Rows before `split` belong to code1, the remaining rows to code2.
    split = len(tokens1)
    first, second = coords[:split], coords[split:]

    fig = Figure(figsize=(6, 5), dpi=150)
    ax = fig.subplots()
    ax.scatter(first[:, 0], first[:, 1], color='red', label="Code 1", s=20)
    ax.scatter(second[:, 0], second[:, 1], color='blue', label="Code 2", s=20)
    ax.legend()
    # Tick marks and grid are removed from the plot.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.grid(False)

    buf = BytesIO()
    fig.savefig(buf, format='png', bbox_inches='tight')
    buf.seek(0)
    return Image.open(buf)
|
|
|
# Markdown passed as the interface's ``article``: citation and repository link.
_ARTICLE_MARKDOWN = """
**Citation**
Martinez-Gil, J. (2025). *Augmenting the Interpretability of GraphCodeBERT for Code Similarity Tasks.* International Journal of Software Engineering and Knowledge Engineering, 35(05), 657–678.

**GitHub Repository**
[View Source on GitHub](https://github.com/jorge-martinez-gil/graphcodebert-interpretability)
"""

# Two Python code editors go in, one PIL image (the PCA scatter plot) comes out.
interface = gr.Interface(
    fn=compare_algorithms,
    inputs=[
        gr.Code(language="python", value=default_code_1, label="Code 1"),
        gr.Code(language="python", value=default_code_2, label="Code 2"),
    ],
    outputs=gr.Image(type="pil", label="Token Embedding PCA"),
    title="GraphCodeBERT Token Embedding Comparison",
    description=(
        "Edit or paste two Python code snippets. "
        "This tool compares their token-level embeddings using GraphCodeBERT and PCA."
    ),
    article=_ARTICLE_MARKDOWN,
)


# Start the app only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()
|
|
|
|
|
|
|
|