"""Gradio app that benchmarks LLM-written palindromes across languages.

For each language in ``languages`` a generator model is asked to write an
original palindrome. The output is checked for being a palindrome, its
language is detected, and each rater model is asked for a 0-100 grammar
score. The results are shown as a table sorted by final score.
"""

import re

import gradio as gr
import numpy as np
import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from transformers import pipeline

# Load models for generation and rating
gen_model = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1")
rater_models = [
    pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta"),
    # FLAN-T5 is an encoder-decoder (seq2seq) model, so it is loaded with the
    # text2text task rather than the causal-LM "text-generation" task.
    pipeline("text2text-generation", model="google/flan-t5-large"),
]

# Language list (language code -> English name used inside the prompts)
languages = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "pt": "Portuguese",
    "ru": "Russian",
    "ar": "Arabic",
    "hi": "Hindi",
    "ja": "Japanese",
}

# First run of one to three digits in a rater reply.
_SCORE_RE = re.compile(r"\d{1,3}")


def clean_text(text: str) -> str:
    """Return *text* lower-cased with every non-alphanumeric character removed.

    ``str.isalnum`` is used instead of an ASCII-only character class so that
    letters of non-Latin scripts (and accented Latin letters) are kept rather
    than erased. Combining marks are not alphanumeric and are dropped.
    """
    return "".join(ch for ch in text.lower() if ch.isalnum())


def is_palindrome(text: str) -> bool:
    """Return True if the cleaned form of *text* reads the same both ways.

    Text that has no alphanumeric characters at all is not a palindrome;
    otherwise an empty model reply would be reported as valid.
    """
    cleaned = clean_text(text)
    return bool(cleaned) and cleaned == cleaned[::-1]


def grammar_prompt(pal: str, lang: str) -> str:
    """Build the rater prompt asking for a 0-100 grammar score of *pal* in *lang*."""
    return (
        f"Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. "
        f'Only return a number with no explanation:\n\n"{pal}"\n'
    )


def extract_score(text: str) -> int:
    """Return the first 1-3 digit number in *text*, capped at 100.

    Returns 0 when *text* contains no digits.
    """
    match = _SCORE_RE.search(text)
    if match is None:
        return 0
    # The pattern only matches digits, so the value is never negative.
    return min(int(match.group()), 100)


def _generation_prompt(lang: str) -> str:
    """Build the prompt asking the generator for an original palindrome in *lang*."""
    return (
        f"Write the longest original palindrome you can in {lang}. "
        "It should be creative and not a known palindrome. \n"
        "If it is not a correct palindrome, you will lose points according to how correct it is."
    )


def _strip_prompt(output: str, prompt: str) -> str:
    """Remove a leading copy of *prompt* from *output* and trim whitespace.

    Text-generation pipelines return the prompt followed by the continuation
    by default. Left in place, the echoed prompt would be scored as part of
    the palindrome, and the "0" in "Rate from 0 to 100" would be picked up as
    the grammar score.
    """
    if output.startswith(prompt):
        output = output[len(prompt):]
    return output.strip()


def _detect_language(text: str) -> str:
    """Return the language code detected for *text*, or "unknown" on failure."""
    try:
        return detect(text)
    except LangDetectException:
        # Raised by langdetect when the text has nothing to detect from
        # (for example an empty or punctuation-only reply).
        return "unknown"


def run_benchmark() -> pd.DataFrame:
    """Generate and score one palindrome per language.

    Returns:
        A DataFrame with one row per language, sorted by "Final Score" in
        descending order. The final score is the cleaned length multiplied by
        the average grammar score as a fraction, halved when the text is not
        a valid palindrome.
    """
    results = []
    for lang in languages.values():
        prompt = _generation_prompt(lang)
        raw_output = gen_model(prompt, max_new_tokens=100, do_sample=True)[0]["generated_text"]
        gen_output = _strip_prompt(raw_output, prompt)

        valid = is_palindrome(gen_output)
        cleaned_len = len(clean_text(gen_output))
        detected_lang = _detect_language(gen_output)

        # The rater prompt is the same for every rater, so build it once.
        rprompt = grammar_prompt(gen_output, lang)
        scores = []
        for rater in rater_models:
            rtext = rater(rprompt, max_new_tokens=10)[0]["generated_text"]
            scores.append(extract_score(_strip_prompt(rtext, rprompt)))

        avg_score = float(np.mean(scores))
        multiplier = avg_score / 100
        if not valid:
            # Invalid palindromes keep only half of their grammar credit.
            multiplier *= 0.5
        final_score = round(cleaned_len * multiplier, 2)

        results.append({
            "Language": lang,
            "Palindrome": gen_output,
            "Valid": "✅" if valid else "❌",
            "Length": cleaned_len,
            "Grammar Score": avg_score,
            "Final Score": final_score,
            "Detected Lang": detected_lang,
        })

    df = pd.DataFrame(results)
    return df.sort_values(by="Final Score", ascending=False).reset_index(drop=True)


iface = gr.Interface(
    fn=run_benchmark,
    inputs=[],
    outputs="dataframe",
    title="🔁 LLM Palindrome Benchmark",
)
iface.launch()