import gradio as gr
import pandas as pd
import numpy as np
import re
import unicodedata
import ftfy
import nltk
import os
import json
import time
from typing import Dict, Any, List, Tuple, Optional
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
import google.generativeai as genai
from groq import Groq
from dotenv import load_dotenv

# Download necessary NLTK resources
# Runs at import time; quiet=True suppresses the downloader's console output.
# NOTE(review): presumably 'wordnet' backs the METEOR metric imported above and
# 'punkt' is kept for tokenization - confirm before removing either download.
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

# Load environment variables
# Must run before the os.getenv() lookups below so keys supplied through
# python-dotenv are visible to them.
load_dotenv()

# Initialize API clients (with graceful fallback if keys missing)
# GEMINI_API_KEY doubles as the "Gemini is usable" flag: GeminiProvider and
# evaluate_text only test its truthiness. It ends up None when the variable is
# unset or when genai.configure() raises.
try:
    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
    if GEMINI_API_KEY:
        genai.configure(api_key=GEMINI_API_KEY)
    else:
        print("Warning: GEMINI_API_KEY not found in environment variables")
except Exception as e:
    print(f"Error configuring Gemini: {str(e)}")
    GEMINI_API_KEY = None

# groq_client is the shared Groq client used by every GroqProvider instance;
# it is None whenever the key is missing or the client could not be built.
try:
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
    if GROQ_API_KEY:
        groq_client = Groq(api_key=GROQ_API_KEY)
    else:
        print("Warning: GROQ_API_KEY not found in environment variables")
        groq_client = None
except Exception as e:
    print(f"Error configuring Groq: {str(e)}")
    groq_client = None

# Text cleaning function
# Mojibake sequences that may survive ftfy, mapped to their ASCII equivalents.
# They contain precomposed letters, so they can only match BEFORE NFKD
# normalization splits those letters into base + combining mark.
_MOJIBAKE_REPLACEMENTS = {
    'â€œ': '"', 'â€': '"', 'â€“': '-', 'â€”': '--',
    'â€¢': '*', 'â€¦': '...', 'Â': ''
}

# Typographic punctuation that has no NFKD decomposition to ASCII (dashes,
# bullets) or that should become straight quotes. Without this table the
# non-ASCII strip below would delete it outright, e.g. "2021\u20132023"
# would collapse to "20212023".
_ASCII_PUNCTUATION = str.maketrans({
    '\u2018': "'", '\u2019': "'",
    '\u201c': '"', '\u201d': '"',
    '\u2013': '-', '\u2014': '--',
    '\u2022': '*', '\u2026': '...',
})


def clean_text(text: str) -> str:
    """Return *text* as whitespace-normalized plain ASCII.

    Processing order:

    1. ``ftfy.fix_text`` repairs encoding artifacts.
    2. Leftover mojibake sequences are replaced, longest sequence first.
    3. Smart quotes, dashes, bullets and ellipses become ASCII equivalents.
    4. NFKD normalization decomposes accented letters so that the base
       letter survives the next step.
    5. Every remaining non-ASCII character is removed.
    6. Runs of whitespace collapse to a single space; ends are trimmed.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned string, or ``""`` when *text* is not a string or contains
        only whitespace.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    text = ftfy.fix_text(text)  # Fixes encoding artifacts

    # Longest keys first: the two-character prefix would otherwise consume the
    # start of every three-character sequence and leave them unmatched.
    for broken in sorted(_MOJIBAKE_REPLACEMENTS, key=len, reverse=True):
        text = text.replace(broken, _MOJIBAKE_REPLACEMENTS[broken])

    # Map real typographic punctuation before anything non-ASCII is dropped.
    text = text.translate(_ASCII_PUNCTUATION)

    text = unicodedata.normalize('NFKD', text)
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Normalize whitespace
    return ' '.join(text.split())

# LLM Provider classes
class LLMProvider:
    """Common base for the text-generation backends in this module.

    It only stores the model identifier; subclasses supply ``generate``.
    """

    def __init__(self, model_name: str):
        """Remember *model_name* as ``self.model_name``."""
        self.model_name = model_name

    def get_model_name(self) -> str:
        """Return the model identifier passed to the constructor."""
        return self.model_name

    def generate(self, prompt: str) -> str:
        """Produce text for *prompt*.

        Raises:
            NotImplementedError: Always, on the base class.
        """
        raise NotImplementedError

class GeminiProvider(LLMProvider):
    """Provider that generates text through ``genai.GenerativeModel``.

    ``available`` is True only when ``GEMINI_API_KEY`` is truthy and the model
    object was created without raising. ``self.model`` exists only in that
    case.
    """

    def __init__(self, model_name: str = "gemini-1.5-flash-latest"):
        """Create the underlying model when an API key is configured."""
        super().__init__(model_name)
        self.available = bool(GEMINI_API_KEY)
        if not self.available:
            # No key: stay disabled and skip model construction entirely.
            return
        try:
            self.model = genai.GenerativeModel(model_name)
        except Exception as e:
            print(f"Error initializing Gemini model: {str(e)}")
            self.available = False

    def generate(self, prompt: str) -> str:
        """Return the model's text for *prompt*.

        Failures are reported through the return value rather than raised:
        the result is an ``"Error..."`` message when the provider is
        unavailable or the API call raises.
        """
        if self.available:
            try:
                return self.model.generate_content(prompt).text
            except Exception as e:
                return f"Error generating with Gemini: {str(e)}"
        return "Error: Gemini API not configured properly. Check your API key."

class GroqProvider(LLMProvider):
    """Provider that generates text through the module-level ``groq_client``."""

    def __init__(self, model_name: str = "llama3-70b-8192"):
        """Record the model name; availability mirrors ``groq_client``."""
        super().__init__(model_name)
        self.available = bool(groq_client)

    def generate(self, prompt: str) -> str:
        """Return the chat completion text for *prompt*.

        The prompt is sent as a single user message at temperature 0.3.
        Failures are reported through the return value rather than raised:
        the result is an ``"Error..."`` message when the provider is
        unavailable or the API call raises.
        """
        if self.available:
            request = {
                "messages": [{"role": "user", "content": prompt}],
                "model": self.model_name,
                "temperature": 0.3,
            }
            try:
                completion = groq_client.chat.completions.create(**request)
                return completion.choices[0].message.content
            except Exception as e:
                return f"Error generating with Groq: {str(e)}"
        return "Error: Groq API not configured properly. Check your API key."

# Prompt templates
# Maps the prompt style name selected by the user to the full rewrite prompt.
# Each template carries a literal "{text}" marker that process_input fills in
# with str.replace (not str.format), so braces inside the user's text are
# harmless. Both templates end with "Output:" to cue the model's answer.
PROMPT_TEMPLATES = {
    "Strategic Narrative Architect": """Role: Strategic Narrative Architect
You are a professional content writer with expertise in creating engaging, well-structured narratives.
Your task is to rewrite the following text in a professional, engaging style while preserving all key facts and information:
{text}
Instructions:
1. Maintain all factual information and key details
2. Improve structure and flow for better readability
3. Enhance engagement through appropriate storytelling techniques
4. Use professional language appropriate for the content domain
5. Ensure the output is concise yet comprehensive
6. Begin directly with the content - do NOT include introductory phrases like "Here's a rewritten version" or "Rewritten content"
7. Write as if this is the final published version, not as a response to a rewrite request

Output:""",
    
    "Precision Storyteller": """Role: Precision Storyteller
You are a professional editor focused on accuracy, clarity, and precision.
Your task is to rewrite the following text with maximum factual accuracy while improving clarity:
{text}
Instructions:
1. Preserve all factual information with absolute precision
2. Correct any grammatical errors or awkward phrasing
3. Ensure logical flow and coherence
4. Use clear, concise language without unnecessary embellishment
5. Maintain professional tone appropriate for the content domain
6. Begin directly with the content - do NOT include introductory phrases like "Here's a rewritten version" or "Rewritten content"
7. Write as if this is the final published version, not as a response to a rewrite request

Output:"""
}
# Metric normalization ranges
# Maps metric name -> (min, max) of the raw score. normalize_score rescales a
# raw value linearly over this interval and clamps the result to [0, 1], so a
# raw value at or above the upper bound counts as a perfect 1.0.
# NOTE(review): the narrower bounds (e.g. BERTScore 0.7-0.95) look like
# empirically chosen "typical" ranges - confirm their origin before tuning.
NORMALIZATION_RANGES = {
    "AnswerRelevancy": (0.0, 1.0),
    "Faithfulness": (0.0, 1.0),
    "GEval": (0.0, 1.0),
    "BERTScore": (0.7, 0.95),
    "ROUGE": (0.0, 0.6),
    "BLEU": (0.0, 0.4),
    "METEOR": (0.0, 0.6)
}

# Metric weights
# Relative importance of each metric in the hybrid score. The weights sum to
# 0.75 rather than 1.0; calculate_weighted_score divides by the total weight
# of the metrics actually present, so only the ratios between weights matter.
METRIC_WEIGHTS = {
    "AnswerRelevancy": 0.10,
    "Faithfulness": 0.10,
    "GEval": 0.025,
    "BERTScore": 0.20,
    "ROUGE": 0.15,
    "BLEU": 0.025,
    "METEOR": 0.15
}

def normalize_score(metric: str, value: float) -> float:
    """Rescale a raw metric value onto [0, 1] using its configured range.

    Args:
        metric: Metric name, looked up in ``NORMALIZATION_RANGES``.
        value: Raw score for that metric.

    Returns:
        The linearly rescaled score clamped to [0, 1]; ``0.5`` when the
        configured range is empty or inverted; *value* unchanged when the
        metric has no configured range or *value* is not an int/float.
    """
    if metric in NORMALIZATION_RANGES and isinstance(value, (int, float)):
        low, high = NORMALIZATION_RANGES[metric]
        if high <= low:
            # Degenerate range: fall back to the midpoint.
            return 0.5
        scaled = (value - low) / (high - low)
        # Clamp so out-of-range raw values cannot leave [0, 1].
        return max(0.0, min(scaled, 1.0))

    # Unknown metric or non-numeric value: pass through untouched.
    return value

def calculate_weighted_score(scores: Dict[str, float]) -> float:
    """Combine raw metric scores into one weighted average.

    Every entry of *scores* is first passed through ``normalize_score``.
    Metrics listed in ``METRIC_WEIGHTS`` but missing from *scores* are
    skipped, and the sum is divided by the total weight of the metrics that
    were present.

    Args:
        scores: Mapping of metric name to raw score.

    Returns:
        The weighted average of the normalized scores, or ``0`` when none of
        the weighted metrics is present.
    """
    normalized = {}
    for name, raw in scores.items():
        normalized[name] = normalize_score(name, raw)

    accumulated = 0
    weight_total = 0
    for name, weight in METRIC_WEIGHTS.items():
        if name not in normalized:
            continue
        accumulated += normalized[name] * weight
        weight_total += weight

    if weight_total > 0:
        return accumulated / weight_total
    return 0

# Question put to the LLM judge for each judge-based metric. Each value is a
# tuple of prompt lines; the shared prompt frame is added by _ask_judge.
_JUDGE_QUESTIONS = {
    "AnswerRelevancy": (
        "On a scale of 0.0 to 1.0, how relevant is the following candidate text to the input?",
    ),
    "Faithfulness": (
        "On a scale of 0.0 to 1.0, how faithful is the candidate text to the original input in terms of factual accuracy?",
    ),
    "GEval": (
        "On a scale of 0.0 to 1.0, evaluate the overall quality of the candidate text.",
        "Consider accuracy, completeness, fluency, and professionalism.",
    ),
}

# Neutral score used when the judge is unavailable or its reply is unusable.
_JUDGE_DEFAULT = 0.5

# (minimum hybrid score, label) pairs, checked from best to worst.
_INTERPRETATION_BANDS = (
    (0.85, "Outstanding performance (A) - ready for professional use"),
    (0.70, "Strong performance (B) - good quality with minor improvements"),
    (0.50, "Adequate performance (C) - usable but needs refinement"),
    (0.30, "Weak performance (D) - requires significant revision"),
)
_LOWEST_INTERPRETATION = "Poor performance (F) - likely needs complete rewriting"


def _truncate_for_judge(text: str, limit: int = 500) -> str:
    """Return the first *limit* characters of *text*, plus '...' if it was cut."""
    return f"{text[:limit]}{'...' if len(text) > limit else ''}"


def _ask_judge(judge, question_lines, reference_text: str, candidate_text: str) -> float:
    """Ask *judge* one scoring question and return its answer clamped to [0, 1].

    Returns ``_JUDGE_DEFAULT`` when the reply is not a single finite-or-infinite
    number (for example an error message, or "nan"). Exceptions raised by
    ``judge.generate`` itself propagate to the caller.
    """
    # The prompt keeps the 12-space indentation of the original inline
    # template so the text sent to the model is unchanged.
    indent = " " * 12
    prompt = "\n".join([
        "",
        *(f"{indent}{line}" for line in question_lines),
        indent,
        f"{indent}Input: {_truncate_for_judge(reference_text)}",
        f"{indent}Candidate: {_truncate_for_judge(candidate_text)}",
        indent,
        f"{indent}Provide only a single number between 0.0 and 1.0 with no explanation.",
        indent,
    ])
    reply = judge.generate(prompt)
    try:
        parsed = float(reply.strip())
    except (AttributeError, TypeError, ValueError):
        # Non-string reply, or text that is not a bare number.
        return _JUDGE_DEFAULT
    if parsed != parsed:
        # NaN: min()/max() clamping would silently turn it into 1.0.
        return _JUDGE_DEFAULT
    return max(0.0, min(1.0, parsed))


def _interpret_score(weighted_score: float) -> str:
    """Map a hybrid score to its letter-grade description."""
    for floor, label in _INTERPRETATION_BANDS:
        if weighted_score >= floor:
            return label
    return _LOWEST_INTERPRETATION


def evaluate_text(reference_text: str, candidate_text: str) -> Dict[str, Any]:
    """Evaluate a candidate text against a reference text.

    Both texts are cleaned with ``clean_text`` first. Seven metrics are then
    computed: BLEU, ROUGE (mean F-measure of rouge1/rouge2/rougeL), METEOR,
    BERTScore F1, and three LLM-as-judge scores (AnswerRelevancy,
    Faithfulness, GEval) obtained from Gemini when ``GEMINI_API_KEY`` is set.
    Each metric is best-effort: a failure is printed and replaced by a
    fallback value instead of aborting the evaluation.

    Args:
        reference_text: The original text.
        candidate_text: The rewritten text to score.

    Returns:
        A dict with keys ``candidate`` (the cleaned candidate), ``metrics``
        (raw scores by metric name), ``normalized`` (scores after
        ``normalize_score``), ``weighted_score`` (hybrid score) and
        ``interpretation`` (letter-grade description of the hybrid score).
    """
    # Clean both texts for consistent evaluation
    reference_text = clean_text(reference_text)
    candidate_text = clean_text(candidate_text)
    reference_tokens = reference_text.split()
    candidate_tokens = candidate_text.split()

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    results = {}

    # BLEU Score (smoothed so short texts do not collapse to zero)
    try:
        results["BLEU"] = sentence_bleu(
            [reference_tokens],
            candidate_tokens,
            smoothing_function=SmoothingFunction().method4,
        )
    except Exception as e:
        print(f"BLEU error: {e}")
        results["BLEU"] = 0.0

    # ROUGE Score: average of the three F-measures
    try:
        rouge_scores = scorer.score(reference_text, candidate_text)
        results["ROUGE"] = sum(
            rouge_scores[variant].fmeasure
            for variant in ('rouge1', 'rouge2', 'rougeL')
        ) / 3
    except Exception as e:
        print(f"ROUGE error: {e}")
        results["ROUGE"] = 0.0

    # METEOR Score
    try:
        results["METEOR"] = meteor_score([reference_tokens], candidate_tokens)
    except Exception as e:
        print(f"METEOR error: {e}")
        results["METEOR"] = 0.0

    # BERTScore (only the F1 component is kept)
    try:
        _, _, f1 = bert_score(
            [candidate_text],
            [reference_text],
            lang="en",
            verbose=False
        )
        results["BERTScore"] = f1.item()
    except Exception as e:
        print(f"BERTScore error: {e}")
        results["BERTScore"] = 0.7  # Default low value

    # LLM-as-judge metrics: start from the neutral default and overwrite each
    # one only when Gemini is configured and answers.
    judge_defaults = dict.fromkeys(_JUDGE_QUESTIONS, _JUDGE_DEFAULT)
    results.update(judge_defaults)
    try:
        if GEMINI_API_KEY:
            judge_model = GeminiProvider("gemini-1.5-flash-latest")
            for metric, question_lines in _JUDGE_QUESTIONS.items():
                results[metric] = _ask_judge(
                    judge_model, question_lines, reference_text, candidate_text
                )
    except Exception as e:
        print(f"LLM-as-judge error: {e}")
        # A hard failure invalidates the whole judge pass, not just one metric.
        results.update(judge_defaults)

    # Calculate normalized and weighted scores
    normalized_scores = {m: normalize_score(m, v) for m, v in results.items()}
    weighted_score = calculate_weighted_score(results)

    return {
        "candidate": candidate_text,
        "metrics": results,
        "normalized": normalized_scores,
        "weighted_score": weighted_score,
        "interpretation": _interpret_score(weighted_score)
    }

def process_input(input_mode: str, reference_text: str, candidate_text: str, model_choice: str, prompt_choice: str, progress=gr.Progress()) -> Tuple[str, str, List[List[str]], str]:
    """Score a candidate text against a reference, generating it if requested.

    In "Reference Only (Generate Candidate)" mode the candidate is produced by
    the selected model from the selected prompt template and then evaluated.
    In any other mode the supplied *candidate_text* is evaluated directly and
    *model_choice* / *prompt_choice* are ignored.

    Args:
        input_mode: The mode label chosen by the user.
        reference_text: The original text; required in every mode.
        candidate_text: The user-supplied rewrite; required unless a
            candidate is being generated.
        model_choice: "Gemini", "Llama-3-70b", or anything else for
            Llama-3-8b. Only used when generating.
        prompt_choice: Key into ``PROMPT_TEMPLATES``. Only used when
            generating.
        progress: Gradio progress callback.

    Returns:
        A ``(reference, candidate, metrics_table, summary)`` tuple. On a
        validation or generation error the first three items are ``""``,
        ``""`` and ``[]`` and *summary* carries the error message.
    """
    if not reference_text or not reference_text.strip():
        return "", "", [], "Please provide reference text."

    if input_mode == "Reference Only (Generate Candidate)":
        prompt_template = PROMPT_TEMPLATES.get(prompt_choice)
        if prompt_template is None:
            return "", "", [], f"Error: unknown prompt template '{prompt_choice}'."

        # The generation model is only needed (and only checked) in this mode,
        # so a missing key cannot block evaluation of a supplied candidate.
        if model_choice == "Gemini":
            model_provider = GeminiProvider("gemini-1.5-flash-latest")
        elif model_choice == "Llama-3-70b":
            model_provider = GroqProvider("llama3-70b-8192")
        else:  # Llama-3-8b
            model_provider = GroqProvider("llama3-8b-8192")

        if not model_provider.available:
            return "", "", [], f"Error: {model_choice} is not properly configured. Check your API key."

        # The short sleeps only give the progress bar time to render.
        progress(0.1, desc="Starting evaluation...")
        time.sleep(0.1)

        progress(0.3, desc="Generating rewritten content using prompt...")
        time.sleep(0.1)

        # str.replace rather than str.format: the reference may contain braces.
        prompt = prompt_template.replace("{text}", reference_text)
        candidate = model_provider.generate(prompt)

        # The providers in this file report failures as strings with these
        # prefixes instead of raising; surface them rather than scoring an
        # error message as if it were a rewrite.
        if not isinstance(candidate, str):
            return "", "", [], f"Error: {model_choice} returned no text."
        if candidate.startswith(("Error generating with ", "Error: ")):
            return "", "", [], candidate

        progress(0.6, desc="Calculating metrics...")
        time.sleep(0.1)

        # Evaluate the generated candidate
        result = evaluate_text(reference_text, clean_text(candidate))

        progress(0.9, desc="Finalizing results...")
        time.sleep(0.1)

    else:  # "Both Reference and Candidate"
        if not candidate_text or not candidate_text.strip():
            return "", "", [], "Please provide candidate text."

        progress(0.3, desc="Calculating metrics...")
        time.sleep(0.2)

        # Evaluate the provided candidate (evaluate_text cleans it itself)
        result = evaluate_text(reference_text, candidate_text)

        progress(0.8, desc="Finalizing results...")
        time.sleep(0.1)

    # Format metrics for display - ONLY SHOWING NORMALIZED SCORES AND HYBRID SCORE
    display_order = (
        "AnswerRelevancy", "Faithfulness", "GEval",
        "BERTScore", "ROUGE", "BLEU", "METEOR",
    )
    normalized = result["normalized"]
    metrics_table = [["Metric", "Normalized Score"]]
    metrics_table.extend(
        [metric, f"{normalized[metric]:.4f}"] for metric in display_order
    )
    metrics_table.append(["Hybrid Score", f"{result['weighted_score']:.4f}"])

    return (
        reference_text,
        result["candidate"],
        metrics_table,
        f"Hybrid Score: {result['weighted_score']:.4f} - {result['interpretation']}"
    )

def load_example() -> Tuple[str, str, str]:
    """Return the built-in Mike Gordon example used by the "Load Example" button.

    The example pairs a long news article (the reference) with a paraphrased
    rewrite of the same article (the candidate), both hard-coded below.

    Returns:
        A 3-tuple ``(input_mode, reference_text, candidate_text)``. The first
        element is the input-mode label "Both Reference and Candidate"; the
        other two are the example texts. The order matches the ``outputs``
        list of the example button's click handler.
    """
    # Reference article. The literal spans several physical lines, so the
    # embedded newlines (and any trailing spaces before them) are part of the value.
    reference_text = """When he showed up to work on 27 June, Mike Gordon was having one of the best weeks of his career. Gordon, a federal prosecutor in Tampa, had spent the last month working on a complex case involving allegations that well-known businessman Leo Govoni stole $100m from a fund for children with special needs. That Monday, the US attorney for the middle district of Florida held a press conference announcing an indictment in the case. On Wednesday, Gordon had his semi-annual performance review and received the top rating: outstanding. On Thursday, he appeared on behalf of the government in court and successfully convinced a judge that Govoni should remain in jail until his trial. As the end of the day rolled around on Friday 27 June, Gordon was fired. He wasn't given a reason for his dismissal. An office assistant simply knocked on his door while he was preparing a witness for trial, and handed him a letter that told him he was being immediately fired "Pursuant to Article II of the United States Constitution and the laws of the United States", signed by the US attorney general, Pam Bondi. Gordon was told to turn in his devices, pack up his things and leave. The firing was so abrupt he had no chance to hand over his work to colleagues. Even without an official explanation, there was little doubt why he was fired. From 2021 until the end of 2023, Gordon had volunteered for the team prosecuting people involved in the January 6 attack on the US Capitol. During his time working in a unit called the Capitol siege section, Gordon became known as one of the most skilled trial litigators and became a kind of coach to other prosecutors as they prepared for trials. But on his first day in office, Donald Trump issued a blanket pardon to anyone involved in January 6. "I got fired because I prosecuted people that this administration wanted protected. Bottom line," Gordon said. 
Trump has made no secret of his desire to exact revenge on those who investigated and prosecuted him and his allies. Scores of career prosecutors have been fired for getting in the president's political crosshairs. Gordon's firing is one of the best examples to date of how Trump is executing that promise and purging the justice department. It's an attack on a fundamental pillar of the rule of law - that prosecutors should make decisions about whether and how they should bring cases without political concerns. Even though January 6 was one of the most explosive political events in recent history, Gordon said he never had any conversations with a supervisor in which they discussed the political ramifications of what they were doing. The political implications "didn't matter", he said. "What we did discuss was the importance we saw in protecting democracy and prosecuting these cases. And creating the precedent and the deterrence that political violence was unacceptable. Full stop," he said. "And that was worth doing no matter what. It's why I still think all these prosecutions were worth doing. Even after the pardons, even after my own firing." The mass firings of career prosecutors is "unprecedented", said Max Stier, the CEO of the Partnership for Public Service, a watchdog group. "There's enormous discretion that prosecutors have, and there is a tradition that it's not about winning, it's about doing justice, and we're watching that tradition change into it's neither about winning or doing justice, but it's doing the bidding of President Trump," he said. Two other prosecutors who worked on January 6 cases were fired on the same day. Their dismissals came after Ed Martin, a prominent defense lawyer for January 6 defendants, launched a "weaponization working group" at the Department of Justice. Gordon and others who work with him aren't sure why he and two other colleagues were singled out among hundreds of career prosecutors who worked on January 6 cases. 
The justice department did not return a request for comment on his firing. One theory is that Gordon was targeted because he took on some of the most high-profile cases, including that of Richard Barnett, who was photographed with his feet on a desk in Nancy Pelosi's office (sentenced to 54 months in prison), Eric Munchel, known as Zip Tie Guy (sentenced to 57 months in prison), and Ray Epps, who became the center of conspiracy theories about January 6. In person, Gordon, who is 47 with salt-and-pepper hair, has a boyishness that belies his intensity as a prosecutor. During an interview at his home in Tampa, where an American flag was flying outside his door, he sat barefoot and in shorts, knees tucked to his chest as he recounted his time working on the January 6 cases and processing his firing. Last month, Gordon and two other justice department employees filed a federal lawsuit challenging his dismissal, alleging that they had been wrongfully fired. A litigator in his bones, Gordon has mapped out in his head how he thinks the administration is likely to defend itself. He thinks the case will ultimately be decided by the US supreme court. "I can tell you that I have been contacted plenty of times by colleagues from my former office who tell me that they're all wondering, am I next? Have I done something that's going to be on the wrong side of this administration? Am I going to be punished for some other work I've done?" he said. Before he was a prosecutor, Gordon taught high school humanities and his voice still carries the boom of someone who can hold the attention of a classroom of teenagers. He does not mix up facts. When I mistakenly said Barnett, one of the defendants he successfully prosecuted, had put his feet on Pelosi's desk, he politely responded that it had been a desk in her office. And when I repeated what I thought were the names of his two cockapoos - Cereal and Cheerio - he quietly corrected me. "Maple and Cheerio," he said. 
Gordon said he saw the January 6 riot unfold on television while he was folding laundry, four years after he joined the justice department. The prosecutor part of his brain quickly kicked in, he said: "I'm watching a crime scene, I'm watching a crime unfold. And there are all these television cameras around, which is really rare." In the back of his mind, he recalled something that a mentor told him as a newly minted federal prosecutor in 2017. Throughout his whole life, he had probably seen something that might be a crime and thought "somebody should do something about that". Now, he was the person who could do something about it. Initially, Gordon, who was working on violent crimes and narcotics cases in the US attorney's office for the middle district of Florida, thought that he could help on some of the cases if they involved people from around Tampa. But as the federal prosecutor's office in Washington charged with investigating January 6 began its work, more attorneys were needed and the department sent out a request for more staff. Gordon volunteered and was chosen. Working remotely from Tampa and traveling to Washington for trials, Gordon became one of several prosecutors taking on cases as they were randomly assigned. In 2022, one of the cases he prosecuted was Kyle Fitzsimons, a 39-year-old man from Maine who wore a white butcher's jacket and fur pelt, and carried an unstrung bow, and who assaulted five law enforcement officers in a span of about five minutes. He was sentenced to 87 months in prison. Gordon's performance got the attention of his supervisors and he earned a reputation as a skilled trial litigator. He was put on more trials on some of the more high-profile cases the department was prosecuting. He eventually got a new title - "senior trial counsel" - and would work with lawyers in the Capitol siege section to go over their briefs, help them prepare arguments and go to trial. 
"I felt the weight of the responsibility to do it well and do it right," he said. Taking a long pause, he added: "I felt passionate about the righteousness of what we were doing. "I welcome the scrutiny. I'm well aware of the irony that it's those kinds of things that are probably why I'm unemployed now, probably why I was fired, because I did take on those higher-profile things and this is the risk that sort of comes with it." "I mean, he worked his ass off," said Gregory Rosen, Gordon's boss, and the head of the Capitol siege section (Rosen resigned from the justice department in June). "He is absolutely the heavy hitter in terms of getting cases across the line and taking these high-profile cases. He doesn't shy away easily. He's not afraid of the limelight. And he knows how to put his head down and get through the nitty-gritty." Other prosecutors in the Capitol siege section would even come watch Gordon cross-examine a witness or present to a jury, said Jason Manning, a former federal prosecutor who worked with Gordon. "Mike's reputation for excellence in court was such that people really made time to go watch him perform," Manning said. "You simply don't replace people like Mike overnight." One of the highest-profile cases Gordon took on was that of Barnett, an Arkansas man who had become one of the most prominent January 6 rioters when he was photographed with his feet on a desk in Pelosi's office. He also left her a note that said: "Hey Nancy Bigo was here, biatch." Even though Barnett's conduct was "somewhere in the middle" on the spectrum of January 6 cases, Gordon understood there would be considerable public attention on the trial. He welcomed the opportunity to show the public how thoroughly the government had investigated the case against Barnett. 
He also embraced the opportunity to show that Barnett's crime wasn't that he merely put his feet on a desk in Pelosi's office - he came to the Capitol with weapons: a 10lb metal pole and a stun device concealed in a walking stick, which he brandished at a police officer. "The cross-examination got a lot of attention," he said. Barnett offered a litany of excuses trying to downplay his conduct on the day of January 6, at one point arguing that he had been looking for a bathroom and had wandered into Pelosi's office. Gordon pointed out he had spent a considerable amount of time wandering around the Capitol and never asked for directions to a bathroom. Barnett also stole an envelope from Pelosi's office, but said he didn't consider it a crime because it was a "biohazard" as he had gotten blood on it. He had left a quarter and said he didn't consider it theft. "He had a number of just patently obvious lies," Gordon said. "It's very clear that he had the intent, and here's all the evidence we've amassed against this guy to show the public we've done our homework. "People can see all the different witnesses. They can hear from Barnett himself, how he responds to cross-examination. Because folks go on friendly podcasts, they post on Twitter, there's nobody pushing back." Barnett was convicted on eight counts, both felonies and misdemeanors, and sentenced to more than four years in prison. After Barnett was sentenced, Gordon prosecuted the case against Epps, an Arizona man who was on the Capitol grounds on January 6 and, the night before, encouraged protesters to go to the Capitol. He initially wasn't charged with a crime, but his life was upended when he became the subject of a conspiracy theory, touted by Tucker Carlson and others, that he was a federal agent. Eventually he was charged with a misdemeanor, making him one of the few people charged with a crime who didn't enter the Capitol on January 6. Epps eventually pleaded guilty to the misdemeanor. 
Gordon submitted a sentencing memo on behalf of the government that said Epps should be sentenced to six months in prison, $500 in restitution and a year of supervised release. He was eventually sentenced to a year of probation. Gordon said the case was the hardest one he worked on. "Figuring out what the right thing to do with the Ray Epps case was a real challenge," he said. "I'm well aware that not everyone agrees. But someone needed to be the lightning rod to handle that, and somebody needed to be able to handle the public appearances." Edward Ungvarsky, Epps's lawyer, said that while he and Gordon disagreed on what should happen in the case, he saw Gordon as apolitical, "thoughtful" and "just very by the book". "He wasn't political in the slightest," Ungvarsky said. "I wouldn't have known if he was a Republican or a Democrat." When Gordon and I first spoke shortly after he was fired, he was hesitant to go on the record because he thought there was an outside chance that he could get his job back. "It seemed so unjust. It seemed so, not only at odds with my performance, but also what I understood my reputation to be within my office," he said. As the weeks stretched on, he said, it became clear that wasn't going to happen. He's since explained to his children why he was fired. Gordon's 10-year-old son "understands the idea that something wrong happened to me and I'm doing what I can to speak up about it and to fight back", he said. "And he really admires that. And that's another thing that gives me inspiration and fuel." These days, Gordon says that he has no regrets. "It was the right thing to do," he said. "I will always operate from the position that I will do the right thing first, and then I'll worry about what the consequences of that are." """
    
    # Candidate rewrite of the same article: same facts, reworded throughout.
    candidate_text = """When Mike Gordon arrived at work on June 27, he was experiencing one of the best weeks of his professional life. Gordon, a federal prosecutor based in Tampa, had dedicated the past month to a complex case involving allegations that prominent businessman Leo Govoni had stolen $100m from a fund designated for children with special needs. That Monday, the US attorney for the middle district of Florida held a press conference to announce an indictment in the case. By Wednesday, Gordon received an "outstanding," the top rating, on his semi-annual performance review. On Thursday, representing the government in court, he successfully convinced a judge that Govoni should remain incarcerated until his trial. As Friday, June 27, drew to a close, Gordon was fired. He was provided no explanation for his dismissal. An office assistant simply knocked on his door while he was preparing a witness for trial and handed him a letter, stating he was being immediately fired "Pursuant to Article II of the United States Constitution and the laws of the United States," signed by the US attorney general, Pam Bondi. Gordon was instructed to surrender his devices, pack his belongings, and leave. The firing was so abrupt that he had no opportunity to transfer his ongoing work to colleagues. Even without an official explanation, there was little doubt regarding the reason for his termination. From 2021 until the end of 2023, Gordon had volunteered for the team prosecuting individuals involved in the January 6 attack on the US Capitol. During his tenure in a unit known as the Capitol siege section, Gordon gained recognition as one of the most skilled trial litigators, even becoming a sort of coach to other prosecutors as they prepared for trials. Yet, on his first day in office, Donald Trump issued a blanket pardon to anyone involved in January 6. "I got fired because I prosecuted people that this administration wanted protected. Bottom line," Gordon stated. 
Trump has openly expressed his desire to exact revenge on those who investigated and prosecuted him and his allies. Scores of career prosecutors have faced dismissal for finding themselves in the president's political crosshairs. Gordon's firing stands as one of the clearest examples to date of how Trump is fulfilling that promise and systematically purging the justice department. This action represents an attack on a fundamental pillar of the rule of law: the principle that prosecutors should make decisions about whether and how to bring cases without political concerns. Despite January 6 being one of the most politically explosive events in recent history, Gordon affirmed he never had any conversations with a supervisor discussing the political ramifications of their work. The political implications "didn't matter," he said. "What we did discuss was the importance we saw in protecting democracy and prosecuting these cases. And creating the precedent and the deterrence that political violence was unacceptable. Full stop," he elaborated. "And that was worth doing no matter what. It's why I still think all these prosecutions were worth doing. Even after the pardons, even after my own firing." Max Stier, CEO of the Partnership for Public Service, a watchdog group, described the mass firings of career prosecutors as "unprecedented." Stier commented, "There's enormous discretion that prosecutors have, and there is a tradition that it's not about winning, it's about doing justice, and we're watching that tradition change into it's neither about winning or doing justice, but it's doing the bidding of President Trump." Two other prosecutors who had worked on January 6 cases were fired on the same day. Their dismissals followed the launch of a "weaponization working group" at the Department of Justice by Ed Martin, a prominent defense lawyer for January 6 defendants. 
Gordon and his colleagues remain unsure why he and two other colleagues were singled out from the hundreds of career prosecutors who worked on January 6 cases. The justice department did not respond to a request for comment regarding his firing. One prevailing theory is that Gordon was targeted because he handled some of the most high-profile cases, including those of Richard Barnett, who was photographed with his feet on a desk in Nancy Pelosi's office and sentenced to 54 months in prison; Eric Munchel, widely known as "Zip Tie Guy," who was sentenced to 57 months in prison; and Ray Epps, who became the focal point of various conspiracy theories surrounding January 6. In person, Gordon, a 47-year-old with salt-and-pepper hair, possesses a boyishness that belies his intense demeanor as a prosecutor. During an interview at his home in Tampa, where an American flag flew outside his door, he sat barefoot and in shorts, with his knees tucked to his chest, as he recounted his time working on the January 6 cases and processing his unexpected firing. Last month, Gordon, along with two other justice department employees, filed a federal lawsuit challenging his dismissal, alleging they had been wrongfully fired. A litigator to his core, Gordon has already mentally mapped out how he anticipates the administration will defend itself. He believes the case will ultimately be decided by the US Supreme Court. "I can tell you that I have been contacted plenty of times by colleagues from my former office who tell me that they're all wondering, am I next? Have I done something that's going to be on the wrong side of this administration? Am I going to be punished for some other work I've done?" he shared. Before becoming a prosecutor, Gordon taught high school humanities, and his voice retains the resonant boom of someone capable of holding the attention of a classroom of teenagers. He is meticulous about facts. 
When I mistakenly said Barnett, one of the defendants he successfully prosecuted, had put his feet on Pelosi's desk, he politely corrected me, clarifying it had been a desk in her office. And when I repeated what I believed were the names of his two cockapoos - Cereal and Cheerio - he quietly corrected me again. "Maple and Cheerio," he said. Gordon recalled watching the January 6 riot unfold on television while he was folding laundry, just four years after joining the justice department. His prosecutor's mind quickly engaged, he explained: "I'm watching a crime scene, I'm watching a crime unfold. And there are all these television cameras around, which is really rare." In the back of his mind, he remembered something a mentor told him as a newly minted federal prosecutor in 2017. Throughout his entire life, she had told him, he had likely seen something that could be a crime and thought, "somebody should do something about that." Now, he was the person who could do something about it. Initially, Gordon, who was working on violent crimes and narcotics cases in the US attorney's office for the middle district of Florida, thought he might assist with cases involving individuals from the Tampa area. However, as the federal prosecutor's office in Washington tasked with investigating January 6 began its work, more attorneys were needed, and the department sent out a request for additional staff. Gordon volunteered and was selected. Working remotely from Tampa and traveling to Washington for trials, Gordon became one of several prosecutors assigned cases randomly. In 2022, he prosecuted Kyle Fitzsimons, a 39-year-old man from Maine who wore a white butcher's jacket and fur pelt, carried an unstrung bow, and assaulted five law enforcement officers in approximately five minutes. Fitzsimons was sentenced to 87 months in prison. Gordon's performance drew the attention of his supervisors, and he quickly earned a reputation as a highly skilled trial litigator. 
He was assigned to more trials, including some of the department's more high-profile cases. He eventually received a new title: "senior trial counsel." In this role, he collaborated with lawyers in the Capitol siege section, reviewing their briefs, assisting them in preparing arguments, and accompanying them to trial. "I felt the weight of the responsibility to do it well and do it right," he said, pausing before adding, "I felt passionate about the righteousness of what we were doing. I welcome the scrutiny. I'm well aware of the irony that it's those kinds of things that are probably why I'm unemployed now, probably why I was fired, because I did take on those higher-profile things and this is the risk that sort of comes with it." Gregory Rosen, Gordon's boss and the former head of the Capitol siege section, who resigned from the justice department in June, remarked, "I mean, he worked his ass off. He is absolutely the heavy hitter in terms of getting cases across the line and taking these high-profile cases. He doesn't shy away easily. He's not afraid of the limelight. And he knows how to put his head down and get through the nitty-gritty." Jason Manning, a former federal prosecutor who worked alongside Gordon, noted that other prosecutors in the Capitol siege section would even come to observe Gordon cross-examine a witness or present to a jury. "Mike's reputation for excellence in court was such that people really made time to go watch him perform," Manning said, adding, "You simply don't replace people like Mike overnight." One of the highest-profile cases Gordon undertook was that of Barnett, an Arkansas man who became one of the most prominent January 6 rioters after being photographed with his feet on a desk in Pelosi's office. He also left her a note that read: "Hey Nancy Bigo was here, biatch." While Barnett's conduct was "somewhere in the middle" on the spectrum of January 6 cases, Gordon understood the trial would garner considerable public attention. 
He welcomed the opportunity to demonstrate to the public how thoroughly the government had investigated the case against Barnett. He also embraced the chance to show that Barnett's crime was not merely putting his feet on a desk in Pelosi's office; he arrived at the Capitol armed with a 10lb metal pole and a stun device concealed within a walking stick, which he brandished at a police officer. "The cross-examination got a lot of attention," he noted. Barnett offered a litany of excuses attempting to downplay his conduct on January 6, at one point arguing he had been searching for a bathroom and had simply wandered into Pelosi's office. Gordon highlighted that Barnett had spent a considerable amount of time wandering around the Capitol yet never asked for directions to a bathroom. Barnett also stole an envelope from Pelosi's office but claimed it wasn't a crime because it was a "biohazard" due to blood on it. He left a quarter, asserting it was not theft. "He had a number of just patently obvious lies," Gordon said. "It's very clear that he had the intent, and here's all the evidence we've amassed against this guy to show the public we've done our homework. People can see all the different witnesses. They can hear from Barnett himself, how he responds to cross-examination. Because folks go on friendly podcasts, they post on Twitter, there's nobody pushing back." Barnett was convicted on eight counts, encompassing both felonies and misdemeanors, and sentenced to more than four years in prison. After Barnett's sentencing, Gordon prosecuted the case against Epps, an Arizona man who was on the Capitol grounds on January 6 and, the night before, had encouraged protesters to go to the Capitol. Initially, he was not charged with a crime, but his life was upended when he became the subject of a conspiracy theory, promoted by Tucker Carlson and others, alleging he was a federal agent. 
He was eventually charged with a misdemeanor, making him one of the few individuals charged with a crime who did not enter the Capitol on January 6. Epps ultimately pleaded guilty to the misdemeanor. Gordon submitted a sentencing memo on behalf of the government, recommending Epps be sentenced to six months in prison, $500 in restitution, and a year of supervised release. He was ultimately sentenced to a year of probation. Gordon described the case as the most challenging he worked on. "Figuring out what the right thing to do with the Ray Epps case was a real challenge," he said. "I'm well aware that not everyone agrees. But someone needed to be the lightning rod to handle that, and somebody needed to be able to handle the public appearances." Edward Ungvarsky, Epps's lawyer, stated that while he and Gordon disagreed on the case's outcome, he viewed Gordon as apolitical, "thoughtful," and "just very by the book." "He wasn't political in the slightest," Ungvarsky said. "I wouldn't have known if he was a Republican or a Democrat." When Gordon and I first spoke shortly after his firing, he was reluctant to speak on the record, believing there was an outside chance he might regain his job. "It seemed so unjust. It seemed so, not only at odds with my performance, but also what I understood my reputation to be within my office," he explained. As the weeks progressed, he said, it became clear that would not happen. He has since explained to his children why he was fired. Gordon's 10-year-old son "understands the idea that something wrong happened to me and I'm doing what I can to speak up about it and to fight back," he shared. "And he really admires that. And that's another thing that gives me inspiration and fuel." These days, Gordon asserts he has no regrets. "It was the right thing to do," he stated. "I will always operate from the position that I will do the right thing first, and then I'll worry about what the consequences of that are." """
    
    # The mode string must equal the second choice of the "Input Mode" radio
    # exactly, since it is written into that component as its new value.
    return (
        "Both Reference and Candidate",  # input_mode
        reference_text,                # reference_text
        candidate_text                 # candidate_text
    )


# Build the Gradio UI. Components are rendered in the order they are created
# inside the nested layout contexts, so statement order here defines the page.
# Layout: a narrow left column with inputs/configuration and a wider right
# column with the text comparison, the metrics table and the summary.
with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📊 LLM Evaluation Framework for Professional Content Rewriting")
    gr.Markdown("Evaluate the quality of LLM-generated content using multiple metrics with proper normalization.")
    
    with gr.Row():
        # Left column (scale=1): user inputs and model/prompt configuration.
        with gr.Column(scale=1):
            gr.Markdown("### 📥 Input Options")
            
            # Chooses between generating a candidate from the reference and
            # evaluating a candidate the user supplies.
            input_mode = gr.Radio(
                ["Reference Only (Generate Candidate)", "Both Reference and Candidate"],
                label="Input Mode",
                value="Reference Only (Generate Candidate)",
                elem_id="input-mode"
            )
            
            # Add example button
            example_btn = gr.Button("Load Example", variant="secondary", elem_id="example-btn")
            
            reference_text = gr.Textbox(
                label="Reference Text", 
                lines=10, 
                placeholder="Enter reference text to evaluate against...",
                elem_id="reference-text"
            )
            
            # Candidate text box lives in a group that starts hidden, matching
            # the default "Reference Only" input mode above.
            with gr.Group(visible=False) as candidate_group:
                candidate_text = gr.Textbox(
                    label="Candidate Text", 
                    lines=10, 
                    placeholder="Enter candidate text to evaluate...",
                    elem_id="candidate-text"
                )
            
            # Update visibility of candidate text box based on input mode
            def update_candidate_visibility(mode: str):
                """Return an update showing the candidate group only in "Both Reference and Candidate" mode."""
                return gr.update(visible=(mode == "Both Reference and Candidate"))
            
            # Re-evaluate the group's visibility whenever the radio value changes.
            input_mode.change(
                fn=update_candidate_visibility,
                inputs=input_mode,
                outputs=candidate_group
            )
            
            gr.Markdown("### ⚙️ Configuration")
            # Model used to generate the candidate; passed to process_input.
            model_choice = gr.Radio(
                ["Gemini", "Llama-3-70b", "Llama-3-8b"], 
                label="Select Model", 
                value="Gemini",
                elem_id="model-choice"
            )
            
            # Prompt template name; process_input uses it as a key into
            # PROMPT_TEMPLATES, so these labels must match that mapping's keys.
            prompt_choice = gr.Radio(
                ["Strategic Narrative Architect", "Precision Storyteller"],
                label="Select Prompt Template",
                value="Strategic Narrative Architect",
                elem_id="prompt-choice"
            )
            
            submit_btn = gr.Button("Evaluate", variant="primary", size="lg", elem_id="submit-btn")
        
        # Right column (scale=2): evaluation outputs.
        with gr.Column(scale=2):
            gr.Markdown("### 📄 Text Comparison")
            
            # Reference and candidate are shown in separate tabs.
            with gr.Tabs():
                with gr.TabItem("Reference Text"):
                    reference_output = gr.Textbox(
                        label="Reference Text", 
                        lines=8,
                        elem_id="reference-output"
                    )
                
                with gr.TabItem("Candidate Text"):
                    candidate_output = gr.Textbox(
                        label="Candidate Text", 
                        lines=8,
                        elem_id="candidate-output"
                    )
            
            gr.Markdown("### 📈 Evaluation Metrics")
            # Read-only table of metric rows produced by process_input.
            # NOTE(review): no headers= is set here, while process_input puts
            # ["Metric", "Normalized Score"] as the first row of the table it
            # returns - confirm that row renders as intended.
            metrics_output = gr.Dataframe(
                label="Evaluation Metrics",
                interactive=False,
                elem_id="metrics-output"
            )
            
            gr.Markdown("### 📌 Overall Assessment")
            summary_output = gr.Textbox(
                label="Summary",
                elem_id="summary-output"
            )
    
    # Event handlers
    # "Evaluate" runs process_input; the order of `inputs` must match its
    # parameters and the order of `outputs` must match its returned 4-tuple.
    submit_btn.click(
        fn=process_input,
        inputs=[input_mode, reference_text, candidate_text, model_choice, prompt_choice],
        outputs=[reference_output, candidate_output, metrics_output, summary_output]
    )
    
    # Load example button handler: fill mode + both text boxes from
    # load_example(), then explicitly reveal the candidate group.
    # NOTE(review): the input_mode.change handler above may already reveal the
    # group when the radio is updated programmatically - confirm for the
    # installed Gradio version before removing the .then() step.
    example_btn.click(
        fn=load_example,
        outputs=[input_mode, reference_text, candidate_text]
    ).then(
        fn=lambda: gr.update(visible=True),
        inputs=None,
        outputs=candidate_group
    )
    
    # Add interpretation guide in an accordion (collapsed by default)
    with gr.Accordion("📚 Interpretation Guide", open=False):
        gr.Markdown("""
        ### Hybrid Score Interpretation
        
        The Hybrid Score combines multiple evaluation metrics into a single score with proper normalization:
        
        - **0.85+**: Outstanding performance (A) - ready for professional use
        - **0.70-0.85**: Strong performance (B) - good quality with minor improvements
        - **0.50-0.70**: Adequate performance (C) - usable but needs refinement
        - **0.30-0.50**: Weak performance (D) - requires significant revision
        - **<0.30**: Poor performance (F) - likely needs complete rewriting
        
        ### Key Metrics Explained
        
        | Metric | What It Measures | Why It Matters |
        |--------|------------------|----------------|
        | **AnswerRelevancy** | Is output on-topic with input? | Does the prompt stay focused despite messy input? |
        | **Faithfulness** | Are ALL facts preserved correctly? | Does it maintain accuracy when input has encoding errors? |
        | **GEval** | Overall quality assessment by another AI | How professional does the output appear? |
        | **BERTScore** | Semantic similarity to reference | How well does it capture the meaning of cleaned text? |
        | **ROUGE** | Content overlap with reference | How much key information is preserved? |
        | **BLEU** | Phrasing precision | How closely does wording match human-quality standard? |
        | **METEOR** | Linguistic quality with synonyms | How natural does the cleaned output read? |
        """)


if __name__ == "__main__":
    # Start the web server only when this file is run as a script.
    # Host "0.0.0.0" and port 7860 are passed straight through to Gradio.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)