import gradio as gr
import pandas as pd
from datasets import load_dataset
import plotly.graph_objects as go
import datetime
import json
import random
import os

# from model_handler import generate_response, get_inference_configs
# from enhanced_model_handler import generate_response, get_inference_configs
from model_handler_ollama import generate_response, get_inference_configs
# import torch

# Plotting and analysis imports
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots

# Set style for better-looking plots
plt.style.use('default')
sns.set_palette("husl")

# Configuration for datasets
DATASET_CONFIGS = {
    'Loggenix Synthetic AI Tasks Eval (with outputs)-small': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval-with-outputs',
        'split': 'train'
    },
    'Loggenix Synthetic AI Tasks Eval (with outputs) v5-large': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v5-with-outputs',
        'split': 'train'
    },
    'Loggenix Synthetic AI Tasks Eval (with outputs) v6-large': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v6-with-outputs',
        'split': 'train'
    },
    'Loggenix Synthetic AI Tasks Eval (with outputs) v7-large': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v5-with-outputs-v7-sft-v1',
        'split': 'train'
    }
}


class BenchmarkPlotter:
    def __init__(self):
        # Zero-shot benchmark data
        self.zero_shot_data = {
            'Model': ['Loggenix MoE 330M', 'SmolLM-135M', 'GPT2-137M', 'SmolLM-360M', 'Qwen2-500M', 'SmolLM-1.7B'],
            'Parameters': ['330M', '135M', '137M', '360M', '500M', '1.7B'],
            'Param_Numeric': [330, 135, 137, 360, 500, 1700],
            'MMLU': [24.6, 30.23, 26.29, 34.17, 31.92, 39.97],
            'HellaSwag': [25.0, 42.3, 29.76, 53.8, 47.61, 64.1],
            'PIQA': [55.0, 69.6, 62.51, 72.0, 69.31, 77.3],
            'ARC': [15.0, 44.0, 31.09, 51.1, 39.74, 61.55],
            'WinoGrande': [40.0, 52.7, 49.72, 53.7, 54.14, 56.0],
            'IsLoggenix': [True, False, False, False, False, False]
        }

        # Few-shot benchmark data
        self.few_shot_data = {
            'Model': ['Loggenix MoE 330M', 'Gemma 3 PT 1B', 'Gemma 3 PT 4B', 'Gemma 3 PT 12B', 'Gemma 3 PT 27B'],
            'Parameters': ['330M', '1B', '4B', '12B', '27B'],
            'Param_Numeric': [330, 1000, 4000, 12000, 27000],
            'MMLU': [25.8, 26.5, 59.6, 74.5, 78.6],
            'HellaSwag': [30.0, 62.3, 77.2, 84.2, 85.6],
            'PIQA': [80.0, 73.8, 79.6, 81.8, 83.3],
            'ARC': [10.0, 38.4, 56.2, 68.9, 70.6],
            'WinoGrande': [50.0, 58.2, 64.7, 74.3, 78.8],
            'IsLoggenix': [True, False, False, False, False]
        }

        self.df_zero = pd.DataFrame(self.zero_shot_data)
        self.df_few = pd.DataFrame(self.few_shot_data)

    def create_matplotlib_comparison(self, shot_type='zero'):
        """Create matplotlib comparison charts"""
        df = self.df_zero if shot_type == 'zero' else self.df_few

        # Create subplots
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle(f'{shot_type.title()}-Shot Benchmark Comparison', fontsize=16, fontweight='bold')

        benchmarks = ['MMLU', 'HellaSwag', 'PIQA', 'ARC', 'WinoGrande']
        axes_flat = axes.flatten()

        # Color palette - highlight Loggenix in red
        colors = ['#ff6b6b' if is_loggenix else '#4a90e2' for is_loggenix in df['IsLoggenix']]

        for i, benchmark in enumerate(benchmarks):
            ax = axes_flat[i]
            bars = ax.bar(range(len(df)), df[benchmark], color=colors, alpha=0.8)

            # Highlight bars where Loggenix outperforms
            loggenix_score = df[df['IsLoggenix']][benchmark].iloc[0]
            for j, (bar, score) in enumerate(zip(bars, df[benchmark])):
                if not df['IsLoggenix'].iloc[j] and score < loggenix_score:
                    bar.set_color('#90EE90')  # Light green for outperformed models

            ax.set_title(f'{benchmark}', fontweight='bold')
            ax.set_ylabel('Score (%)')
            ax.set_xticks(range(len(df)))
            ax.set_xticklabels(df['Model'], rotation=45, ha='right')
            ax.grid(True, alpha=0.3)

            # Add value labels on bars
            for bar, value in zip(bars, df[benchmark]):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width() / 2., height + 0.5,
                        f'{value:.1f}%', ha='center', va='bottom', fontsize=9)

        # Parameter efficiency scatter plot
        ax_scatter = axes_flat[5]
        scatter = ax_scatter.scatter(df['Param_Numeric'], df['MMLU'],
                                     c=colors, s=100, alpha=0.7, edgecolors='black')
        ax_scatter.set_xlabel('Parameters (M)')
        ax_scatter.set_ylabel('MMLU Score (%)')
        ax_scatter.set_title('Parameter Efficiency (MMLU)', fontweight='bold')
        ax_scatter.set_xscale('log')
        ax_scatter.grid(True, alpha=0.3)

        # Add model labels to scatter plot
        for idx, row in df.iterrows():
            ax_scatter.annotate(row['Parameters'],
                                (row['Param_Numeric'], row['MMLU']),
                                xytext=(5, 5), textcoords='offset points',
                                fontsize=8, ha='left')

        plt.tight_layout()
        return fig

    def create_plotly_interactive(self, shot_type='zero'):
        """Create interactive plotly charts"""
        df = self.df_zero if shot_type == 'zero' else self.df_few

        # Create subplots
        fig = make_subplots(
            rows=2, cols=3,
            subplot_titles=('MMLU', 'HellaSwag', 'PIQA', 'ARC', 'WinoGrande', 'Parameter Efficiency'),
            specs=[[{"secondary_y": False}, {"secondary_y": False}, {"secondary_y": False}],
                   [{"secondary_y": False}, {"secondary_y": False}, {"type": "scatter"}]]
        )

        benchmarks = ['MMLU', 'HellaSwag', 'PIQA', 'ARC', 'WinoGrande']
        positions = [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2)]

        for i, (benchmark, pos) in enumerate(zip(benchmarks, positions)):
            loggenix_score = df[df['IsLoggenix']][benchmark].iloc[0]

            # Create colors based on performance vs Loggenix
            bar_colors = []
            for idx, row in df.iterrows():
                if row['IsLoggenix']:
                    bar_colors.append('#ff6b6b')  # Red for Loggenix
                elif row[benchmark] < loggenix_score:
                    bar_colors.append('#90EE90')  # Light green for outperformed
                else:
                    bar_colors.append('#4a90e2')  # Blue for others

            fig.add_trace(
                go.Bar(
                    x=df['Model'],
                    y=df[benchmark],
                    name=benchmark,
                    marker_color=bar_colors,
                    text=[f'{val:.1f}%' for val in df[benchmark]],
                    textposition='outside',
                    showlegend=False,
                ),
                row=pos[0], col=pos[1]
            )

        # Parameter efficiency scatter plot
        fig.add_trace(
            go.Scatter(
                x=df['Param_Numeric'],
                y=df['MMLU'],
                mode='markers+text',
                text=df['Parameters'],
                textposition='top right',
                marker=dict(
                    size=12,
                    color=['#ff6b6b' if is_loggenix else '#4a90e2' for is_loggenix in df['IsLoggenix']],
                    line=dict(width=1, color='black')
                ),
                name='Models',
                showlegend=False
            ),
            row=2, col=3
        )

        # Update layout
        fig.update_layout(
            title_text=f'{shot_type.title()}-Shot Benchmark Comparison',
            title_x=0.5,
            height=800,
            showlegend=False
        )

        # Update x-axis for scatter plot to log scale
        fig.update_xaxes(type="log", row=2, col=3, title_text="Parameters (M)")
        fig.update_yaxes(title_text="MMLU Score (%)", row=2, col=3)

        # Update all y-axes
        for i in range(1, 6):
            if i <= 3:
                fig.update_yaxes(title_text="Score (%)", row=1, col=i)
            elif i <= 5:
                fig.update_yaxes(title_text="Score (%)", row=2, col=i - 3)

        return fig

    def create_competitive_analysis_summary(self, shot_type='zero'):
        """Create a summary of competitive areas"""
        df = self.df_zero if shot_type == 'zero' else self.df_few
        loggenix_row = df[df['IsLoggenix']].iloc[0]

        summary = f"## Loggenix MoE 330M Analysis ({shot_type.title()}-Shot)\n\n"

        benchmarks = ['MMLU', 'HellaSwag', 'PIQA', 'ARC', 'WinoGrande']
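        # A baseline counts as "outperformed" when its score is strictly below the
        # Loggenix row for that benchmark, e.g. on few-shot PIQA the 80.0% entry
        # beats Gemma 3 PT 1B (73.8%) and Gemma 3 PT 4B (79.6%) in the data above.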
        competitive_areas = []
        for benchmark in benchmarks:
            loggenix_score = loggenix_row[benchmark]
            outperformed = df[(~df['IsLoggenix']) & (df[benchmark] < loggenix_score)]

            if len(outperformed) > 0:
                competitive_areas.append(
                    f"**{benchmark} ({loggenix_score:.1f}%)**: Outperforms {len(outperformed)} models"
                )

            # Check for near-competitive performance
            if shot_type == 'few' and benchmark == 'PIQA' and loggenix_score >= 80:
                competitive_areas.append(
                    f"**{benchmark} ({loggenix_score:.1f}%)**: Near 12B model performance (81.8%) with 36x fewer parameters!"
                )

        if competitive_areas:
            summary += "### 🏆 Competitive Areas:\n"
            for area in competitive_areas:
                summary += f"- {area}\n"
        else:
            summary += "### 📊 Performance Notes:\n"
            summary += "- Model shows efficient performance for parameter count\n"
            summary += "- Some benchmarks show room for improvement\n"

        # Parameter efficiency note
        param_rank = df.sort_values('Param_Numeric')['Model'].tolist().index('Loggenix MoE 330M') + 1
        summary += f"\n### 📈 Efficiency Metrics:\n"
        summary += f"- Ranks #{param_rank} in parameter count\n"
        summary += f"- Strong parameter efficiency across benchmarks\n"
        summary += f"- MoE architecture enables competitive performance\n"

        return summary


# Load main dataset for inference tab
def load_inference_dataset():
    """Load the main dataset for inference use case"""
    try:
        print("Loading synthetic-ai-tasks-eval-v5 dataset...")
        dataset = load_dataset(
            'kshitijthakkar/synthetic-ai-tasks-eval-v5',
            split='train',
            trust_remote_code=True
        )
        df = dataset.to_pandas()
        print(f"✓ Successfully loaded: {len(df)} rows, {len(df.columns)} columns")
        return df
    except Exception as e:
        print(f"✗ Error loading dataset: {str(e)}")
        return pd.DataFrame({'Error': [f'Failed to load: {str(e)}']})


# Load dataset for eval samples tab
def load_eval_datasets():
    """Load all datasets for evaluation samples"""
    datasets = {}
    for display_name, config in DATASET_CONFIGS.items():
        try:
            print(f"Loading {display_name}...")
            dataset = load_dataset(
                config['repo_id'],
                split=config['split'],
                trust_remote_code=True
            )
            df = dataset.to_pandas()
            datasets[display_name] = df
            print(f"✓ Successfully loaded {display_name}: {len(df)} rows")
        except Exception as e:
            print(f"✗ Error loading {display_name}: {str(e)}")
            datasets[display_name] = pd.DataFrame({
                'Error': [f'Failed to load: {str(e)}'],
                'Dataset': [config['repo_id']]
            })
    return datasets


# Load datasets
INFERENCE_DATASET = load_inference_dataset()
EVAL_DATASETS = load_eval_datasets()


# ===== TAB 1: INFERENCE USE CASE WITH INTEGRATED FLAGGING =====
def get_task_types():
    """Get unique task types from inference dataset"""
    if 'task_type' in INFERENCE_DATASET.columns:
        task_types = INFERENCE_DATASET['task_type'].unique().tolist()
        return [str(t) for t in task_types if pd.notna(t)]
    return ["No task types available"]


def get_task_by_type(task_type):
    """Get task content by task type"""
    if 'task_type' in INFERENCE_DATASET.columns and 'task' in INFERENCE_DATASET.columns:
        filtered = INFERENCE_DATASET[INFERENCE_DATASET['task_type'] == task_type]
        if len(filtered) > 0:
            return str(filtered.iloc[0]['task'])
    return "No task found for this type"


def chat_interface_with_inference(prompt, history, system_prompt, inference_config):
    """Enhanced chat interface with model inference and history"""
    if not prompt.strip():
        return history, ""

    # Add user message to history
    history.append(("You", prompt))

    try:
        if not system_prompt.strip():
            response = "Please select a task type to load system prompt first."
        else:
            # Get inference configuration
            configs = get_inference_configs()
            config = configs.get(inference_config, configs["Optimized for Speed"])

            # Run inference using the model
            response = generate_response(
                system_prompt=system_prompt,
                user_input=prompt,
                config_name=inference_config
            )

        # Format and add AI response to history
        formatted_response = f"**AI Assistant:**\n{response}"
        history.append(("AI Assistant", formatted_response))

    except Exception as e:
        error_msg = f"**AI Assistant:**\nError during inference: {str(e)}"
        history.append(("AI Assistant", error_msg))

    return history, ""


def flag_response(history, flagged_message, flag_reason):
    """Flag a response"""
    if not flagged_message or flagged_message == "No responses available":
        return "Invalid message selection."

    try:
        flagged_index = int(flagged_message.split()[1][:-1])
        if flagged_index >= len(history) or history[flagged_index][0] != "AI Assistant":
            return "You can only flag assistant responses."

        flagged_message_content = history[flagged_index][1]
        log_entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "flag_reason": str(flag_reason),
            "flagged_message": str(flagged_message_content),
            "conversation_context": history,
        }

        os.makedirs("logs", exist_ok=True)
        with open("logs/flagged_responses.log", "a") as f:
            f.write(json.dumps(log_entry) + "\n")

        return f"Response flagged successfully: {flag_reason}"
    except Exception as e:
        return f"Error flagging response: {str(e)}"


def get_assistant_responses(history):
    """Get dropdown options for assistant responses"""
    responses = [
        f"Response {i}: {str(msg[1])[:50]}..."
        for i, msg in enumerate(history)
        if msg[0] == "AI Assistant"
    ]
    if not responses:
        responses = ["No responses available"]
    return gr.update(choices=responses, value=responses[0] if responses else "No responses available")


def display_selected_message(selected_index, history):
    """Display the selected flagged message"""
    if selected_index == "No responses available":
        return "No responses available"

    try:
        flagged_index = int(selected_index.split()[1][:-1])
        if flagged_index < len(history) and history[flagged_index][0] == "AI Assistant":
            return history[flagged_index][1]
        else:
            return "Invalid selection."
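    # Selection strings that don't match the "Response {i}: ..." labels built by
    # get_assistant_responses fall through to the error branch below.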
    except Exception as e:
        return f"Error: {str(e)}"


def clear_inference_history():
    """Clear chat history for inference tab"""
    return [], gr.update(choices=["No responses available"], value="No responses available")


# ===== TAB 2: EVAL SAMPLES =====
def update_eval_table(dataset_name):
    """Update eval table based on selected dataset"""
    if dataset_name in EVAL_DATASETS:
        return EVAL_DATASETS[dataset_name].head(100)
    return pd.DataFrame()


def get_eval_dataset_info(dataset_name):
    """Get info about selected eval dataset"""
    if dataset_name in EVAL_DATASETS:
        df = EVAL_DATASETS[dataset_name]
        return f"""
**Dataset**: {dataset_name}
- **Rows**: {len(df):,}
- **Columns**: {len(df.columns)}
- **Column Names**: {', '.join(df.columns.tolist())}
"""
    return "No dataset selected"


def get_task_types_for_eval(dataset_name):
    """Get unique task types from selected eval dataset"""
    if dataset_name in EVAL_DATASETS and 'task_type' in EVAL_DATASETS[dataset_name].columns:
        task_types = EVAL_DATASETS[dataset_name]['task_type'].unique().tolist()
        # Return the list directly; the comprehension already yields clean strings.
        return [str(t) for t in task_types if pd.notna(t)]
    return ["No task types available"]


def get_tasks_by_type_eval(dataset_name, task_type):
    """Get tasks filtered by dataset and task type"""
    if (dataset_name in EVAL_DATASETS
            and 'task_type' in EVAL_DATASETS[dataset_name].columns
            and 'task' in EVAL_DATASETS[dataset_name].columns):
        filtered = EVAL_DATASETS[dataset_name][EVAL_DATASETS[dataset_name]['task_type'] == task_type]
        if len(filtered) > 0:
            # Create display options with index and truncated task content
            tasks = []
            for idx, row in filtered.iterrows():
                task_preview = str(row['task'])[:100] + "..." if len(str(row['task'])) > 100 else str(row['task'])
                tasks.append(f"Row {idx}: {task_preview}")
            return tasks
    return ["No tasks found"]


def get_selected_row_data_by_type(dataset_name, task_type):
    """Get all data for the first row of a selected dataset and task type"""
    if (dataset_name in EVAL_DATASETS
            and 'task_type' in EVAL_DATASETS[dataset_name].columns
            and 'task' in EVAL_DATASETS[dataset_name].columns):
        filtered = EVAL_DATASETS[dataset_name][EVAL_DATASETS[dataset_name]['task_type'] == task_type]
        if len(filtered) > 0:
            row = filtered.iloc[0]  # Get the first row

            # Extract all fields with safe handling for missing columns
            task = str(row.get('task', 'N/A'))
            input_model = str(row.get('input_model', 'N/A'))
            expected_response = str(row.get('expected_response', 'N/A'))
            loggenix_output = str(row.get('loggenix_output', 'N/A'))
            output_model = str(row.get('output_model', 'N/A'))
            input_text = str(row.get('input', 'N/A'))

            return input_model, output_model, task, input_text, expected_response, loggenix_output
    return "", "", "", "", "", ""


# ===== TAB 3: VIEW FLAGGED RESPONSES =====
def read_flagged_messages():
    """Read flagged messages from log file"""
    try:
        if not os.path.exists("logs/flagged_responses.log"):
            return pd.DataFrame()

        with open("logs/flagged_responses.log", "r") as f:
            flagged_messages = f.readlines()

        if not flagged_messages:
            return pd.DataFrame()

        table_data = []
        for entry in flagged_messages:
            data = json.loads(entry)
            table_data.append({
                "Timestamp": data.get("timestamp", "N/A"),
                "Flag Reason": data.get("flag_reason", "N/A"),
                "Flagged Message": data.get("flagged_message", "N/A")[:100] + "...",
                "Conversation Context": str(len(data.get("conversation_context", []))) + " messages"
            })
        return pd.DataFrame(table_data)
    except Exception as e:
        return pd.DataFrame({"Error": [f"Error reading flagged messages: {str(e)}"]})


def handle_row_select(evt: gr.SelectData):
    """Handle row selection in flagged messages table"""
    try:
        if not os.path.exists("logs/flagged_responses.log"):
            return []

        with open("logs/flagged_responses.log", "r") as f:
            flagged_messages_log = f.readlines()

        if evt.index[0] < len(flagged_messages_log):
            selected_entry = json.loads(flagged_messages_log[evt.index[0]])
            conversation_context = selected_entry.get("conversation_context", [])
            return conversation_context
        return []
    except Exception as e:
        return [("System", f"Error loading conversation: {str(e)}")]


# ===== MAIN INTERFACE =====
def create_interface():
    with gr.Blocks(title="AI Tasks Evaluation Suite", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🤖 AI Tasks Evaluation Suite")
        gr.Markdown("Comprehensive platform for AI model evaluation and testing")

        with gr.Tabs():
            # TAB 5: ABOUT (Updated with comprehensive application guide using Groups and Accordions)
            with gr.Tab("ℹ️ About"):
                with gr.Group():
                    gr.Markdown("# 🤖 AI Tasks Evaluation Suite - User Guide")

                    with gr.Accordion("📋 Application Overview", open=True, elem_classes="panel"):
                        gr.Markdown("""
The **AI Tasks Evaluation Suite** is a comprehensive platform for testing, evaluating, and monitoring AI model performance.
This application provides multiple interfaces for model inference, dataset evaluation, response flagging, and performance analysis.

**Key Features:**
- Interactive model testing with real-time inference
- Comprehensive dataset exploration and comparison
- Response quality monitoring and flagging system
- Performance analysis and evaluation metrics
""")

                    with gr.Accordion("🔧 Quick Start Guide", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🧪 **For Model Testing:**
1. Go to **Inference Use Case** tab
2. Select a task type from dropdown
3. Choose inference configuration based on your needs
4. Start chatting with the model
5. Flag any problematic responses for review

### 📊 **For Dataset Exploration:**
1. Go to **Eval Samples** tab
2. Select dataset from dropdown
3. Choose task type to filter results
4. Compare expected vs actual outputs
5. Use insights for model evaluation

### 🔍 **For Quality Monitoring:**
1. Use flagging feature during testing
2. Review flagged responses in **View Flagged Responses** tab
3. Analyze patterns for model improvement

### 💡 **Tips:**
- Start with "Optimized for Speed" for general testing
- Use specific task types to focus your evaluation
- Flag responses immediately when issues are noticed
- Regularly review flagged responses for patterns
- Check system requirements before running intensive tasks
""")

                    with gr.Accordion("🚀 Tab 1: Inference Use Case", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🎯 **Purpose:**
Interactive model testing with real-time inference and response quality monitoring.

### 📝 **How to Use:**
1. **Select Task Type**: Choose from available task types (loaded from the inference dataset)
2. **Configure Inference**: Select optimization level:
   - `Optimized for Speed`: Fast responses (max 512 tokens)
   - `Middle-ground`: Comprehensive answers (max 2048 tokens)
   - `Full Capacity`: Maximum context utilization (max 8192 tokens)
3. **Review System Prompt**: The system prompt auto-loads based on task type (editable)
4. **Chat Interface**:
   - Enter messages in the input field
   - View conversation history in the chat display
   - Responses are generated using the selected model configuration

### 🚩 **Response Flagging Feature:**
- **Select Response**: Choose any AI assistant response from the dropdown
- **Add Reason**: Provide context for why the response needs flagging
- **Flag Response**: Submit the flag (logged to `logs/flagged_responses.log`)
- **Use Cases**: Mark inappropriate, incorrect, or problematic responses for review

### 🎮 **Controls:**
- `Send`: Submit your message for inference
- `Clear History`: Reset conversation (also clears flagging options)
- `Flag Response`: Mark selected response with reason
""")

                    with gr.Accordion("📊 Tab 2: Eval Samples", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🎯 **Purpose:**
Explore evaluation datasets and compare expected vs actual model outputs.

### 📚 **Available Datasets:**
- **Loggenix Synthetic AI Tasks Eval (small)**: Compact evaluation set
- **Loggenix Synthetic AI Tasks Eval v5 (large)**: Extended evaluation dataset (large models)
- **Loggenix Synthetic AI Tasks Eval v6 (large)**: Extended evaluation dataset (small models)
- **Loggenix Synthetic AI Tasks Eval v7 (large)**: Latest evaluation dataset (large models)

### 📝 **How to Use:**
1. **Select Dataset**: Choose from the dropdown (displays dataset info automatically)
2. **Choose Task Type**: Filter by specific task categories within the dataset
3. **Review Task Details**: View comprehensive information including:
   - **Input Model**: Model used for generating the task
   - **Output Model**: Model used for generating responses
   - **Task**: The actual task/instruction given to the model
   - **Input**: Raw input data or context
   - **Expected Response**: Ground truth or reference output
   - **Loggenix Output**: Model's actual response

### 🔍 **Data Fields Explained:**
- **task_type**: Category/classification of the task
- **task**: The instruction or prompt given to the model
- **input**: Additional context or data provided
- **input_model**: Model that created the task
- **output_model**: Model that generated the response
- **expected_response**: Reference/ground truth answer
- **loggenix_output**: Actual model output for comparison
""")

                    with gr.Accordion("👀 Tab 3: View Flagged Responses", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🎯 **Purpose:**
Review and analyze previously flagged responses for quality control.

### ✨ **Features:**
- **Flagged Messages Table**: Shows all flagged responses with:
  - Timestamp of when flagged
  - Reason provided for flagging
  - Preview of flagged message (first 100 characters)
  - Conversation context summary

### 📝 **How to Use:**
1. **Refresh Data**: Click refresh to load latest flagged responses
2. **Select Row**: Click on any row to view full conversation context
3. **Review Context**: Examine the complete conversation leading to the flagged response
4. **Quality Analysis**: Use this data for model improvement and monitoring

### 💡 **Use Cases:**
- Monitor model response quality over time
- Identify patterns in problematic responses
- Gather data for model retraining or fine-tuning
- Quality assurance and compliance checking
""")

                    with gr.Accordion("📈 Tab 4: Model Eval Results", open=False, elem_classes="panel"):
                        gr.Markdown("""
### ✅ **Status:** Live & Fully Functional

### 🎯 **Current Features:**
- **Comprehensive Benchmark Analysis**: Compare Loggenix MoE 330M against multiple baseline models
- **Interactive Visualizations**: Plotly charts with hover details, zoom, and pan capabilities
- **Static Chart Options**: High-quality Matplotlib plots for presentations and reports
- **Dual Evaluation Modes**:
  - Zero-shot performance analysis
  - Few-shot learning capabilities assessment
- **5 Standard Benchmarks**: MMLU, HellaSwag, PIQA, ARC, and WinoGrande
- **Parameter Efficiency Analysis**: Performance-to-size ratio visualizations
- **Competitive Analysis**: Automated summaries highlighting model strengths
- **Educational Content**: Detailed explanations of each benchmark's purpose

### 📊 **Benchmark Coverage:**
- **MMLU**: 57 academic subjects knowledge assessment
- **HellaSwag**: Common sense reasoning evaluation
- **PIQA**: Physical interaction understanding
- **ARC**: Grade-school scientific reasoning
- **WinoGrande**: Commonsense pronoun resolution

### 🏆 **Key Insights Available:**
- Model performance comparison against 4-5 baseline models per evaluation type
- Parameter efficiency rankings and analysis
- Identification of competitive benchmark areas
- Performance scaling analysis relative to model size
- Architecture efficiency demonstrations (MoE vs traditional models)

### 🎨 **Visualization Options:**
- Interactive bar charts for each benchmark
- Parameter efficiency scatter plots with log scaling
- Color-coded performance comparisons (outperformed models highlighted)
- Responsive layouts optimized for different screen sizes
- Export-ready static plots for research and presentations
""")

                    with gr.Accordion("🛠️ Technical Specifications", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🤖 **Model Details:**
**Primary Model**: `kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1`
- **Architecture**: Mixture of Experts (MoE)
- **Total Parameters**: 330M (16 experts, 2 active)
- **Active Parameters**: 185M
- **Context Length**: 8192 tokens
- **Precision**: FP16
- **Flash Attention**: Supported
- **Tool Calling**: Enabled

### ⚙️ **Performance Configurations:**
```python
configs = {
    "Optimized for Speed": {
        "max_new_tokens": 512,
        "temperature": 0.7,
        "do_sample": True,
        "use_cache": False
    },
    "Middle-ground": {
        "max_new_tokens": 2048,
        "temperature": 0.8,
        "do_sample": True,
        "use_cache": False
    },
    "Full Capacity": {
        "max_new_tokens": 8192,
        "temperature": 0.9,
        "do_sample": True,
        "use_cache": False
    }
}
```

### 💻 **System Requirements:**
- **GPU**: GTX 1050 Ti or RTX 3060 (2-4GB VRAM sufficient)
- **Memory**: 8GB+ system RAM
- **VRAM**: 2-4GB GPU memory (450MB for 8-bit quantized)
- **Python**: 3.8+
- **Dependencies**: transformers, torch, gradio, datasets, plotly
""")

                    with gr.Accordion("⚡ Model Deployment Guide", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🚀 **Two Deployment Methods Available**

This application supports two different model backends for maximum flexibility:

---

## 🐳 **Method 1: Ollama (GGUF Quantized) - Recommended**

### 📥 **Installation & Setup:**
```bash
# Install Ollama
curl -fsSL https://ollama.com/install.sh | sh

# Start Ollama service
ollama serve

# Pull the quantized model (Q8_0 format)
ollama pull hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1-Q8_0-GGUF:Q8_0
```

### ✅ **Advantages:**
- **Lower VRAM Usage**: ~1-2GB VRAM on GPU
- **Faster Loading**: Pre-optimized GGUF format
- **CPU Fallback**: Can run on CPU if needed
- **Easy Management**: Simple model pulling and switching
- **Built-in API**: REST API included out of the box

### ⚙️ **Configuration:**
```python
OLLAMA_BASE_URL = "http://localhost:11434"
MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1-Q8_0-GGUF:Q8_0"
```

### 🛠️ **Tool Calling Support:**
Both formats supported:
```python
# Legacy format
[TOOL_CALL:calculate_numbers(operation="add", num1="10", num2="5")]

# JSON format
{"name": "calculate_numbers", "parameters": {"operation": "add", "num1": "10", "num2": "5"}}
```

---

## 🤗 **Method 2: HuggingFace Transformers (8-bit Quantized)**

### 📥 **Installation & Setup:**
```bash
# Install dependencies
pip install transformers torch bitsandbytes accelerate

# Model will auto-download on first use
```

### ⚙️ **Quantization Configuration:**
```python
# 8-bit quantization (recommended)
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
)

# 4-bit quantization (maximum memory savings)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)
```

### ✅ **Advantages:**
- **Native Integration**: Direct HuggingFace ecosystem access
- **Fine-grained Control**: Detailed generation parameters
- **Memory Optimization**: Gradient checkpointing, flash attention
- **Advanced Features**: Custom sampling, beam search, etc.
- **Development Flexibility**: Easy model modifications

### 🧠 **Memory Optimizations:**
```python
# Performance optimizations enabled
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Flash attention support
attn_implementation="flash_attention_2"

# Gradient checkpointing
model.gradient_checkpointing_enable()
```

---

## 📊 **Performance Comparison**

| Feature | Ollama (GGUF) | Transformers (8-bit) |
|---------|---------------|----------------------|
| **VRAM Usage** | ~450MB | ~1.2GB |
| **Load Time** | Fast (< 30s) | Fast (~30s) |
| **Inference Speed (GPU)** | 1-3 seconds | 1-3 seconds |
| **Inference Speed (CPU)** | 10-30 seconds | N/A |
| **Memory Efficiency** | Excellent | Excellent |
| **Setup Complexity** | Simple | Moderate |
| **Customization** | Limited | High |

---

## 🔄 **Switching Between Methods**

### **Application Configuration:**
The application automatically detects which backend to use:
```python
# Check for Ollama first
if check_ollama_connection():
    use_ollama_handler()
else:
    use_transformers_handler()
```

### **Environment Variables:**
```bash
# Force specific backend
export MODEL_BACKEND="ollama"  # or "transformers"
export OLLAMA_HOST="localhost:11434"
export MODEL_CACHE_DIR="/path/to/cache"
```

---

## 🏗️ **Architecture Details**

### **Model Handler Structure:**
```
model_handlers/
├── model_handler_ollama.py        # Ollama GGUF implementation
├── model_handler_transformers.py  # HuggingFace implementation
└── base_handler.py                # Common interface
```

### **Common Tool System:**
Both handlers share the same tool execution system:
- Mathematical operations (`calculate_numbers`)
- Extensible tool registry
- Error handling and validation
- Response processing and formatting

### **API Compatibility:**
Both implementations expose the same interface:
```python
generate_response(
    system_prompt: str,
    user_input: str,
    config_name: str = "Middle-ground"
) -> str
```

---

## 💡 **Recommendations**

### **Use Ollama When:**
- Limited VRAM (< 2GB)
- CPU-only system available
- Quick deployment needed
- Running on older hardware
- Simple integration requirements

### **Use Transformers When:**
- GPU available (even 2GB is plenty)
- Maximum performance needed
- Custom model modifications required
- Advanced generation features needed
- Development/research environment

### **Hardware Recommendations:**
- **Ultra Budget**: Any 2GB GPU from 2016+ (GTX 1050, RX 460)
- **CPU-only**: Dual-core processor + 16GB RAM (Ollama)
- **Entry Level**: GTX 1050 Ti (4GB) - massive overkill
- **Mid-range**: RTX 3060 (12GB) - uses only ~10% of available VRAM

### **Real-world Performance (Tested on RTX 3060):**
- **8-bit Quantized**: Only 450MB VRAM usage
- **FP16 Standard**: 1.2GB VRAM usage
- **Response Time**: 1-3 seconds consistently
- **CPU Inference**: 10-30 seconds on 2-core system with 16GB RAM
- **Load Time**: Under 30 seconds for both methods
- **Minimum Hardware**: Any 2GB GPU will work comfortably

### **Key Performance Facts:**
- Model runs on hardware far less powerful than originally specified
- RTX 3060 uses less than 10% of its 12GB VRAM capacity
- Older GPUs like GTX 1050 Ti (4GB) provide excellent performance
- CPU-only inference is viable for non-real-time applications
- No need for high-end hardware like RTX 4090
""")

                    with gr.Accordion("📁 Logging & Data Storage", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🚩 **Flagged Responses:**
- **Location**: `logs/flagged_responses.log`
- **Format**: JSON entries with timestamp, reason, message, and context
- **Usage**: Quality control, model improvement, compliance tracking

### 📁 **Data Sources:**
- **Inference Dataset**: `kshitijthakkar/synthetic-ai-tasks-eval-v5`
- **Evaluation Datasets**: Multiple versions of synthetic AI tasks with expected outputs
- **Model Outputs**: Generated in real-time during inference
""")

                    with gr.Accordion("🆘 Troubleshooting", open=False, elem_classes="panel"):
                        gr.Markdown("""
### ⚠️ **Common Issues:**
- **Dataset Loading**: Ensure internet connection for HuggingFace dataset access
- **Model Inference**: Check GPU memory availability
- **Flagging**: Verify logs directory permissions
- **Performance**: Adjust inference configuration for your hardware

### 🛠️ **Support:**
- Check console output for detailed error messages
- Verify model handler configuration (`model_handler_ollama.py`)
- Ensure all dependencies are installed
- Monitor GPU memory usage during inference
- Check file permissions for logging directories
""")

                with gr.Group():
                    gr.Markdown("""
---
**Developed by**: Kshitij Thakkar
**Version**: AI Tasks Evaluation Suite v1.0
**Last Updated**: September 2025
""")

            # TAB 1: INFERENCE USE CASE WITH INTEGRATED FLAGGING
            with gr.Tab("🚀 Inference Use Case"):
                gr.Markdown("## Model Inference Testing with Response Flagging")

                with gr.Row():
                    with gr.Column(scale=1):
                        # Task type dropdown
                        task_type_dropdown = gr.Dropdown(
                            choices=get_task_types(),
                            value=get_task_types()[0] if get_task_types() else None,
                            label="Task Type",
                            info="Select task type to load system prompt"
                        )

                        # Inference configuration
                        inference_config = gr.Dropdown(
                            choices=list(get_inference_configs().keys()),
                            value="Optimized for Speed",
                            label="Inference Configuration",
                            info="Select inference optimization level"
                        )

                    with gr.Column(scale=2):
                        # System prompt (editable)
                        system_prompt = gr.Textbox(
                            label="System Prompt (Editable)",
                            lines=6,
                            max_lines=10,
                            placeholder="Select a task type to load system prompt...",
                            interactive=True
                        )

                # Chat interface section
                gr.Markdown("### 💬 Chat Interface")

                with gr.Row():
                    with gr.Column(scale=2):
                        # Chat display (replacing the old textbox); history entries are
                        # (speaker, text) tuples shared with the flagging helpers below.
                        chat_display = gr.Chatbot(label="Conversation History", height=400)
                        chat_history_state = gr.State([])

                        # Chat input
                        with gr.Row():
                            chat_input = gr.Textbox(
                                placeholder="Enter your message here...",
                                label="Your Message",
                                scale=4
                            )
                            send_btn = gr.Button("Send", variant="primary", scale=1)

                        with gr.Row():
                            clear_chat_btn = gr.Button("🗑️ Clear History", variant="secondary")

                    # Flagging section
                    with gr.Column(scale=1):
                        gr.Markdown("### 🚩 Flag Response")

                        flagged_message_index = gr.Dropdown(
                            label="Select a response to flag",
                            choices=["No responses available"],
                            value="No responses available",
                            interactive=True
                        )

                        selected_message_display = gr.Textbox(
                            label="Selected Response",
                            interactive=False,
                            lines=4,
                            max_lines=6
                        )

                        flag_reason = gr.Textbox(
                            placeholder="Enter reason for flagging...",
                            label="Reason for Flagging"
                        )

                        flag_btn = gr.Button("🚩 Flag Response", variant="stop")
                        flag_output = gr.Textbox(label="Flagging Status", visible=True, lines=2)

                # Event handlers for Tab 1
                task_type_dropdown.change(
                    fn=get_task_by_type,
                    inputs=[task_type_dropdown],
                    outputs=[system_prompt]
                )

                # Chat functionality
                send_btn.click(
                    chat_interface_with_inference,
                    inputs=[chat_input, chat_history_state, system_prompt, inference_config],
                    outputs=[chat_display, chat_input]
                ).then(
                    lambda x: x,  # Update state
                    inputs=[chat_display],
                    outputs=[chat_history_state]
                ).then(
                    get_assistant_responses,
                    inputs=[chat_history_state],
                    outputs=[flagged_message_index]
                )

                # Enter key support for chat input
                chat_input.submit(
                    chat_interface_with_inference,
                    inputs=[chat_input, chat_history_state, system_prompt, inference_config],
                    outputs=[chat_display, chat_input]
                ).then(
                    lambda x: x,  # Update state
                    inputs=[chat_display],
                    outputs=[chat_history_state]
                ).then(
                    get_assistant_responses,
                    inputs=[chat_history_state],
                    outputs=[flagged_message_index]
                )

                clear_chat_btn.click(
                    clear_inference_history,
                    outputs=[chat_display, flagged_message_index]
                ).then(
                    lambda: [],
                    outputs=[chat_history_state]
                )

                # Flagging functionality
                flagged_message_index.change(
                    display_selected_message,
                    inputs=[flagged_message_index, chat_history_state],
                    outputs=[selected_message_display]
                )

                flag_btn.click(
                    flag_response,
                    inputs=[chat_history_state, flagged_message_index, flag_reason],
                    outputs=[flag_output]
                )

            # TAB 2: EVAL SAMPLES
            with gr.Tab("📊 Eval Samples"):
                gr.Markdown("## Dataset Evaluation Samples")
                gr.Markdown("Select dataset and task type to view detailed information")

                with gr.Row():
                    with gr.Column(scale=1):
                        eval_dataset_dropdown = gr.Dropdown(
                            choices=list(EVAL_DATASETS.keys()),
                            value=list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else None,
                            label="Select Dataset",
                            info="Choose evaluation dataset to view"
                        )

                        eval_task_type_dropdown = gr.Dropdown(
                            choices=[],
                            label="Select Task Type",
                            info="Choose task type from selected dataset",
                            allow_custom_value=True
                        )

                    with gr.Column(scale=1):
                        eval_dataset_info = gr.Markdown(
                            get_eval_dataset_info(list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else "")
                        )

                # Task details section
                gr.Markdown("### Task Details")

                with gr.Row():
                    input_model_field = gr.Textbox(
                        label="input_model",
                        lines=1,
                        interactive=False
                    )
                    output_model_field = gr.Textbox(
                        label="output_model",
                        lines=1,
                        interactive=False
                    )

                with gr.Row():
                    task_field = gr.Textbox(
                        label="Task",
                        lines=2,
                        max_lines=5,
                        interactive=False
                    )

                with gr.Row():
                    input_field = gr.Textbox(
                        label="input",
                        lines=12,
                        max_lines=20,
                        interactive=False
                    )

                # Large text fields for outputs side by side
                gr.Markdown("### Expected vs Actual Response Comparison")

                with gr.Row():
                    expected_response_field = gr.Textbox(
                        label="Expected Response",
                        lines=30,
                        max_lines=40,
                        interactive=False
                    )
                    loggenix_output_field = gr.Textbox(
                        label="Loggenix Output",
                        lines=30,
                        max_lines=40,
                        interactive=False
                    )

                # Event handlers for Tab 2
                # Use a named function instead of a lambda for clarity
                def update_eval_components(dataset_name):
                    info = get_eval_dataset_info(dataset_name)
                    task_types = get_task_types_for_eval(dataset_name)
                    return info, gr.update(
                        choices=task_types,
                        value=task_types[0] if task_types else "No task types available"
                    )

                eval_dataset_dropdown.change(
                    fn=update_eval_components,
                    inputs=[eval_dataset_dropdown],
                    outputs=[eval_dataset_info, eval_task_type_dropdown]
                )

                eval_task_type_dropdown.change(
                    fn=get_selected_row_data_by_type,
                    inputs=[eval_dataset_dropdown, eval_task_type_dropdown],
                    outputs=[input_model_field, output_model_field, task_field, input_field,
                             expected_response_field, loggenix_output_field]
                )

                # NOTE: get_tasks_by_type_eval is not wired to a dropdown here;
                # only the first matching row for each task type is displayed.

            # TAB 3: VIEW FLAGGED RESPONSES (RENAMED FROM TAB 4)
            with gr.Tab("👀 View Flagged Responses"):
                gr.Markdown("## Review Flagged Responses")

                with gr.Row():
                    with gr.Column():
                        flagged_messages_display = gr.Dataframe(
                            headers=["Timestamp", "Flag Reason", "Flagged Message", "Conversation Context"],
                            interactive=False,
                            max_height=400
                        )
                        refresh_btn = gr.Button("🔄 Refresh", variant="primary")

                    with gr.Column():
                        conversation_context_display = gr.Chatbot(
                            label="Conversation Context",
                            height=400
                        )

                # Event handlers for Tab 3
                flagged_messages_display.select(
                    handle_row_select,
                    outputs=[conversation_context_display]
                )

                refresh_btn.click(
                    read_flagged_messages,
                    outputs=[flagged_messages_display]
                )

            # TAB 4: MODEL EVAL RESULTS (UPDATED WITH BENCHMARK FUNCTIONALITY)
            with gr.Tab("📈 Model Eval Results"):
                gr.Markdown("## Model Evaluation Results")
                gr.Markdown("### 🚀 Loggenix MoE 330M Benchmark Analysis")
                gr.Markdown(
                    "Compare Loggenix MoE 330M performance against other language models across common "
                    "benchmarks including MMLU, HellaSwag, PIQA, ARC, and WinoGrande.")

                # Create the benchmark plotter instance
                plotter = BenchmarkPlotter()

                def plot_comparison(shot_type, plot_type):
                    if plot_type == "Interactive (Plotly)":
                        fig = plotter.create_plotly_interactive(shot_type.lower())
                    else:
                        fig = plotter.create_matplotlib_comparison(shot_type.lower())
                    summary = plotter.create_competitive_analysis_summary(shot_type.lower())
                    return fig, summary

                # Control panel for benchmark comparison
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("#### 🎛️ Evaluation Controls")

                        shot_type = gr.Radio(
                            choices=["Zero", "Few"],
                            value="Few",
                            label="Shot Type",
                            info="Select zero-shot or few-shot evaluation"
                        )

                        plot_type = gr.Radio(
                            choices=["Interactive (Plotly)", "Static (Matplotlib)"],
                            value="Interactive (Plotly)",
                            label="Plot Type",
                            info="Choose visualization style"
                        )

                        plot_button = gr.Button("🔄 Generate Comparison", variant="primary", size="lg")

                        # Quick stats
                        gr.Markdown("#### 📊 Quick Stats")
                        gr.Markdown("""
- **Model Size**: 330M parameters
- **Architecture**: Mixture of Experts (MoE)
- **Benchmarks**: 5 standard evaluations
- **Comparison**: Against 4-5 other models per evaluation type
""")

                    with gr.Column(scale=2):
                        gr.Markdown("#### 🏆 Performance Overview")
                        performance_summary = gr.Markdown("""
**Few-Shot Highlights:**
- PIQA: 80.0% (competitive with much larger models)
- WinoGrande: 50.0% (solid common sense reasoning)
- Parameter efficient compared to larger alternatives

**Zero-Shot Highlights:**
- PIQA: 55.0% (strong physical reasoning)
- Competitive performance across multiple benchmarks
- Efficient architecture for the parameter count
""")

                # Main visualization area
                with gr.Row():
                    with gr.Column(scale=3):
                        gr.Markdown("#### 📈 Benchmark Comparison Charts")
                        plot_output = gr.Plot(label="Performance Comparison")

                    with gr.Column(scale=1):
                        gr.Markdown("#### 📋 Analysis Summary")
                        summary_output = gr.Markdown(
                            value="Select evaluation parameters and click 'Generate Comparison' to see detailed analysis.")

                # Benchmark details section
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("#### 🎯 Benchmark Descriptions")
                        gr.Markdown("""
**MMLU (Massive Multitask Language Understanding)**
- Tests knowledge across 57 academic subjects
- Measures broad knowledge and reasoning ability

**HellaSwag**
- Tests common sense reasoning about everyday situations
- Measures ability to predict plausible continuations

**PIQA (Physical Interaction QA)**
- Tests physical common sense reasoning
- Measures understanding of how the physical world works
""")

                    with gr.Column():
                        gr.Markdown("#### 🧠 Additional Benchmarks")
                        gr.Markdown("""
**ARC (AI2 Reasoning Challenge)**
- Tests grade-school level scientific reasoning
- Measures ability to answer science exam questions

**WinoGrande**
- Tests commonsense reasoning through pronoun resolution
- Measures understanding of implicit relationships

**Parameter Efficiency**
- Compares performance relative to model size
- Shows value proposition of architectural choices
""")

                # Examples and presets
                with gr.Row():
                    gr.Markdown("#### 🎲 Quick Comparisons")

                    # Examples for quick access
                    gr.Examples(
                        examples=[
                            ["Few", "Interactive (Plotly)"],
                            ["Zero", "Interactive (Plotly)"],
                            ["Few", "Static (Matplotlib)"],
                            ["Zero", "Static (Matplotlib)"]
                        ],
                        inputs=[shot_type, plot_type],
                        label="Try these preset configurations:"
                    )

                # Event handlers
                plot_button.click(
                    fn=plot_comparison,
                    inputs=[shot_type, plot_type],
                    outputs=[plot_output, summary_output]
                )

                # Auto-generate on load with few-shot interactive plot
                gr.Markdown("---")
                gr.Markdown(
                    "💡 **Tip**: The interactive plots allow you to hover for detailed values and zoom into "
                    "specific benchmark areas.")
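                # Sketch (not wired up): the "auto-generate on load" idea above could be
                # implemented with a load event in this Blocks context, assuming the
                # default Few-shot / Plotly selection, e.g.:
                #
                #     demo.load(
                #         fn=lambda: plot_comparison("Few", "Interactive (Plotly)"),
                #         outputs=[plot_output, summary_output],
                #     )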
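        # Each line of logs/flagged_responses.log is one JSON object written by
        # flag_response, e.g.
        #   {"timestamp": "...", "flag_reason": "...", "flagged_message": "...",
        #    "conversation_context": [["You", "..."], ["AI Assistant", "..."]]}
        # read_flagged_messages() parses these lines into the table populated below.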
        # Load initial data
        demo.load(
            fn=read_flagged_messages,
            outputs=[flagged_messages_display]
        )

    return demo


# Launch the application
if __name__ == "__main__":
    print("Starting AI Tasks Evaluation Suite...")
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True
        # ,mcp_server=True
    )
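
# ---------------------------------------------------------------------------
# Reference sketch (not used by the app): a minimal Ollama-backed handler with
# the same interface that model_handler_ollama is expected to expose, assuming
# the OLLAMA_BASE_URL / MODEL_NAME values documented in the About tab and
# Ollama's standard /api/chat endpoint. Names are deliberately different so
# nothing here shadows the real imports at the top of this file.
# ---------------------------------------------------------------------------
def _sketch_get_inference_configs():
    """Mirror of the three documented configurations (token budget + temperature)."""
    return {
        "Optimized for Speed": {"max_new_tokens": 512, "temperature": 0.7},
        "Middle-ground": {"max_new_tokens": 2048, "temperature": 0.8},
        "Full Capacity": {"max_new_tokens": 8192, "temperature": 0.9},
    }


def _sketch_generate_response(system_prompt, user_input, config_name="Middle-ground"):
    """Send one chat turn to a local Ollama server and return the reply text."""
    import requests  # local import so the sketch stays self-contained

    base_url = "http://localhost:11434"
    model_name = ("hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-"
                  "v7-sft-v1-Q8_0-GGUF:Q8_0")
    configs = _sketch_get_inference_configs()
    config = configs.get(config_name, configs["Optimized for Speed"])

    payload = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_input},
        ],
        "stream": False,
        "options": {
            "temperature": config["temperature"],
            "num_predict": config["max_new_tokens"],
        },
    }
    # POST /api/chat returns {"message": {"role": "assistant", "content": ...}, ...}
    resp = requests.post(f"{base_url}/api/chat", json=payload, timeout=300)
    resp.raise_for_status()
    return resp.json()["message"]["content"]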