import gradio as gr
import pandas as pd
from datasets import load_dataset
import plotly.graph_objects as go
import datetime
import json
import random
import os

# from model_handler import generate_response, get_inference_configs
# from enhanced_model_handler import generate_response, get_inference_configs
from model_handler_ollama import generate_response, get_inference_configs
# import torch

# Plotting and analysis imports
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots

# Set style for better-looking plots
plt.style.use('default')
sns.set_palette("husl")

# Configuration for datasets
DATASET_CONFIGS = {
    'Loggenix Synthetic AI Tasks Eval (with outputs)-small': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval-with-outputs',
        'split': 'train'
    },
    'Loggenix Synthetic AI Tasks Eval (with outputs) v5-large': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v5-with-outputs',
        'split': 'train'
    },
    'Loggenix Synthetic AI Tasks Eval (with outputs) v6-large': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v6-with-outputs',
        'split': 'train'
    },
    'Loggenix Synthetic AI Tasks Eval (with outputs) v7-large': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v5-with-outputs-v7-sft-v1',
        'split': 'train'
    }
}


class BenchmarkPlotter:
    def __init__(self):
        # Zero-shot benchmark data
        self.zero_shot_data = {
            'Model': ['Loggenix MoE 330M', 'SmolLM-135M', 'GPT2-137M', 'SmolLM-360M', 'Qwen2-500M', 'SmolLM-1.7B'],
            'Parameters': ['330M', '135M', '137M', '360M', '500M', '1.7B'],
            'Param_Numeric': [330, 135, 137, 360, 500, 1700],
            'MMLU': [24.6, 30.23, 26.29, 34.17, 31.92, 39.97],
            'HellaSwag': [25.0, 42.3, 29.76, 53.8, 47.61, 64.1],
            'PIQA': [55.0, 69.6, 62.51, 72.0, 69.31, 77.3],
            'ARC': [15.0, 44.0, 31.09, 51.1, 39.74, 61.55],
            'WinoGrande': [40.0, 52.7, 49.72, 53.7, 54.14, 56.0],
            'IsLoggenix': [True, False, False, False, False, False]
        }

        # Few-shot benchmark data
        self.few_shot_data = {
            'Model': ['Loggenix MoE 330M', 'Gemma 3 PT 1B', 'Gemma 3 PT 4B', 'Gemma 3 PT 12B', 'Gemma 3 PT 27B'],
            'Parameters': ['330M', '1B', '4B', '12B', '27B'],
            'Param_Numeric': [330, 1000, 4000, 12000, 27000],
            'MMLU': [25.8, 26.5, 59.6, 74.5, 78.6],
            'HellaSwag': [30.0, 62.3, 77.2, 84.2, 85.6],
            'PIQA': [80.0, 73.8, 79.6, 81.8, 83.3],
            'ARC': [10.0, 38.4, 56.2, 68.9, 70.6],
            'WinoGrande': [50.0, 58.2, 64.7, 74.3, 78.8],
            'IsLoggenix': [True, False, False, False, False]
        }

        self.df_zero = pd.DataFrame(self.zero_shot_data)
        self.df_few = pd.DataFrame(self.few_shot_data)

    def create_matplotlib_comparison(self, shot_type='zero'):
        """Create matplotlib comparison charts"""
        df = self.df_zero if shot_type == 'zero' else self.df_few

        # Create subplots
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle(f'{shot_type.title()}-Shot Benchmark Comparison', fontsize=16, fontweight='bold')

        benchmarks = ['MMLU', 'HellaSwag', 'PIQA', 'ARC', 'WinoGrande']
        axes_flat = axes.flatten()

        # Color palette - highlight Loggenix in red
        colors = ['#ff6b6b' if is_loggenix else '#4a90e2' for is_loggenix in df['IsLoggenix']]

        for i, benchmark in enumerate(benchmarks):
            ax = axes_flat[i]
            bars = ax.bar(range(len(df)), df[benchmark], color=colors, alpha=0.8)

            # Highlight bars where Loggenix outperforms
            loggenix_score = df[df['IsLoggenix']][benchmark].iloc[0]
            for j, (bar, score) in enumerate(zip(bars, df[benchmark])):
                if not df['IsLoggenix'].iloc[j] and score < loggenix_score:
                    bar.set_color('#90EE90')  # Light green for outperformed models

            ax.set_title(f'{benchmark}', fontweight='bold')
            ax.set_ylabel('Score (%)')
            ax.set_xticks(range(len(df)))
            ax.set_xticklabels(df['Model'], rotation=45, ha='right')
            ax.grid(True, alpha=0.3)

            # Add value labels on bars
            for bar, value in zip(bars, df[benchmark]):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width() / 2., height + 0.5,
                        f'{value:.1f}%', ha='center', va='bottom', fontsize=9)

        # Parameter efficiency scatter plot
        ax_scatter = axes_flat[5]
        scatter = ax_scatter.scatter(df['Param_Numeric'], df['MMLU'],
                                     c=colors, s=100, alpha=0.7, edgecolors='black')
        ax_scatter.set_xlabel('Parameters (M)')
        ax_scatter.set_ylabel('MMLU Score (%)')
        ax_scatter.set_title('Parameter Efficiency (MMLU)', fontweight='bold')
        ax_scatter.set_xscale('log')
        ax_scatter.grid(True, alpha=0.3)

        # Add model labels to scatter plot
        for idx, row in df.iterrows():
            ax_scatter.annotate(row['Parameters'],
                                (row['Param_Numeric'], row['MMLU']),
                                xytext=(5, 5), textcoords='offset points',
                                fontsize=8, ha='left')

        plt.tight_layout()
        return fig

    def create_plotly_interactive(self, shot_type='zero'):
        """Create interactive plotly charts"""
        df = self.df_zero if shot_type == 'zero' else self.df_few

        # Create subplots
        fig = make_subplots(
            rows=2, cols=3,
            subplot_titles=('MMLU', 'HellaSwag', 'PIQA', 'ARC', 'WinoGrande', 'Parameter Efficiency'),
            specs=[[{"secondary_y": False}, {"secondary_y": False}, {"secondary_y": False}],
                   [{"secondary_y": False}, {"secondary_y": False}, {"type": "scatter"}]]
        )

        benchmarks = ['MMLU', 'HellaSwag', 'PIQA', 'ARC', 'WinoGrande']
        positions = [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2)]

        for i, (benchmark, pos) in enumerate(zip(benchmarks, positions)):
            loggenix_score = df[df['IsLoggenix']][benchmark].iloc[0]

            # Create colors based on performance vs Loggenix
            bar_colors = []
            for idx, row in df.iterrows():
                if row['IsLoggenix']:
                    bar_colors.append('#ff6b6b')  # Red for Loggenix
                elif row[benchmark] < loggenix_score:
                    bar_colors.append('#90EE90')  # Light green for outperformed
                else:
                    bar_colors.append('#4a90e2')  # Blue for others

            fig.add_trace(
                go.Bar(
                    x=df['Model'],
                    y=df[benchmark],
                    name=benchmark,
                    marker_color=bar_colors,
                    text=[f'{val:.1f}%' for val in df[benchmark]],
                    textposition='outside',
                    showlegend=False,
                ),
                row=pos[0], col=pos[1]
            )

        # Parameter efficiency scatter plot
        fig.add_trace(
            go.Scatter(
                x=df['Param_Numeric'],
                y=df['MMLU'],
                mode='markers+text',
                text=df['Parameters'],
                textposition='top right',
                marker=dict(
                    size=12,
                    color=['#ff6b6b' if is_loggenix else '#4a90e2' for is_loggenix in df['IsLoggenix']],
                    line=dict(width=1, color='black')
                ),
                name='Models',
                showlegend=False
            ),
            row=2, col=3
        )

        # Update layout
        fig.update_layout(
            title_text=f'{shot_type.title()}-Shot Benchmark Comparison',
            title_x=0.5,
            height=800,
            showlegend=False
        )

        # Update x-axis for scatter plot to log scale
        fig.update_xaxes(type="log", row=2, col=3, title_text="Parameters (M)")
        fig.update_yaxes(title_text="MMLU Score (%)", row=2, col=3)

        # Update all y-axes
        for i in range(1, 6):
            if i <= 3:
                fig.update_yaxes(title_text="Score (%)", row=1, col=i)
            elif i <= 5:
                fig.update_yaxes(title_text="Score (%)", row=2, col=i - 3)

        return fig

    def create_competitive_analysis_summary(self, shot_type='zero'):
        """Create a summary of competitive areas"""
        df = self.df_zero if shot_type == 'zero' else self.df_few
        loggenix_row = df[df['IsLoggenix']].iloc[0]

        summary = f"## Loggenix MoE 330M Analysis ({shot_type.title()}-Shot)\n\n"

        benchmarks = ['MMLU', 'HellaSwag', 'PIQA', 'ARC', 'WinoGrande']
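        # A baseline counts as "outperformed" when its score is strictly below the
        # Loggenix row for that benchmark, e.g. on few-shot PIQA the 80.0% entry
        # beats Gemma 3 PT 1B (73.8%) and Gemma 3 PT 4B (79.6%) in the data above.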
        competitive_areas = []
        for benchmark in benchmarks:
            loggenix_score = loggenix_row[benchmark]
            outperformed = df[(~df['IsLoggenix']) & (df[benchmark] < loggenix_score)]

            if len(outperformed) > 0:
                competitive_areas.append(
                    f"**{benchmark} ({loggenix_score:.1f}%)**: Outperforms {len(outperformed)} models"
                )

            # Check for near-competitive performance
            if shot_type == 'few' and benchmark == 'PIQA' and loggenix_score >= 80:
                competitive_areas.append(
                    f"**{benchmark} ({loggenix_score:.1f}%)**: Near 12B model performance (81.8%) with 36x fewer parameters!"
                )

        if competitive_areas:
            summary += "### 🏆 Competitive Areas:\n"
            for area in competitive_areas:
                summary += f"- {area}\n"
        else:
            summary += "### 📊 Performance Notes:\n"
            summary += "- Model shows efficient performance for parameter count\n"
            summary += "- Some benchmarks show room for improvement\n"

        # Parameter efficiency note
        param_rank = df.sort_values('Param_Numeric')['Model'].tolist().index('Loggenix MoE 330M') + 1
        summary += f"\n### 📈 Efficiency Metrics:\n"
        summary += f"- Ranks #{param_rank} in parameter count\n"
        summary += f"- Strong parameter efficiency across benchmarks\n"
        summary += f"- MoE architecture enables competitive performance\n"

        return summary


# Load main dataset for inference tab
def load_inference_dataset():
    """Load the main dataset for inference use case"""
    try:
        print("Loading synthetic-ai-tasks-eval-v5 dataset...")
        dataset = load_dataset(
            'kshitijthakkar/synthetic-ai-tasks-eval-v5',
            split='train',
            trust_remote_code=True
        )
        df = dataset.to_pandas()
        print(f"✓ Successfully loaded: {len(df)} rows, {len(df.columns)} columns")
        return df
    except Exception as e:
        print(f"✗ Error loading dataset: {str(e)}")
        return pd.DataFrame({'Error': [f'Failed to load: {str(e)}']})


# Load dataset for eval samples tab
def load_eval_datasets():
    """Load all datasets for evaluation samples"""
    datasets = {}
    for display_name, config in DATASET_CONFIGS.items():
        try:
            print(f"Loading {display_name}...")
            dataset = load_dataset(
                config['repo_id'],
                split=config['split'],
                trust_remote_code=True
            )
            df = dataset.to_pandas()
            datasets[display_name] = df
            print(f"✓ Successfully loaded {display_name}: {len(df)} rows")
        except Exception as e:
            print(f"✗ Error loading {display_name}: {str(e)}")
            datasets[display_name] = pd.DataFrame({
                'Error': [f'Failed to load: {str(e)}'],
                'Dataset': [config['repo_id']]
            })
    return datasets


# Load datasets
INFERENCE_DATASET = load_inference_dataset()
EVAL_DATASETS = load_eval_datasets()


# ===== TAB 1: INFERENCE USE CASE WITH INTEGRATED FLAGGING =====
def get_task_types():
    """Get unique task types from inference dataset"""
    if 'task_type' in INFERENCE_DATASET.columns:
        task_types = INFERENCE_DATASET['task_type'].unique().tolist()
        return [str(t) for t in task_types if pd.notna(t)]
    return ["No task types available"]


def get_task_by_type(task_type):
    """Get task content by task type"""
    if 'task_type' in INFERENCE_DATASET.columns and 'task' in INFERENCE_DATASET.columns:
        filtered = INFERENCE_DATASET[INFERENCE_DATASET['task_type'] == task_type]
        if len(filtered) > 0:
            return str(filtered.iloc[0]['task'])
    return "No task found for this type"


def chat_interface_with_inference(prompt, history, system_prompt, inference_config):
    """Enhanced chat interface with model inference and history"""
    if not prompt.strip():
        return history, ""

    # Add user message to history
    history.append(("You", prompt))

    try:
        if not system_prompt.strip():
            response = "Please select a task type to load system prompt first."
        else:
            # Get inference configuration
            configs = get_inference_configs()
            config = configs.get(inference_config, configs["Optimized for Speed"])

            # Run inference using the model
            response = generate_response(
                system_prompt=system_prompt,
                user_input=prompt,
                config_name=inference_config
            )

        # Format and add AI response to history
        formatted_response = f"**AI Assistant:**\n{response}"
        history.append(("AI Assistant", formatted_response))

    except Exception as e:
        error_msg = f"**AI Assistant:**\nError during inference: {str(e)}"
        history.append(("AI Assistant", error_msg))

    return history, ""


def flag_response(history, flagged_message, flag_reason):
    """Flag a response"""
    if not flagged_message or flagged_message == "No responses available":
        return "Invalid message selection."

    try:
        flagged_index = int(flagged_message.split()[1][:-1])
        if flagged_index >= len(history) or history[flagged_index][0] != "AI Assistant":
            return "You can only flag assistant responses."

        flagged_message_content = history[flagged_index][1]
        log_entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "flag_reason": str(flag_reason),
            "flagged_message": str(flagged_message_content),
            "conversation_context": history,
        }

        os.makedirs("logs", exist_ok=True)
        with open("logs/flagged_responses.log", "a") as f:
            f.write(json.dumps(log_entry) + "\n")

        return f"Response flagged successfully: {flag_reason}"
    except Exception as e:
        return f"Error flagging response: {str(e)}"


def get_assistant_responses(history):
    """Get dropdown options for assistant responses"""
    responses = [
        f"Response {i}: {str(msg[1])[:50]}..."
        for i, msg in enumerate(history)
        if msg[0] == "AI Assistant"
    ]
    if not responses:
        responses = ["No responses available"]
    return gr.update(choices=responses, value=responses[0] if responses else "No responses available")


def display_selected_message(selected_index, history):
    """Display the selected flagged message"""
    if selected_index == "No responses available":
        return "No responses available"

    try:
        flagged_index = int(selected_index.split()[1][:-1])
        if flagged_index < len(history) and history[flagged_index][0] == "AI Assistant":
            return history[flagged_index][1]
        else:
            return "Invalid selection."
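    # Selection strings that don't match the "Response {i}: ..." labels built by
    # get_assistant_responses fall through to the error branch below.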
    except Exception as e:
        return f"Error: {str(e)}"


def clear_inference_history():
    """Clear chat history for inference tab"""
    return [], gr.update(choices=["No responses available"], value="No responses available")


# ===== TAB 2: EVAL SAMPLES =====
def update_eval_table(dataset_name):
    """Update eval table based on selected dataset"""
    if dataset_name in EVAL_DATASETS:
        return EVAL_DATASETS[dataset_name].head(100)
    return pd.DataFrame()


def get_eval_dataset_info(dataset_name):
    """Get info about selected eval dataset"""
    if dataset_name in EVAL_DATASETS:
        df = EVAL_DATASETS[dataset_name]
        return f"""
**Dataset**: {dataset_name}
- **Rows**: {len(df):,}
- **Columns**: {len(df.columns)}
- **Column Names**: {', '.join(df.columns.tolist())}
"""
    return "No dataset selected"


def get_task_types_for_eval(dataset_name):
    """Get unique task types from selected eval dataset"""
    if dataset_name in EVAL_DATASETS and 'task_type' in EVAL_DATASETS[dataset_name].columns:
        task_types = EVAL_DATASETS[dataset_name]['task_type'].unique().tolist()
        # Return the list directly; the comprehension already yields clean strings.
        return [str(t) for t in task_types if pd.notna(t)]
    return ["No task types available"]


def get_tasks_by_type_eval(dataset_name, task_type):
    """Get tasks filtered by dataset and task type"""
    if (dataset_name in EVAL_DATASETS
            and 'task_type' in EVAL_DATASETS[dataset_name].columns
            and 'task' in EVAL_DATASETS[dataset_name].columns):
        filtered = EVAL_DATASETS[dataset_name][EVAL_DATASETS[dataset_name]['task_type'] == task_type]
        if len(filtered) > 0:
            # Create display options with index and truncated task content
            tasks = []
            for idx, row in filtered.iterrows():
                task_preview = str(row['task'])[:100] + "..." if len(str(row['task'])) > 100 else str(row['task'])
                tasks.append(f"Row {idx}: {task_preview}")
            return tasks
    return ["No tasks found"]


def get_selected_row_data_by_type(dataset_name, task_type):
    """Get all data for the first row of a selected dataset and task type"""
    if (dataset_name in EVAL_DATASETS
            and 'task_type' in EVAL_DATASETS[dataset_name].columns
            and 'task' in EVAL_DATASETS[dataset_name].columns):
        filtered = EVAL_DATASETS[dataset_name][EVAL_DATASETS[dataset_name]['task_type'] == task_type]
        if len(filtered) > 0:
            row = filtered.iloc[0]  # Get the first row

            # Extract all fields with safe handling for missing columns
            task = str(row.get('task', 'N/A'))
            input_model = str(row.get('input_model', 'N/A'))
            expected_response = str(row.get('expected_response', 'N/A'))
            loggenix_output = str(row.get('loggenix_output', 'N/A'))
            output_model = str(row.get('output_model', 'N/A'))
            input_text = str(row.get('input', 'N/A'))

            return input_model, output_model, task, input_text, expected_response, loggenix_output
    return "", "", "", "", "", ""


# ===== TAB 3: VIEW FLAGGED RESPONSES =====
def read_flagged_messages():
    """Read flagged messages from log file"""
    try:
        if not os.path.exists("logs/flagged_responses.log"):
            return pd.DataFrame()

        with open("logs/flagged_responses.log", "r") as f:
            flagged_messages = f.readlines()

        if not flagged_messages:
            return pd.DataFrame()

        table_data = []
        for entry in flagged_messages:
            data = json.loads(entry)
            table_data.append({
                "Timestamp": data.get("timestamp", "N/A"),
                "Flag Reason": data.get("flag_reason", "N/A"),
                "Flagged Message": data.get("flagged_message", "N/A")[:100] + "...",
                "Conversation Context": str(len(data.get("conversation_context", []))) + " messages"
            })
        return pd.DataFrame(table_data)
    except Exception as e:
        return pd.DataFrame({"Error": [f"Error reading flagged messages: {str(e)}"]})


def handle_row_select(evt: gr.SelectData):
    """Handle row selection in flagged messages table"""
    try:
        if not os.path.exists("logs/flagged_responses.log"):
            return []

        with open("logs/flagged_responses.log", "r") as f:
            flagged_messages_log = f.readlines()

        if evt.index[0] < len(flagged_messages_log):
            selected_entry = json.loads(flagged_messages_log[evt.index[0]])
            conversation_context = selected_entry.get("conversation_context", [])
            return conversation_context
        return []
    except Exception as e:
        return [("System", f"Error loading conversation: {str(e)}")]


# ===== MAIN INTERFACE =====
def create_interface():
    with gr.Blocks(title="AI Tasks Evaluation Suite", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🤖 AI Tasks Evaluation Suite")
        gr.Markdown("Comprehensive platform for AI model evaluation and testing")

        with gr.Tabs():
            # TAB 5: ABOUT (Updated with comprehensive application guide using Groups and Accordions)
            with gr.Tab("ℹ️ About"):
                with gr.Group():
                    gr.Markdown("# 🤖 AI Tasks Evaluation Suite - User Guide")

                    with gr.Accordion("📋 Application Overview", open=True, elem_classes="panel"):
                        gr.Markdown("""
The **AI Tasks Evaluation Suite** is a comprehensive platform for testing, evaluating, and monitoring AI model performance.
This application provides multiple interfaces for model inference, dataset evaluation, response flagging, and performance analysis.

**Key Features:**
- Interactive model testing with real-time inference
- Comprehensive dataset exploration and comparison
- Response quality monitoring and flagging system
- Performance analysis and evaluation metrics
""")

                    with gr.Accordion("🔧 Quick Start Guide", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🧪 **For Model Testing:**
1. Go to **Inference Use Case** tab
2. Select a task type from dropdown
3. Choose inference configuration based on your needs
4. Start chatting with the model
5. Flag any problematic responses for review

### 📊 **For Dataset Exploration:**
1. Go to **Eval Samples** tab
2. Select dataset from dropdown
3. Choose task type to filter results
4. Compare expected vs actual outputs
5. Use insights for model evaluation

### 🔍 **For Quality Monitoring:**
1. Use flagging feature during testing
2. Review flagged responses in **View Flagged Responses** tab
3. Analyze patterns for model improvement

### 💡 **Tips:**
- Start with "Optimized for Speed" for general testing
- Use specific task types to focus your evaluation
- Flag responses immediately when issues are noticed
- Regularly review flagged responses for patterns
- Check system requirements before running intensive tasks
""")

                    with gr.Accordion("🚀 Tab 1: Inference Use Case", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🎯 **Purpose:**
Interactive model testing with real-time inference and response quality monitoring.

### 📝 **How to Use:**
1. **Select Task Type**: Choose from available task types (loaded from the inference dataset)
2. **Configure Inference**: Select optimization level:
   - `Optimized for Speed`: Fast responses (max 512 tokens)
   - `Middle-ground`: Comprehensive answers (max 2048 tokens)
   - `Full Capacity`: Maximum context utilization (max 8192 tokens)
3. **Review System Prompt**: The system prompt auto-loads based on task type (editable)
4. **Chat Interface**:
   - Enter messages in the input field
   - View conversation history in the chat display
   - Responses are generated using the selected model configuration

### 🚩 **Response Flagging Feature:**
- **Select Response**: Choose any AI assistant response from the dropdown
- **Add Reason**: Provide context for why the response needs flagging
- **Flag Response**: Submit the flag (logged to `logs/flagged_responses.log`)
- **Use Cases**: Mark inappropriate, incorrect, or problematic responses for review

### 🎮 **Controls:**
- `Send`: Submit your message for inference
- `Clear History`: Reset conversation (also clears flagging options)
- `Flag Response`: Mark selected response with reason
""")

                    with gr.Accordion("📊 Tab 2: Eval Samples", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🎯 **Purpose:**
Explore evaluation datasets and compare expected vs actual model outputs.

### 📚 **Available Datasets:**
- **Loggenix Synthetic AI Tasks Eval (small)**: Compact evaluation set
- **Loggenix Synthetic AI Tasks Eval v5 (large)**: Extended evaluation dataset (large models)
- **Loggenix Synthetic AI Tasks Eval v6 (large)**: Extended evaluation dataset (small models)
- **Loggenix Synthetic AI Tasks Eval v7 (large)**: Latest evaluation dataset (large models)

### 📝 **How to Use:**
1. **Select Dataset**: Choose from the dropdown (displays dataset info automatically)
2. **Choose Task Type**: Filter by specific task categories within the dataset
3. **Review Task Details**: View comprehensive information including:
   - **Input Model**: Model used for generating the task
   - **Output Model**: Model used for generating responses
   - **Task**: The actual task/instruction given to the model
   - **Input**: Raw input data or context
   - **Expected Response**: Ground truth or reference output
   - **Loggenix Output**: Model's actual response

### 🔍 **Data Fields Explained:**
- **task_type**: Category/classification of the task
- **task**: The instruction or prompt given to the model
- **input**: Additional context or data provided
- **input_model**: Model that created the task
- **output_model**: Model that generated the response
- **expected_response**: Reference/ground truth answer
- **loggenix_output**: Actual model output for comparison
""")

                    with gr.Accordion("👀 Tab 3: View Flagged Responses", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🎯 **Purpose:**
Review and analyze previously flagged responses for quality control.

### ✨ **Features:**
- **Flagged Messages Table**: Shows all flagged responses with:
  - Timestamp of when flagged
  - Reason provided for flagging
  - Preview of flagged message (first 100 characters)
  - Conversation context summary

### 📝 **How to Use:**
1. **Refresh Data**: Click refresh to load latest flagged responses
2. **Select Row**: Click on any row to view full conversation context
3. **Review Context**: Examine the complete conversation leading to the flagged response
4. **Quality Analysis**: Use this data for model improvement and monitoring

### 💡 **Use Cases:**
- Monitor model response quality over time
- Identify patterns in problematic responses
- Gather data for model retraining or fine-tuning
- Quality assurance and compliance checking
""")

                    with gr.Accordion("📈 Tab 4: Model Eval Results", open=False, elem_classes="panel"):
                        gr.Markdown("""
### ✅ **Status:** Live & Fully Functional

### 🎯 **Current Features:**
- **Comprehensive Benchmark Analysis**: Compare Loggenix MoE 330M against multiple baseline models
- **Interactive Visualizations**: Plotly charts with hover details, zoom, and pan capabilities
- **Static Chart Options**: High-quality Matplotlib plots for presentations and reports
- **Dual Evaluation Modes**:
  - Zero-shot performance analysis
  - Few-shot learning capabilities assessment
- **5 Standard Benchmarks**: MMLU, HellaSwag, PIQA, ARC, and WinoGrande
- **Parameter Efficiency Analysis**: Performance-to-size ratio visualizations
- **Competitive Analysis**: Automated summaries highlighting model strengths
- **Educational Content**: Detailed explanations of each benchmark's purpose

### 📊 **Benchmark Coverage:**
- **MMLU**: 57 academic subjects knowledge assessment
- **HellaSwag**: Common sense reasoning evaluation
- **PIQA**: Physical interaction understanding
- **ARC**: Grade-school scientific reasoning
- **WinoGrande**: Commonsense pronoun resolution

### 🏆 **Key Insights Available:**
- Model performance comparison against 4-5 baseline models per evaluation type
- Parameter efficiency rankings and analysis
- Identification of competitive benchmark areas
- Performance scaling analysis relative to model size
- Architecture efficiency demonstrations (MoE vs traditional models)

### 🎨 **Visualization Options:**
- Interactive bar charts for each benchmark
- Parameter efficiency scatter plots with log scaling
- Color-coded performance comparisons (outperformed models highlighted)
- Responsive layouts optimized for different screen sizes
- Export-ready static plots for research and presentations
""")

                    with gr.Accordion("🛠️ Technical Specifications", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🤖 **Model Details:**
**Primary Model**: `kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1`
- **Architecture**: Mixture of Experts (MoE)
- **Total Parameters**: 330M (16 experts, 2 active)
- **Active Parameters**: 185M
- **Context Length**: 8192 tokens
- **Precision**: FP16
- **Flash Attention**: Supported
- **Tool Calling**: Enabled

### ⚙️ **Performance Configurations:**
```python
configs = {
    "Optimized for Speed": {
        "max_new_tokens": 512,
        "temperature": 0.7,
        "do_sample": True,
        "use_cache": False
    },
    "Middle-ground": {
        "max_new_tokens": 2048,
        "temperature": 0.8,
        "do_sample": True,
        "use_cache": False
    },
    "Full Capacity": {
        "max_new_tokens": 8192,
        "temperature": 0.9,
        "do_sample": True,
        "use_cache": False
    }
}
```

### 💻 **System Requirements:**
- **GPU**: GTX 1050 Ti or RTX 3060 (2-4GB VRAM sufficient)
- **Memory**: 8GB+ system RAM
- **VRAM**: 2-4GB GPU memory (450MB for 8-bit quantized)
- **Python**: 3.8+
- **Dependencies**: transformers, torch, gradio, datasets, plotly
""")

                    with gr.Accordion("⚡ Model Deployment Guide", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🚀 **Two Deployment Methods Available**

This application supports two different model backends for maximum flexibility:

---

## 🐳 **Method 1: Ollama (GGUF Quantized) - Recommended**

### 📥 **Installation & Setup:**
```bash
# Install Ollama
curl -fsSL https://ollama.com/install.sh | sh

# Start Ollama service
ollama serve

# Pull the quantized model (Q8_0 format)
ollama pull hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1-Q8_0-GGUF:Q8_0
```

### ✅ **Advantages:**
- **Lower VRAM Usage**: ~1-2GB VRAM on GPU
- **Faster Loading**: Pre-optimized GGUF format
- **CPU Fallback**: Can run on CPU if needed
- **Easy Management**: Simple model pulling and switching
- **Built-in API**: REST API included out of the box

### ⚙️ **Configuration:**
```python
OLLAMA_BASE_URL = "http://localhost:11434"
MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1-Q8_0-GGUF:Q8_0"
```

### 🛠️ **Tool Calling Support:**
Both formats supported:
```python
# Legacy format
[TOOL_CALL:calculate_numbers(operation="add", num1="10", num2="5")]

# JSON format
{"name": "calculate_numbers", "parameters": {"operation": "add", "num1": "10", "num2": "5"}}
```

---

## 🤗 **Method 2: HuggingFace Transformers (8-bit Quantized)**

### 📥 **Installation & Setup:**
```bash
# Install dependencies
pip install transformers torch bitsandbytes accelerate

# Model will auto-download on first use
```

### ⚙️ **Quantization Configuration:**
```python
# 8-bit quantization (recommended)
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
)

# 4-bit quantization (maximum memory savings)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)
```

### ✅ **Advantages:**
- **Native Integration**: Direct HuggingFace ecosystem access
- **Fine-grained Control**: Detailed generation parameters
- **Memory Optimization**: Gradient checkpointing, flash attention
- **Advanced Features**: Custom sampling, beam search, etc.
- **Development Flexibility**: Easy model modifications

### 🧠 **Memory Optimizations:**
```python
# Performance optimizations enabled
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Flash attention support
attn_implementation="flash_attention_2"

# Gradient checkpointing
model.gradient_checkpointing_enable()
```

---

## 📊 **Performance Comparison**

| Feature | Ollama (GGUF) | Transformers (8-bit) |
|---------|---------------|----------------------|
| **VRAM Usage** | ~450MB | ~1.2GB |
| **Load Time** | Fast (< 30s) | Fast (~30s) |
| **Inference Speed (GPU)** | 1-3 seconds | 1-3 seconds |
| **Inference Speed (CPU)** | 10-30 seconds | N/A |
| **Memory Efficiency** | Excellent | Excellent |
| **Setup Complexity** | Simple | Moderate |
| **Customization** | Limited | High |

---

## 🔄 **Switching Between Methods**

### **Application Configuration:**
The application automatically detects which backend to use:
```python
# Check for Ollama first
if check_ollama_connection():
    use_ollama_handler()
else:
    use_transformers_handler()
```

### **Environment Variables:**
```bash
# Force specific backend
export MODEL_BACKEND="ollama"  # or "transformers"
export OLLAMA_HOST="localhost:11434"
export MODEL_CACHE_DIR="/path/to/cache"
```

---

## 🏗️ **Architecture Details**

### **Model Handler Structure:**
```
model_handlers/
├── model_handler_ollama.py        # Ollama GGUF implementation
├── model_handler_transformers.py  # HuggingFace implementation
└── base_handler.py                # Common interface
```

### **Common Tool System:**
Both handlers share the same tool execution system:
- Mathematical operations (`calculate_numbers`)
- Extensible tool registry
- Error handling and validation
- Response processing and formatting

### **API Compatibility:**
Both implementations expose the same interface:
```python
generate_response(
    system_prompt: str,
    user_input: str,
    config_name: str = "Middle-ground"
) -> str
```

---

## 💡 **Recommendations**

### **Use Ollama When:**
- Limited VRAM (< 2GB)
- CPU-only system available
- Quick deployment needed
- Running on older hardware
- Simple integration requirements

### **Use Transformers When:**
- GPU available (even 2GB is plenty)
- Maximum performance needed
- Custom model modifications required
- Advanced generation features needed
- Development/research environment

### **Hardware Recommendations:**
- **Ultra Budget**: Any 2GB GPU from 2016+ (GTX 1050, RX 460)
- **CPU-only**: Dual-core processor + 16GB RAM (Ollama)
- **Entry Level**: GTX 1050 Ti (4GB) - massive overkill
- **Mid-range**: RTX 3060 (12GB) - uses only ~10% of available VRAM

### **Real-world Performance (Tested on RTX 3060):**
- **8-bit Quantized**: Only 450MB VRAM usage
- **FP16 Standard**: 1.2GB VRAM usage
- **Response Time**: 1-3 seconds consistently
- **CPU Inference**: 10-30 seconds on 2-core system with 16GB RAM
- **Load Time**: Under 30 seconds for both methods
- **Minimum Hardware**: Any 2GB GPU will work comfortably

### **Key Performance Facts:**
- Model runs on hardware far less powerful than originally specified
- RTX 3060 uses less than 10% of its 12GB VRAM capacity
- Older GPUs like GTX 1050 Ti (4GB) provide excellent performance
- CPU-only inference is viable for non-real-time applications
- No need for high-end hardware like RTX 4090
""")

                    with gr.Accordion("📁 Logging & Data Storage", open=False, elem_classes="panel"):
                        gr.Markdown("""
### 🚩 **Flagged Responses:**
- **Location**: `logs/flagged_responses.log`
- **Format**: JSON entries with timestamp, reason, message, and context
- **Usage**: Quality control, model improvement, compliance tracking

### 📁 **Data Sources:**
- **Inference Dataset**: `kshitijthakkar/synthetic-ai-tasks-eval-v5`
- **Evaluation Datasets**: Multiple versions of synthetic AI tasks with expected outputs
- **Model Outputs**: Generated in real-time during inference
""")

                    with gr.Accordion("🆘 Troubleshooting", open=False, elem_classes="panel"):
                        gr.Markdown("""
### ⚠️ **Common Issues:**
- **Dataset Loading**: Ensure internet connection for HuggingFace dataset access
- **Model Inference**: Check GPU memory availability
- **Flagging**: Verify logs directory permissions
- **Performance**: Adjust inference configuration for your hardware

### 🛠️ **Support:**
- Check console output for detailed error messages
- Verify model handler configuration (`model_handler_ollama.py`)
- Ensure all dependencies are installed
- Monitor GPU memory usage during inference
- Check file permissions for logging directories
""")

                with gr.Group():
                    gr.Markdown("""
---
**Developed by**: Kshitij Thakkar
**Version**: AI Tasks Evaluation Suite v1.0
**Last Updated**: September 2025
""")

            # TAB 1: INFERENCE USE CASE WITH INTEGRATED FLAGGING
            with gr.Tab("🚀 Inference Use Case"):
                gr.Markdown("## Model Inference Testing with Response Flagging")

                with gr.Row():
                    with gr.Column(scale=1):
                        # Task type dropdown
                        task_type_dropdown = gr.Dropdown(
                            choices=get_task_types(),
                            value=get_task_types()[0] if get_task_types() else None,
                            label="Task Type",
                            info="Select task type to load system prompt"
                        )

                        # Inference configuration
                        inference_config = gr.Dropdown(
                            choices=list(get_inference_configs().keys()),
                            value="Optimized for Speed",
                            label="Inference Configuration",
                            info="Select inference optimization level"
                        )

                    with gr.Column(scale=2):
                        # System prompt (editable)
                        system_prompt = gr.Textbox(
                            label="System Prompt (Editable)",
                            lines=6,
                            max_lines=10,
                            placeholder="Select a task type to load system prompt...",
                            interactive=True
                        )

                # Chat interface section
                gr.Markdown("### 💬 Chat Interface")

                with gr.Row():
                    with gr.Column(scale=2):
                        # Chat display (replacing the old textbox); history entries are
                        # (speaker, text) tuples shared with the flagging helpers below.
                        chat_display = gr.Chatbot(label="Conversation History", height=400)
                        chat_history_state = gr.State([])

                        # Chat input
                        with gr.Row():
                            chat_input = gr.Textbox(
                                placeholder="Enter your message here...",
                                label="Your Message",
                                scale=4
                            )
                            send_btn = gr.Button("Send", variant="primary", scale=1)

                        with gr.Row():
                            clear_chat_btn = gr.Button("🗑️ Clear History", variant="secondary")

                    # Flagging section
                    with gr.Column(scale=1):
                        gr.Markdown("### 🚩 Flag Response")

                        flagged_message_index = gr.Dropdown(
                            label="Select a response to flag",
                            choices=["No responses available"],
                            value="No responses available",
                            interactive=True
                        )

                        selected_message_display = gr.Textbox(
                            label="Selected Response",
                            interactive=False,
                            lines=4,
                            max_lines=6
                        )

                        flag_reason = gr.Textbox(
                            placeholder="Enter reason for flagging...",
                            label="Reason for Flagging"
                        )

                        flag_btn = gr.Button("🚩 Flag Response", variant="stop")
                        flag_output = gr.Textbox(label="Flagging Status", visible=True, lines=2)

                # Event handlers for Tab 1
                task_type_dropdown.change(
                    fn=get_task_by_type,
                    inputs=[task_type_dropdown],
                    outputs=[system_prompt]
                )

                # Chat functionality
                send_btn.click(
                    chat_interface_with_inference,
                    inputs=[chat_input, chat_history_state, system_prompt, inference_config],
                    outputs=[chat_display, chat_input]
                ).then(
                    lambda x: x,  # Update state
                    inputs=[chat_display],
                    outputs=[chat_history_state]
                ).then(
                    get_assistant_responses,
                    inputs=[chat_history_state],
                    outputs=[flagged_message_index]
                )

                # Enter key support for chat input
                chat_input.submit(
                    chat_interface_with_inference,
                    inputs=[chat_input, chat_history_state, system_prompt, inference_config],
                    outputs=[chat_display, chat_input]
                ).then(
                    lambda x: x,  # Update state
                    inputs=[chat_display],
                    outputs=[chat_history_state]
                ).then(
                    get_assistant_responses,
                    inputs=[chat_history_state],
                    outputs=[flagged_message_index]
                )

                clear_chat_btn.click(
                    clear_inference_history,
                    outputs=[chat_display, flagged_message_index]
                ).then(
                    lambda: [],
                    outputs=[chat_history_state]
                )

                # Flagging functionality
                flagged_message_index.change(
                    display_selected_message,
                    inputs=[flagged_message_index, chat_history_state],
                    outputs=[selected_message_display]
                )

                flag_btn.click(
                    flag_response,
                    inputs=[chat_history_state, flagged_message_index, flag_reason],
                    outputs=[flag_output]
                )

            # TAB 2: EVAL SAMPLES
            with gr.Tab("📊 Eval Samples"):
                gr.Markdown("## Dataset Evaluation Samples")
                gr.Markdown("Select dataset and task type to view detailed information")

                with gr.Row():
                    with gr.Column(scale=1):
                        eval_dataset_dropdown = gr.Dropdown(
                            choices=list(EVAL_DATASETS.keys()),
                            value=list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else None,
                            label="Select Dataset",
                            info="Choose evaluation dataset to view"
                        )

                        eval_task_type_dropdown = gr.Dropdown(
                            choices=[],
                            label="Select Task Type",
                            info="Choose task type from selected dataset",
                            allow_custom_value=True
                        )

                    with gr.Column(scale=1):
                        eval_dataset_info = gr.Markdown(
                            get_eval_dataset_info(list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else "")
                        )

                # Task details section
                gr.Markdown("### Task Details")

                with gr.Row():
                    input_model_field = gr.Textbox(
                        label="input_model",
                        lines=1,
                        interactive=False
                    )
                    output_model_field = gr.Textbox(
                        label="output_model",
                        lines=1,
                        interactive=False
                    )

                with gr.Row():
                    task_field = gr.Textbox(
                        label="Task",
                        lines=2,
                        max_lines=5,
                        interactive=False
                    )

                with gr.Row():
                    input_field = gr.Textbox(
                        label="input",
                        lines=12,
                        max_lines=20,
                        interactive=False
                    )

                # Large text fields for outputs side by side
                gr.Markdown("### Expected vs Actual Response Comparison")

                with gr.Row():
                    expected_response_field = gr.Textbox(
                        label="Expected Response",
                        lines=30,
                        max_lines=40,
                        interactive=False
                    )
                    loggenix_output_field = gr.Textbox(
                        label="Loggenix Output",
                        lines=30,
                        max_lines=40,
                        interactive=False
                    )

                # Event handlers for Tab 2
                # Use a named function instead of a lambda for clarity
                def update_eval_components(dataset_name):
                    info = get_eval_dataset_info(dataset_name)
                    task_types = get_task_types_for_eval(dataset_name)
                    return info, gr.update(
                        choices=task_types,
                        value=task_types[0] if task_types else "No task types available"
                    )

                eval_dataset_dropdown.change(
                    fn=update_eval_components,
                    inputs=[eval_dataset_dropdown],
                    outputs=[eval_dataset_info, eval_task_type_dropdown]
                )

                eval_task_type_dropdown.change(
                    fn=get_selected_row_data_by_type,
                    inputs=[eval_dataset_dropdown, eval_task_type_dropdown],
                    outputs=[input_model_field, output_model_field, task_field, input_field,
                             expected_response_field, loggenix_output_field]
                )

                # NOTE: get_tasks_by_type_eval is not wired to a dropdown here;
                # only the first matching row for each task type is displayed.

            # TAB 3: VIEW FLAGGED RESPONSES (RENAMED FROM TAB 4)
            with gr.Tab("👀 View Flagged Responses"):
                gr.Markdown("## Review Flagged Responses")

                with gr.Row():
                    with gr.Column():
                        flagged_messages_display = gr.Dataframe(
                            headers=["Timestamp", "Flag Reason", "Flagged Message", "Conversation Context"],
                            interactive=False,
                            max_height=400
                        )
                        refresh_btn = gr.Button("🔄 Refresh", variant="primary")

                    with gr.Column():
                        conversation_context_display = gr.Chatbot(
                            label="Conversation Context",
                            height=400
                        )

                # Event handlers for Tab 3
                flagged_messages_display.select(
                    handle_row_select,
                    outputs=[conversation_context_display]
                )

                refresh_btn.click(
                    read_flagged_messages,
                    outputs=[flagged_messages_display]
                )

            # TAB 4: MODEL EVAL RESULTS (UPDATED WITH BENCHMARK FUNCTIONALITY)
            with gr.Tab("📈 Model Eval Results"):
                gr.Markdown("## Model Evaluation Results")
                gr.Markdown("### 🚀 Loggenix MoE 330M Benchmark Analysis")
                gr.Markdown(
                    "Compare Loggenix MoE 330M performance against other language models across common "
                    "benchmarks including MMLU, HellaSwag, PIQA, ARC, and WinoGrande.")

                # Create the benchmark plotter instance
                plotter = BenchmarkPlotter()

                def plot_comparison(shot_type, plot_type):
                    if plot_type == "Interactive (Plotly)":
                        fig = plotter.create_plotly_interactive(shot_type.lower())
                    else:
                        fig = plotter.create_matplotlib_comparison(shot_type.lower())
                    summary = plotter.create_competitive_analysis_summary(shot_type.lower())
                    return fig, summary

                # Control panel for benchmark comparison
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("#### 🎛️ Evaluation Controls")

                        shot_type = gr.Radio(
                            choices=["Zero", "Few"],
                            value="Few",
                            label="Shot Type",
                            info="Select zero-shot or few-shot evaluation"
                        )

                        plot_type = gr.Radio(
                            choices=["Interactive (Plotly)", "Static (Matplotlib)"],
                            value="Interactive (Plotly)",
                            label="Plot Type",
                            info="Choose visualization style"
                        )

                        plot_button = gr.Button("🔄 Generate Comparison", variant="primary", size="lg")

                        # Quick stats
                        gr.Markdown("#### 📊 Quick Stats")
                        gr.Markdown("""
- **Model Size**: 330M parameters
- **Architecture**: Mixture of Experts (MoE)
- **Benchmarks**: 5 standard evaluations
- **Comparison**: Against 4-5 other models per evaluation type
""")

                    with gr.Column(scale=2):
                        gr.Markdown("#### 🏆 Performance Overview")
                        performance_summary = gr.Markdown("""
**Few-Shot Highlights:**
- PIQA: 80.0% (competitive with much larger models)
- WinoGrande: 50.0% (solid common sense reasoning)
- Parameter efficient compared to larger alternatives

**Zero-Shot Highlights:**
- PIQA: 55.0% (strong physical reasoning)
- Competitive performance across multiple benchmarks
- Efficient architecture for the parameter count
""")

                # Main visualization area
                with gr.Row():
                    with gr.Column(scale=3):
                        gr.Markdown("#### 📈 Benchmark Comparison Charts")
                        plot_output = gr.Plot(label="Performance Comparison")

                    with gr.Column(scale=1):
                        gr.Markdown("#### 📋 Analysis Summary")
                        summary_output = gr.Markdown(
                            value="Select evaluation parameters and click 'Generate Comparison' to see detailed analysis.")

                # Benchmark details section
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("#### 🎯 Benchmark Descriptions")
                        gr.Markdown("""
**MMLU (Massive Multitask Language Understanding)**
- Tests knowledge across 57 academic subjects
- Measures broad knowledge and reasoning ability

**HellaSwag**
- Tests common sense reasoning about everyday situations
- Measures ability to predict plausible continuations

**PIQA (Physical Interaction QA)**
- Tests physical common sense reasoning
- Measures understanding of how the physical world works
""")

                    with gr.Column():
                        gr.Markdown("#### 🧠 Additional Benchmarks")
                        gr.Markdown("""
**ARC (AI2 Reasoning Challenge)**
- Tests grade-school level scientific reasoning
- Measures ability to answer science exam questions

**WinoGrande**
- Tests commonsense reasoning through pronoun resolution
- Measures understanding of implicit relationships

**Parameter Efficiency**
- Compares performance relative to model size
- Shows value proposition of architectural choices
""")

                # Examples and presets
                with gr.Row():
                    gr.Markdown("#### 🎲 Quick Comparisons")

                    # Examples for quick access
                    gr.Examples(
                        examples=[
                            ["Few", "Interactive (Plotly)"],
                            ["Zero", "Interactive (Plotly)"],
                            ["Few", "Static (Matplotlib)"],
                            ["Zero", "Static (Matplotlib)"]
                        ],
                        inputs=[shot_type, plot_type],
                        label="Try these preset configurations:"
                    )

                # Event handlers
                plot_button.click(
                    fn=plot_comparison,
                    inputs=[shot_type, plot_type],
                    outputs=[plot_output, summary_output]
                )

                # Auto-generate on load with few-shot interactive plot
                gr.Markdown("---")
                gr.Markdown(
                    "💡 **Tip**: The interactive plots allow you to hover for detailed values and zoom into "
                    "specific benchmark areas.")
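                # Sketch (not wired up): the "auto-generate on load" idea above could be
                # implemented with a load event in this Blocks context, assuming the
                # default Few-shot / Plotly selection, e.g.:
                #
                #     demo.load(
                #         fn=lambda: plot_comparison("Few", "Interactive (Plotly)"),
                #         outputs=[plot_output, summary_output],
                #     )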
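        # Each line of logs/flagged_responses.log is one JSON object written by
        # flag_response, e.g.
        #   {"timestamp": "...", "flag_reason": "...", "flagged_message": "...",
        #    "conversation_context": [["You", "..."], ["AI Assistant", "..."]]}
        # read_flagged_messages() parses these lines into the table populated below.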
        # Load initial data
        demo.load(
            fn=read_flagged_messages,
            outputs=[flagged_messages_display]
        )

    return demo


# Launch the application
if __name__ == "__main__":
    print("Starting AI Tasks Evaluation Suite...")
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True
        # ,mcp_server=True
    )
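
# ---------------------------------------------------------------------------
# Reference sketch (not used by the app): a minimal Ollama-backed handler with
# the same interface that model_handler_ollama is expected to expose, assuming
# the OLLAMA_BASE_URL / MODEL_NAME values documented in the About tab and
# Ollama's standard /api/chat endpoint. Names are deliberately different so
# nothing here shadows the real imports at the top of this file.
# ---------------------------------------------------------------------------
def _sketch_get_inference_configs():
    """Mirror of the three documented configurations (token budget + temperature)."""
    return {
        "Optimized for Speed": {"max_new_tokens": 512, "temperature": 0.7},
        "Middle-ground": {"max_new_tokens": 2048, "temperature": 0.8},
        "Full Capacity": {"max_new_tokens": 8192, "temperature": 0.9},
    }


def _sketch_generate_response(system_prompt, user_input, config_name="Middle-ground"):
    """Send one chat turn to a local Ollama server and return the reply text."""
    import requests  # local import so the sketch stays self-contained

    base_url = "http://localhost:11434"
    model_name = ("hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-"
                  "v7-sft-v1-Q8_0-GGUF:Q8_0")
    configs = _sketch_get_inference_configs()
    config = configs.get(config_name, configs["Optimized for Speed"])

    payload = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_input},
        ],
        "stream": False,
        "options": {
            "temperature": config["temperature"],
            "num_predict": config["max_new_tokens"],
        },
    }
    # POST /api/chat returns {"message": {"role": "assistant", "content": ...}, ...}
    resp = requests.post(f"{base_url}/api/chat", json=payload, timeout=300)
    resp.raise_for_status()
    return resp.json()["message"]["content"]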