import gradio as gr
import pandas as pd
from datasets import load_dataset
import plotly.graph_objects as go
import datetime
import json
import random
import os
from model_handler import generate_response, get_inference_configs
import torch

# Configuration for datasets
DATASET_CONFIGS = {
    'Loggenix Synthetic AI Tasks Eval (with outputs)': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval-with-outputs',
        'split': 'train'
    },
    'Loggenix Synthetic AI Tasks Eval (with outputs) v5': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v5-with-outputs',
        'split': 'train'
    }
}


# Load main dataset for inference tab
def load_inference_dataset():
    """Load the main dataset for inference use case"""
    try:
        print("Loading synthetic-ai-tasks-eval-v5 dataset...")
        dataset = load_dataset(
            'kshitijthakkar/synthetic-ai-tasks-eval-v5',
            split='train',
            trust_remote_code=True
        )
        df = dataset.to_pandas()
        print(f"✓ Successfully loaded: {len(df)} rows, {len(df.columns)} columns")
        return df
    except Exception as e:
        print(f"✗ Error loading dataset: {str(e)}")
        return pd.DataFrame({'Error': [f'Failed to load: {str(e)}']})


# Load dataset for eval samples tab
def load_eval_datasets():
    """Load all datasets for evaluation samples"""
    datasets = {}
    for display_name, config in DATASET_CONFIGS.items():
        try:
            print(f"Loading {display_name}...")
            dataset = load_dataset(
                config['repo_id'],
                split=config['split'],
                trust_remote_code=True
            )
            df = dataset.to_pandas()
            datasets[display_name] = df
            print(f"✓ Successfully loaded {display_name}: {len(df)} rows")
        except Exception as e:
            print(f"✗ Error loading {display_name}: {str(e)}")
            datasets[display_name] = pd.DataFrame({
                'Error': [f'Failed to load: {str(e)}'],
                'Dataset': [config['repo_id']]
            })
    return datasets


# Load datasets
INFERENCE_DATASET = load_inference_dataset()
EVAL_DATASETS = load_eval_datasets()


# ===== TAB 1: INFERENCE USE CASE =====
def get_task_types():
    """Get unique task types from inference dataset"""
    if 'task_type' in INFERENCE_DATASET.columns:
        task_types = INFERENCE_DATASET['task_type'].unique().tolist()
        return [str(t) for t in task_types if pd.notna(t)]
    return ["No task types available"]


def get_task_by_type(task_type):
    """Get task content by task type"""
    if 'task_type' in INFERENCE_DATASET.columns and 'task' in INFERENCE_DATASET.columns:
        filtered = INFERENCE_DATASET[INFERENCE_DATASET['task_type'] == task_type]
        if len(filtered) > 0:
            return str(filtered.iloc[0]['task'])
    return "No task found for this type"


def run_inference(task_type, system_prompt, user_input, inference_config):
    """Run model inference"""
    if not user_input.strip():
        return "Please enter a user input"
    if not system_prompt.strip():
        return "Please select a task type to load system prompt"

    try:
        # Get inference configuration
        configs = get_inference_configs()
        config = configs.get(inference_config, configs["Optimized for Speed"])

        # Run inference
        response = generate_response(
            system_prompt=system_prompt,
            user_input=user_input,
            config_name=inference_config
        )
        return response
    except Exception as e:
        return f"Error during inference: {str(e)}"
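
# NOTE (illustrative sketch, not part of the original app): `model_handler` is a local
# module that is not shown here. Based on how it is called above, it is assumed to
# expose roughly the interface below; the exact preset names and generation settings
# live in model_handler.py and may differ.
#
#     def get_inference_configs() -> dict:
#         """Return named generation presets, e.g.
#         {"Optimized for Speed": {"max_new_tokens": 512, ...},
#          "Balanced": {"max_new_tokens": 2048, ...}}"""
#
#     def generate_response(system_prompt: str, user_input: str, config_name: str) -> str:
#         """Build the chat prompt, run generation with the selected preset,
#         and return the decoded completion as a string."""
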

# ===== TAB 2: EVAL SAMPLES =====
def update_eval_table(dataset_name):
    """Update eval table based on selected dataset"""
    if dataset_name in EVAL_DATASETS:
        return EVAL_DATASETS[dataset_name].head(100)
    return pd.DataFrame()


def get_eval_dataset_info(dataset_name):
    """Get info about selected eval dataset"""
    if dataset_name in EVAL_DATASETS:
        df = EVAL_DATASETS[dataset_name]
        return f"""
**Dataset**: {dataset_name}
- **Rows**: {len(df):,}
- **Columns**: {len(df.columns)}
- **Column Names**: {', '.join(df.columns.tolist())}
"""
    return "No dataset selected"


# ===== TAB 3 & 4: FLAGGING FUNCTIONALITY =====
def generate_chart():
    """Generate a sample Plotly chart"""
    x = list(range(10))
    y = [random.randint(1, 100) for _ in x]
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=x, y=y, mode="lines+markers", name="Random Data"))
    fig.update_layout(title="Sample Chart", xaxis_title="X-axis", yaxis_title="Y-axis")
    return fig.to_html(full_html=False)


def chat_interface(prompt, history):
    """Handle chat interface with history"""
    if not prompt.strip():
        return history, ""

    history.append(("You", prompt))

    try:
        if "chart" in prompt.lower() or "graph" in prompt.lower():
            response = generate_chart()
        else:
            response = f"This is a demo response to: {prompt}"

        if isinstance(response, str):
            formatted_response = f"**AI Assistant:**\n{response}"
            history.append(("AI Assistant", formatted_response))
        else:
            history.append(("AI Assistant", response))
    except Exception as e:
        error_msg = f"**AI Assistant:**\nSorry, an error occurred: {str(e)}"
        history.append(("AI Assistant", error_msg))

    return history, ""


def flag_response(history, flagged_message, flag_reason):
    """Flag a response"""
    if not flagged_message or flagged_message == "No responses available":
        return "Invalid message selection."

    try:
        flagged_index = int(flagged_message.split()[1][:-1])
        if flagged_index >= len(history) or history[flagged_index][0] != "AI Assistant":
            return "You can only flag assistant responses."

        flagged_message_content = history[flagged_index][1]
        log_entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "flag_reason": str(flag_reason),
            "flagged_message": str(flagged_message_content),
            "conversation_context": history,
        }

        os.makedirs("logs", exist_ok=True)
        with open("logs/flagged_responses.log", "a") as f:
            f.write(json.dumps(log_entry) + "\n")

        return "Response flagged successfully"
    except Exception as e:
        return f"Error flagging response: {str(e)}"


def get_assistant_responses(history):
    """Get dropdown options for assistant responses"""
    responses = [
        f"Response {i}: {str(msg[1])[:50]}..."
        for i, msg in enumerate(history)
        if msg[0] == "AI Assistant"
    ]
    if not responses:
        responses = ["No responses available"]
    return gr.update(choices=responses, value=responses[0])


def display_selected_message(selected_index, history):
    """Display the selected flagged message"""
    if selected_index == "No responses available":
        return "No responses available"
    try:
        flagged_index = int(selected_index.split()[1][:-1])
        if flagged_index < len(history) and history[flagged_index][0] == "AI Assistant":
            return history[flagged_index][1]
        else:
            return "Invalid selection."
    except Exception as e:
        return f"Error: {str(e)}"
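
# NOTE (illustrative, added for clarity): flag_response() above appends one JSON object
# per line to logs/flagged_responses.log, and read_flagged_messages() /
# handle_row_select() below rely on those same keys. A hypothetical entry looks like:
#
#     {"timestamp": "<ISO-8601 timestamp>",
#      "flag_reason": "incorrect answer",
#      "flagged_message": "**AI Assistant:** ...",
#      "conversation_context": [["You", "..."], ["AI Assistant", "..."]]}
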

def read_flagged_messages():
    """Read flagged messages from log file"""
    try:
        if not os.path.exists("logs/flagged_responses.log"):
            return pd.DataFrame()

        with open("logs/flagged_responses.log", "r") as f:
            flagged_messages = f.readlines()

        if not flagged_messages:
            return pd.DataFrame()

        table_data = []
        for entry in flagged_messages:
            data = json.loads(entry)
            table_data.append({
                "Timestamp": data.get("timestamp", "N/A"),
                "Flag Reason": data.get("flag_reason", "N/A"),
                "Flagged Message": data.get("flagged_message", "N/A")[:100] + "...",
                "Conversation Context": str(len(data.get("conversation_context", []))) + " messages"
            })

        return pd.DataFrame(table_data)
    except Exception as e:
        return pd.DataFrame({"Error": [f"Error reading flagged messages: {str(e)}"]})


def handle_row_select(evt: gr.SelectData):
    """Handle row selection in flagged messages table"""
    try:
        if not os.path.exists("logs/flagged_responses.log"):
            return []

        with open("logs/flagged_responses.log", "r") as f:
            flagged_messages_log = f.readlines()

        if evt.index[0] < len(flagged_messages_log):
            selected_entry = json.loads(flagged_messages_log[evt.index[0]])
            conversation_context = selected_entry.get("conversation_context", [])
            return conversation_context
        return []
    except Exception as e:
        return [("System", f"Error loading conversation: {str(e)}")]


def clear_history():
    """Clear chat history"""
    return [], gr.update(choices=["No responses available"], value="No responses available")


# ===== MAIN INTERFACE =====
def create_interface():
    with gr.Blocks(title="AI Tasks Evaluation Suite", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🤖 AI Tasks Evaluation Suite")
        gr.Markdown("Comprehensive platform for AI model evaluation and testing")

        with gr.Tabs():
            # TAB 1: INFERENCE USE CASE
            with gr.Tab("🚀 Inference Use Case"):
                gr.Markdown("## Model Inference Testing")

                with gr.Row():
                    with gr.Column(scale=1):
                        # Task type dropdown
                        task_type_dropdown = gr.Dropdown(
                            choices=get_task_types(),
                            value=get_task_types()[0] if get_task_types() else None,
                            label="Task Type",
                            info="Select task type to load system prompt"
                        )

                        # Inference configuration
                        inference_config = gr.Dropdown(
                            choices=list(get_inference_configs().keys()),
                            value="Optimized for Speed",
                            label="Inference Configuration",
                            info="Select inference optimization level"
                        )

                    with gr.Column(scale=2):
                        # System prompt (editable)
                        system_prompt = gr.Textbox(
                            label="System Prompt (Editable)",
                            lines=6,
                            max_lines=10,
                            placeholder="Select a task type to load system prompt...",
                            interactive=True
                        )

                with gr.Row():
                    with gr.Column():
                        # User input
                        user_input = gr.Textbox(
                            label="User Input",
                            lines=4,
                            placeholder="Enter your input here...",
                            interactive=True
                        )

                    with gr.Column():
                        # Model response
                        model_response = gr.Textbox(
                            label="Model Response",
                            lines=8,
                            interactive=False
                        )

                with gr.Row():
                    submit_btn = gr.Button("🔥 Run Inference", variant="primary", size="lg")
                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")

                # Event handlers for Tab 1
                task_type_dropdown.change(
                    fn=get_task_by_type,
                    inputs=[task_type_dropdown],
                    outputs=[system_prompt]
                )

                submit_btn.click(
                    fn=run_inference,
                    inputs=[task_type_dropdown, system_prompt, user_input, inference_config],
                    outputs=[model_response]
                )

                clear_btn.click(
                    fn=lambda: ("", "", ""),
                    outputs=[system_prompt, user_input, model_response]
                )
            # TAB 2: EVAL SAMPLES
            with gr.Tab("📊 Eval Samples"):
                gr.Markdown("## Dataset Evaluation Samples")

                with gr.Row():
                    with gr.Column(scale=1):
                        eval_dataset_dropdown = gr.Dropdown(
                            choices=list(EVAL_DATASETS.keys()),
                            value=list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else None,
                            label="Select Dataset",
                            info="Choose evaluation dataset to view"
                        )

                        eval_dataset_info = gr.Markdown(
                            get_eval_dataset_info(list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else "")
                        )

                with gr.Row():
                    eval_table = gr.Dataframe(
                        value=update_eval_table(list(EVAL_DATASETS.keys())[0]) if EVAL_DATASETS else pd.DataFrame(),
                        label="Dataset Table",
                        max_height=800,
                        min_width=800,
                        interactive=False,
                        wrap=True,
                        show_fullscreen_button=True,
                        show_copy_button=True,
                        show_row_numbers=True,
                        show_search="filter",
                    )

                # Event handlers for Tab 2
                eval_dataset_dropdown.change(
                    fn=lambda x: (update_eval_table(x), get_eval_dataset_info(x)),
                    inputs=[eval_dataset_dropdown],
                    outputs=[eval_table, eval_dataset_info]
                )

            # TAB 3: FLAG RESPONSES
            with gr.Tab("🚩 Flag Responses"):
                gr.Markdown("## Chat Interface with Response Flagging")

                with gr.Row():
                    with gr.Column():
                        chat_input = gr.Textbox(placeholder="Ask something...", label="Your Message")
                        with gr.Row():
                            chat_submit_btn = gr.Button("Send", variant="primary")
                            chat_clear_btn = gr.Button("Clear History", variant="secondary")

                    with gr.Column():
                        chat_display = gr.Chatbot(label="Chat History", height=400)
                        chat_history_state = gr.State([])

                gr.Markdown("### Flag Response")
                with gr.Row():
                    with gr.Column():
                        flagged_message_index = gr.Dropdown(
                            label="Select a response to flag",
                            choices=["No responses available"],
                            value="No responses available",
                            interactive=True
                        )
                        selected_message_display = gr.Textbox(
                            label="Selected Response",
                            interactive=False,
                            lines=4
                        )

                    with gr.Column():
                        flag_reason = gr.Textbox(
                            placeholder="Enter reason for flagging...",
                            label="Reason for Flagging"
                        )
                        flag_btn = gr.Button("Flag Response", variant="stop")
                        flag_output = gr.Textbox(label="Flagging Feedback", visible=True)

                # Event handlers for Tab 3
                chat_submit_btn.click(
                    chat_interface,
                    inputs=[chat_input, chat_history_state],
                    outputs=[chat_display, chat_input]
                ).then(
                    get_assistant_responses,
                    inputs=[chat_history_state],
                    outputs=[flagged_message_index]
                )

                chat_clear_btn.click(
                    clear_history,
                    outputs=[chat_display, flagged_message_index]
                )

                flagged_message_index.change(
                    display_selected_message,
                    inputs=[flagged_message_index, chat_history_state],
                    outputs=[selected_message_display]
                )

                flag_btn.click(
                    flag_response,
                    inputs=[chat_history_state, flagged_message_index, flag_reason],
                    outputs=[flag_output]
                )

            # TAB 4: VIEW FLAGGED RESPONSES
            with gr.Tab("👀 View Flagged Responses"):
                gr.Markdown("## Review Flagged Responses")

                with gr.Row():
                    with gr.Column():
                        flagged_messages_display = gr.Dataframe(
                            headers=["Timestamp", "Flag Reason", "Flagged Message", "Conversation Context"],
                            interactive=False,
                            max_height=400
                        )
                        refresh_btn = gr.Button("🔄 Refresh", variant="primary")

                    with gr.Column():
                        conversation_context_display = gr.Chatbot(
                            label="Conversation Context",
                            height=400
                        )

                # Event handlers for Tab 4
                flagged_messages_display.select(
                    handle_row_select,
                    outputs=[conversation_context_display]
                )

                refresh_btn.click(
                    read_flagged_messages,
                    outputs=[flagged_messages_display]
                )

            # TAB 5: MODEL EVAL RESULTS
            with gr.Tab("📈 Model Eval Results"):
                gr.Markdown("## Model Evaluation Results")
                gr.Markdown("### 🚧 Coming Soon")
                gr.Markdown(
                    "This section will display comprehensive model evaluation metrics, charts, and performance analysis."
                )

                # Placeholder content
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("#### Evaluation Metrics")
                        gr.Markdown("- Accuracy scores")
                        gr.Markdown("- Performance benchmarks")
                        gr.Markdown("- Comparative analysis")

                    with gr.Column():
                        gr.Markdown("#### Visualization")
gr.Markdown("- Performance charts") gr.Markdown("- Score distributions") gr.Markdown("- Trend analysis") # TAB 6: ABOUT with gr.Tab("â„šī¸ About"): gr.Markdown("## About Loggenix MOE Model") gr.Markdown(""" ### Model: `kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.2-finetuned-tool` This is a fine-tuned Mixture of Experts (MOE) model designed for specialized AI tasks with tool calling capabilities. #### Key Features: - **Architecture**: MOE with 0.3B total parameters, 0.1B active parameters - **Training**: Fine-tuned with learning rate 7e-5, batch size 16 - **Hardware**: Optimized for RTX 4090 GPU - **Capabilities**: Tool calling, instruction following, task-specific responses #### Model Specifications: - **Total Parameters**: 0.3B - **Active Parameters**: 0.1B - **Context Length**: 4096 tokens - **Precision**: FP16 for optimal performance - **Flash Attention**: Supported for faster inference #### Sample Inference Code: ```python from transformers import AutoModelForCausalLM, AutoTokenizer import torch # Load model and tokenizer model_id = "kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.2-finetuned-tool" tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_pretrained( model_id, device_map="auto", torch_dtype=torch.float16, attn_implementation="flash_attention_2" ).eval() # Prepare messages messages = [ {"role": "system", "content": "You are a helpful AI assistant."}, {"role": "user", "content": "Calculate 25 + 37"} ] # Format and generate prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = tokenizer(prompt, return_tensors="pt").to("cuda") with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=512, do_sample=True, temperature=0.7, pad_token_id=tokenizer.pad_token_id ) response = tokenizer.decode(outputs[0], skip_special_tokens=True) print(response) ``` #### Tool Calling Support: The model supports structured tool calling for mathematical operations, data analysis, and other specialized tasks. #### Performance Optimizations: - **Speed Mode**: Max 512 new tokens for fast responses - **Balanced Mode**: Max 2048 new tokens for comprehensive answers - **Full Capacity**: Dynamic token allocation up to context limit --- **Developed by**: Kshitij Thakkar **Version**: v6.2 **License**: Please check model repository for licensing details """) # Load initial data demo.load( fn=read_flagged_messages, outputs=[flagged_messages_display] ) return demo # Launch the application if __name__ == "__main__": print("Starting AI Tasks Evaluation Suite...") demo = create_interface() demo.launch( server_name="0.0.0.0", server_port=7860, share=False, debug=True )