import gradio as gr
import yaml
import json
from typing import Dict, List, Tuple
from datetime import datetime

class AIEvaluationForm:
    def __init__(self, template_file: str = "questions.yaml"):
        """Initialize the evaluation form with questions from a YAML file."""
        self.template_file = template_file
        self.template = self.load_template()
        self.components = {}

    def load_template(self) -> Dict:
        """Load the evaluation template from the YAML file."""
        try:
            with open(self.template_file, 'r', encoding='utf-8') as f:
                return yaml.safe_load(f)
        except FileNotFoundError:
            raise FileNotFoundError(
                f"Template file '{self.template_file}' not found. "
                "Please ensure the file exists."
            )
        except yaml.YAMLError as e:
            raise ValueError(f"Error parsing YAML file: {e}")
    def create_system_info_section(self) -> Tuple[List, Dict]:
        """Create the system information section."""
        components = {}
        with gr.Group():
            gr.Markdown("## 📋 AI System Information")
            gr.Markdown("*Please provide basic information about the AI system being evaluated.*")
            components['name'] = gr.Textbox(
                label="AI System Name",
                placeholder="e.g., GPT-4, BERT, StarCoder2",
                info="The official name of your AI system"
            )
            components['provider'] = gr.Textbox(
                label="Provider/Organization",
                placeholder="e.g., OpenAI, Google, BigCode",
                info="The organization that developed the system"
            )
            components['url'] = gr.Textbox(
                label="System URL",
                placeholder="e.g., https://huggingface.co/model-name",
                info="URL to the model, paper, or documentation"
            )
            components['type'] = gr.Dropdown(
                choices=[
                    "Generative Model",
                    "Discriminative Model/Classifier",
                    "Regressor",
                    "(Reinforcement Learning) Agent",
                    "Other"
                ],
                label="System Type",
                value="Generative Model",
                info="Primary category of the AI system"
            )
            components['input modalities'] = gr.CheckboxGroup(
                choices=["Text", "Image", "Audio", "Video", "Tabular"],
                label="Input Modalities (select all that apply)",
                value=["Text"],
                info="Input modalities supported by the system"
            )
            components['output modalities'] = gr.CheckboxGroup(
                choices=["Text", "Image", "Audio", "Video", "Tabular"],
                label="Output Modalities (select all that apply)",
                value=["Text"],
                info="Output modalities supported by the system"
            )
        return list(components.values()), components
    def create_evaluation_sections(self) -> Tuple[List, Dict]:
        """Create dynamic evaluation sections from the template."""
        all_components = []
        section_components = {}
        for section_name, section_data in self.template.items():
            with gr.Group():
                gr.Markdown(f"## {section_name}")
                section_components[section_name] = {}
                for subsection_name, subsection_data in section_data.items():
                    with gr.Accordion(subsection_name, open=False):
                        # Explainer text
                        gr.Markdown(f"**Explainer:** {subsection_data['explainer']}")
                        # Overall status
                        status_component = gr.Radio(
                            choices=["Yes", "No", "N/A"],
                            label="Overall Status",
                            value="N/A",
                            info="Does this subsection apply to your system and have you conducted these evaluations?"
                        )
                        # Sources/Evidence
                        sources_component = gr.Textbox(
                            label="Sources & Evidence",
                            placeholder=(
                                "Enter sources, papers, benchmarks, or evidence (one per line)\n"
                                "Example:\n"
                                "https://arxiv.org/abs/2402.19173\n"
                                "BOLD Bias Benchmark\n"
                                "Internal evaluation report"
                            ),
                            lines=4,
                            info="Provide references to evaluations, papers, benchmarks, or internal reports"
                        )
                        # Individual questions
                        gr.Markdown("**Detailed Questions:**")
                        question_components = {}
                        # IMPORTANT: add components in the order the handler expects --
                        # status, sources, then the questions for this subsection
                        all_components.extend([status_component, sources_component])
                        for question in subsection_data['questions']:
                            question_component = gr.Checkbox(
                                label=question,
                                value=False
                                # info="Check if this evaluation has been performed"
                            )
                            question_components[question] = question_component
                            all_components.append(question_component)
                        section_components[section_name][subsection_name] = {
                            'status': status_component,
                            'sources': sources_component,
                            'questions': question_components
                        }
        return all_components, section_components
    def parse_sources(self, sources_text: str) -> List[Dict]:
        """Parse the free-text sources field into a structured list."""
        sources = []
        # Handle the case where sources_text might not be a string
        if not isinstance(sources_text, str):
            return sources
        if not sources_text.strip():
            return sources
        for line in sources_text.strip().split('\n'):
            line = line.strip()
            if not line:
                continue
            # Determine the source type based on content
            if line.startswith('http'):
                source_type = "🔗"
                name = line.split('/')[-1] if '/' in line else line
            elif 'internal' in line.lower() or 'proprietary' in line.lower():
                source_type = "🏢"
                name = line
            else:
                source_type = "📄"
                name = line
            sources.append({
                "type": source_type,
                "detail": line,
                "name": name
            })
        return sources
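
    # For example, a hedged illustration of the parsing above:
    #   parse_sources("https://arxiv.org/abs/2402.19173\nInternal evaluation report")
    # returns:
    #   [{"type": "🔗", "detail": "https://arxiv.org/abs/2402.19173", "name": "2402.19173"},
    #    {"type": "🏢", "detail": "Internal evaluation report",
    #     "name": "Internal evaluation report"}]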
    def generate_scorecard(self, *args) -> Tuple[Dict, str]:
        """Generate the scorecard JSON from the form inputs."""
        # Debug: print argument types and counts
        print(f"Total arguments received: {len(args)}")
        for i, arg in enumerate(args[:10]):  # Print the first 10 for debugging
            print(f"Arg {i}: {type(arg)} = {arg}")
        # Extract the system info (first 6 arguments)
        name, provider, url, sys_type, inp_modalities, out_modalities = args[:6]
        remaining_args = list(args[6:])
        # Build metadata
        metadata = {
            "Name": name or "Unknown",
            "Provider": provider or "Unknown",
            "URL": url or "",
            "Type": sys_type or "Unknown",
            "Input Modalities": inp_modalities or [],
            "Output Modalities": out_modalities or []
        }
        # Build scores
        scores = {}
        arg_index = 0
        for section_name, section_data in self.template.items():
            scores[section_name] = {}
            for subsection_name, subsection_data in section_data.items():
                # Get status and sources (the next 2 arguments)
                if arg_index < len(remaining_args):
                    status = remaining_args[arg_index]
                    print(f"Status for {section_name}/{subsection_name}: {type(status)} = {status}")
                else:
                    status = "N/A"
                if arg_index + 1 < len(remaining_args):
                    sources_text = remaining_args[arg_index + 1]
                    print(f"Sources for {section_name}/{subsection_name}: {type(sources_text)} = {sources_text}")
                else:
                    sources_text = ""
                # Ensure sources_text is a string
                if not isinstance(sources_text, str):
                    sources_text = str(sources_text) if sources_text is not None else ""
                # Parse sources
                sources = self.parse_sources(sources_text)
                # Get question responses
                questions_dict = {}
                question_start_index = arg_index + 2
                num_questions = len(subsection_data['questions'])
                for i, question in enumerate(subsection_data['questions']):
                    q_index = question_start_index + i
                    if q_index < len(remaining_args):
                        questions_dict[question] = remaining_args[q_index]
                    else:
                        questions_dict[question] = False
                # Store subsection data
                scores[section_name][subsection_name] = {
                    "status": status,
                    "sources": sources,
                    "questions": questions_dict
                }
                # Move to the next subsection (2 for status/sources + number of questions)
                arg_index += 2 + num_questions
        # Create the final scorecard
        scorecard = {
            "metadata": metadata,
            "scores": scores
        }
        # Generate a filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_name = (name or "ai_system").replace(' ', '_').lower()
        filename = f"{safe_name}_scorecard_{timestamp}.json"
        return scorecard, filename
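
    # Shape of the generated scorecard (an illustrative sketch of the structure
    # built above; the keys under "scores" come from questions.yaml):
    #   {
    #     "metadata": {"Name": ..., "Provider": ..., "URL": ..., "Type": ...,
    #                  "Input Modalities": [...], "Output Modalities": [...]},
    #     "scores": {
    #       "<section>": {
    #         "<subsection>": {
    #           "status": "Yes" | "No" | "N/A",
    #           "sources": [{"type": ..., "detail": ..., "name": ...}],
    #           "questions": {"<question text>": true/false}
    #         }
    #       }
    #     }
    #   }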
    def create_interface(self):
        """Create the complete Gradio interface."""
        with gr.Blocks(
            title="AI System Evaluation Scorecard",
            # theme=gr.themes.Soft(),
            css="""
            .gradio-container {
                max-width: 1400px !important;
                margin: 0 auto !important;
                padding: 20px !important;
                width: 95% !important;
            }
            .main {
                max-width: 1400px !important;
                margin: 0 auto !important;
                width: 100% !important;
            }
            .container {
                max-width: 1400px !important;
                margin: 0 auto !important;
                width: 100% !important;
            }
            .accordion-header {
                background-color: #f0f0f0 !important;
            }
            .block {
                width: 100% !important;
            }
            /* Ensure form elements use the full width */
            .form {
                width: 100% !important;
            }
            /* Center the entire app */
            #root {
                display: flex !important;
                justify-content: center !important;
                width: 100% !important;
            }
            """
        ) as demo:
            # Header
            gr.Markdown("""
            # 📋 AI System Evaluation Scorecard

            This comprehensive evaluation form helps you assess AI systems across multiple dimensions, including bias,
            cultural sensitivity, environmental impact, privacy, and more. Complete the sections relevant to your system
            to generate a detailed scorecard.

            ---
            """)

            # System information section
            system_inputs, system_components = self.create_system_info_section()

            # Evaluation sections
            eval_inputs, eval_components = self.create_evaluation_sections()
            self.components = {**system_components, **eval_components}

            # Generate button and outputs
            with gr.Group():
                gr.Markdown("## 📊 Generate Scorecard")
                with gr.Row():
                    generate_btn = gr.Button(
                        "🚀 Generate Evaluation Scorecard",
                        variant="primary",
                        size="lg",
                        scale=2
                    )
                    clear_btn = gr.Button(
                        "🗑️ Clear Form",
                        variant="secondary",
                        scale=1
                    )

            # Progress indicator
            progress = gr.Progress()

            # Outputs
            with gr.Group():
                gr.Markdown("### 📄 Generated Scorecard")
                with gr.Row():
                    json_output = gr.JSON(
                        label="Scorecard JSON",
                        show_label=True
                    )
                with gr.Row():
                    download_file = gr.File(
                        label="Download Scorecard",
                        visible=False
                    )
                    download_btn = gr.Button(
                        "💾 Download JSON",
                        visible=False,
                        variant="secondary"
                    )
            # Event handlers
            all_inputs = system_inputs + eval_inputs

            def generate_with_progress(*args):
                """Generate the scorecard with progress indication."""
                progress(0.3, desc="Processing inputs...")
                scorecard, filename = self.generate_scorecard(*args)
                progress(0.7, desc="Generating JSON...")
                json_content = json.dumps(scorecard, indent=2)
                progress(1.0, desc="Complete!")
                # Save to a local file so it can be offered for download
                with open(filename, 'w') as f:
                    f.write(json_content)
                return (
                    scorecard,                              # JSON display
                    gr.File(value=filename, visible=True),  # File for download
                    gr.Button(visible=True)                 # Show the download button
                )

            def clear_form():
                """Clear all form inputs."""
                return [None] * len(all_inputs)

            # Wire up events
            generate_btn.click(
                fn=generate_with_progress,
                inputs=all_inputs,
                outputs=[json_output, download_file, download_btn],
                show_progress="full"
            )
            clear_btn.click(
                fn=clear_form,
                outputs=all_inputs
            )
            # Example data button
            with gr.Group():
                gr.Markdown("### 🚀 Quick Start")
                example_btn = gr.Button("📝 Load Example Data", variant="secondary")

            def load_example():
                """Load example data for a StarCoder2-like system."""
                example_data = [
                    "StarCoder2",                                     # name
                    "BigCode",                                        # provider
                    "https://huggingface.co/bigcode/starcoder2-15b",  # url
                    "Generative Model",                               # type
                    ["Text"],                                         # input modalities
                    ["Text"]                                          # output modalities
                ]
                # Add default values for the evaluation sections (all N/A initially)
                remaining_defaults = []
                for section_name, section_data in self.template.items():
                    for subsection_name, subsection_data in section_data.items():
                        remaining_defaults.extend([
                            "N/A",  # status
                            "",     # sources
                            *([False] * len(subsection_data['questions']))  # questions
                        ])
                return example_data + remaining_defaults

            example_btn.click(
                fn=load_example,
                outputs=all_inputs
            )

        return demo

def main():
    """Run the application."""
    try:
        # Create the evaluation form
        eval_form = AIEvaluationForm("questions.yaml")

        # Create and launch the interface
        demo = eval_form.create_interface()
        print("🚀 Launching AI Evaluation Scorecard...")
        print(f"📋 Loading questions from: {eval_form.template_file}")
        print(f"📊 Found {len(eval_form.template)} evaluation categories")

        # Count the total number of questions
        total_questions = sum(
            len(subsection['questions'])
            for section in eval_form.template.values()
            for subsection in section.values()
        )
        print(f"✅ Total evaluation questions: {total_questions}")

        demo.launch(
            ssr_mode=False,
            share=False,
            inbrowser=False,
            show_error=True,
            quiet=False
        )
    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        print("Please ensure 'questions.yaml' exists in the current directory.")
    except Exception as e:
        print(f"❌ Unexpected error: {e}")


if __name__ == "__main__":
    main()
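
# To run locally (assumed setup, not pinned to specific versions):
#   pip install gradio pyyaml
#   python app.py            # assuming this file is saved as app.py
# The app expects questions.yaml in the working directory and writes the
# generated <name>_scorecard_<timestamp>.json file next to it.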