Zen0 Claude committed
Commit 83123ec · 1 Parent(s): 629509e

Fix dependency issues and remove gated models


Critical fixes based on evaluation results:

🔧 Added Missing Dependencies (see the sketch after this list):
- sentencepiece>=0.1.99 (required for Phi-3, TinyLlama tokenizers)
- protobuf>=3.20.0 (required for model loading)
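
As a quick check of why these pins matter: Phi-3 and TinyLlama ship SentencePiece-based tokenizers, so loading them exercises both new dependencies. A minimal sketch (assumes `transformers` is installed and network access to the Hub; the model IDs are the ones listed above):

```python
# Without sentencepiece/protobuf installed, AutoTokenizer.from_pretrained
# raises an ImportError asking for them for these two repos.
from transformers import AutoTokenizer

for repo in ("microsoft/Phi-3-mini-4k-instruct", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
    tok = AutoTokenizer.from_pretrained(repo)
    print(repo, "->", len(tok("Essential Eight")["input_ids"]), "tokens")
```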

❌ Removed Gated Models (require authentication; sketch after this list):
- google/gemma-2-2b-it
- meta-llama/Llama-3.2-3B-Instruct
- meta-llama/Llama-3.1-8B-Instruct
- google/gemma-2-9b-it
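
For reference, a minimal sketch of the failure these removals avoid: gated repos reject anonymous downloads until the licence is accepted and a token is supplied, so an unauthenticated Space errors out at load time. The `try/except` below is illustrative, not code from app.py:

```python
# Illustrative only: loading a gated repo without credentials fails.
# huggingface_hub raises GatedRepoError; transformers may surface it as OSError.
from huggingface_hub.utils import GatedRepoError
from transformers import AutoTokenizer

try:
    AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
except (GatedRepoError, OSError) as err:
    print(f"Gated model skipped: {err}")
```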

❌ Removed Poor Performers:
- stabilityai/stablelm-2-1_6b-chat (0% accuracy)

✅ Model Count: 32 → 25 open, working models

📊 Updated UI:
- Recommended: 7 → 6 models
- Small: 7 → 4 models
- Medium: 6 → 4 models
- Total: 32 → 25 models
- All models are now open (no gated repos)
- Highlighted Qwen2.5-3B: 55.6% accuracy!

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2):
  1. app.py +12 -15
  2. requirements.txt +2 -0
app.py CHANGED

@@ -29,8 +29,7 @@ MODELS_BY_CATEGORY = {
     "✅ Recommended (Tested)": [
         "microsoft/Phi-3-mini-4k-instruct",    # Proven stable
         "microsoft/Phi-3.5-mini-instruct",     # Works well
-        "google/gemma-2-2b-it",                # Tested
-        "Qwen/Qwen2.5-3B-Instruct",            # Reliable
+        "Qwen/Qwen2.5-3B-Instruct",            # Just tested 55.6%! ⭐
         "Qwen/Qwen2.5-7B-Instruct",            # Good performance
         "deepseek-ai/deepseek-llm-7b-chat",    # Previously tested 55%+
         "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # Previously tested 33%+
@@ -45,19 +44,17 @@ MODELS_BY_CATEGORY = {
     "Small Models (1-4B)": [
         "microsoft/Phi-3-mini-4k-instruct",
         "microsoft/Phi-3.5-mini-instruct",
-        "google/gemma-2-2b-it",
         "Qwen/Qwen2.5-3B-Instruct",
-        "meta-llama/Llama-3.2-3B-Instruct",
-        "stabilityai/stablelm-2-1_6b-chat",
         "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        # Removed gated models: google/gemma-2-2b-it, meta-llama/Llama-3.2-3B-Instruct
+        # Removed: stabilityai/stablelm-2-1_6b-chat (0% accuracy)
     ],
     "Medium Models (7-12B)": [
         "mistralai/Mistral-7B-Instruct-v0.3",
         "Qwen/Qwen2.5-7B-Instruct",
-        "meta-llama/Llama-3.1-8B-Instruct",
-        "google/gemma-2-9b-it",
         "mistralai/Mistral-Nemo-Instruct-2407",
         "01-ai/Yi-1.5-9B-Chat",
+        # Removed gated models: meta-llama/Llama-3.1-8B-Instruct, google/gemma-2-9b-it
     ],
     "Reasoning & Analysis": [
         "deepseek-ai/deepseek-llm-7b-chat",
@@ -545,12 +542,12 @@ with gr.Blocks(title="AusCyberBench Evaluation Dashboard", theme=gr.themes.Soft(
     gr.Markdown("""
     # 🇦🇺 AusCyberBench Evaluation Dashboard

-    **Australia's First LLM Cybersecurity Benchmark** • 13,449 Tasks • 32 Tested Models
+    **Australia's First LLM Cybersecurity Benchmark** • 13,449 Tasks • 25 Open Models

-    Evaluate proven language models on Australian cybersecurity knowledge including
+    Evaluate proven open language models on Australian cybersecurity knowledge including
     Essential Eight, ISM Controls, Privacy Act, SOCI Act, and ACSC Threat Intelligence.

-    ✅ **Recommended models** have been tested and show reliable performance.
+    ✅ **Recommended models** have been tested: Qwen2.5-3B (55.6%), DeepSeek (55%), TinyLlama (33%)
     """)

     with gr.Row():
@@ -559,13 +556,13 @@ with gr.Blocks(title="AusCyberBench Evaluation Dashboard", theme=gr.themes.Soft(

     # Quick selection buttons
     with gr.Row():
-        btn_recommended = gr.Button("✅ Recommended (7)", size="sm", variant="primary")
+        btn_recommended = gr.Button("✅ Recommended (6)", size="sm", variant="primary")
         btn_security = gr.Button("🛡️ Security (5)", size="sm", variant="secondary")
     with gr.Row():
-        btn_small = gr.Button("Small (7)", size="sm")
-        btn_medium = gr.Button("Medium (6)", size="sm")
+        btn_small = gr.Button("Small (4)", size="sm")
+        btn_medium = gr.Button("Medium (4)", size="sm")
     with gr.Row():
-        btn_all = gr.Button("Select All (32)", size="sm")
+        btn_all = gr.Button("Select All (25)", size="sm")
         btn_clear = gr.Button("Clear All", size="sm")

     # Model checkboxes by category
@@ -653,7 +650,7 @@ with gr.Blocks(title="AusCyberBench Evaluation Dashboard", theme=gr.themes.Soft(
     gr.Markdown("""
     ---
     **Dataset:** [Zen0/AusCyberBench](https://huggingface.co/datasets/Zen0/AusCyberBench) • 13,449 tasks |
-    **Models:** 32 tested LLMs |
+    **Models:** 25 open LLMs (no gated models) |
     **License:** MIT
     """)
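
The button labels above only carry the new counts; the click handlers that actually apply a selection sit outside this diff. A hypothetical sketch of how such wiring typically looks in Gradio (not the app's actual code; the `RECOMMENDED` list mirrors the category above):

```python
# Hypothetical quick-select wiring: clicking "Recommended" replaces the
# CheckboxGroup's current selection; "Clear All" empties it.
import gradio as gr

RECOMMENDED = [
    "microsoft/Phi-3-mini-4k-instruct",
    "microsoft/Phi-3.5-mini-instruct",
    "Qwen/Qwen2.5-3B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
    "deepseek-ai/deepseek-llm-7b-chat",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
]

with gr.Blocks() as demo:
    models = gr.CheckboxGroup(choices=RECOMMENDED, label="Models")
    btn_recommended = gr.Button("✅ Recommended (6)", size="sm", variant="primary")
    btn_clear = gr.Button("Clear All", size="sm")
    btn_recommended.click(lambda: gr.update(value=RECOMMENDED), outputs=models)
    btn_clear.click(lambda: gr.update(value=[]), outputs=models)

demo.launch()
```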
 
requirements.txt CHANGED

@@ -7,3 +7,5 @@ datasets>=2.18.0
 pandas>=2.0.0
 matplotlib>=3.7.0
 seaborn>=0.13.0
+sentencepiece>=0.1.99
+protobuf>=3.20.0
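
After reinstalling from the updated file, a short import check confirms the two new packages resolve (nothing here is specific to this repo):

```python
# Verify the new requirements after `pip install -r requirements.txt`.
import sentencepiece
import google.protobuf

print("sentencepiece", sentencepiece.__version__)
print("protobuf", google.protobuf.__version__)
```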