Fix dependency issues and remove gated models
Critical fixes based on evaluation results:
🔧 Added Missing Dependencies (a preflight check is sketched after this list):
- sentencepiece>=0.1.99 (required for Phi-3, TinyLlama tokenizers)
- protobuf>=3.20.0 (required for model loading)
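For context, a minimal sketch of what these packages unblock, assuming nothing beyond the model list above: transformers imports sentencepiece and protobuf lazily, so a missing package only surfaces when the first SentencePiece-based tokenizer (Phi-3, TinyLlama) is loaded. This check is illustrative and not part of app.py.

```python
# Illustrative preflight check (not part of app.py): confirm the tokenizer
# stack is complete before queuing an evaluation run.
import importlib.util

from transformers import AutoTokenizer

# transformers pulls in sentencepiece/protobuf lazily, so a missing package
# only surfaces when the first SentencePiece tokenizer is loaded.
for pkg in ("sentencepiece", "google.protobuf"):
    if importlib.util.find_spec(pkg) is None:
        raise RuntimeError(f"Missing dependency: {pkg} (see requirements.txt)")

# Phi-3 and TinyLlama both ship SentencePiece tokenizers; loading one is a
# cheap smoke test for the environment.
tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
print(tok.tokenize("Essential Eight maturity model"))
```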
❌ Removed Gated Models (require authentication; a detection sketch follows this list):
- google/gemma-2-2b-it
- meta-llama/Llama-3.2-3B-Instruct
- meta-llama/Llama-3.1-8B-Instruct
- google/gemma-2-9b-it
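A hedged sketch of catching gated repos up front rather than at download time: huggingface_hub's `model_info` exposes a `gated` field on `ModelInfo` (`False` for open repos, `"auto"` or `"manual"` for gated ones). The `open_models_only` helper below is hypothetical, not code from this Space.

```python
# Hypothetical helper (not in app.py): keep only repos that can be downloaded
# without accepting a license, so the Space never hits a 403 mid-run.
from huggingface_hub import model_info
from huggingface_hub.utils import HfHubHTTPError

def open_models_only(repo_ids):
    kept = []
    for repo_id in repo_ids:
        try:
            # ModelInfo.gated is False for open repos, "auto"/"manual" for gated.
            if not model_info(repo_id).gated:
                kept.append(repo_id)
        except HfHubHTTPError:
            # Private or unreachable repo: treat it as unusable too.
            continue
    return kept

print(open_models_only([
    "Qwen/Qwen2.5-3B-Instruct",          # open  -> kept
    "meta-llama/Llama-3.1-8B-Instruct",  # gated -> dropped
]))
```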
❌ Removed Poor Performers:
- stabilityai/stablelm-2-1_6b-chat (0% accuracy)

✅ Model Count: 32 → 25 open, working models
📊 Updated UI (quick-select button wiring sketched after the commit message):
- Recommended: 7 → 6 models
- Small: 7 → 4 models
- Medium: 6 → 4 models
- Total: 32 → 25 models
- All models are now open (no gated repos)
- Highlighted Qwen2.5-3B: 55.6% accuracy!
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
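For readers unfamiliar with the button counts changed in the diff below, here is a minimal sketch of the quick-select pattern, assuming one shared `gr.CheckboxGroup`; the real app.py may group checkboxes per category instead.

```python
# Minimal sketch of the quick-select pattern (real app.py wiring may differ).
import gradio as gr

MODELS_BY_CATEGORY = {
    "✅ Recommended (Tested)": [
        "microsoft/Phi-3-mini-4k-instruct",
        "Qwen/Qwen2.5-3B-Instruct",
    ],
}

with gr.Blocks() as demo:
    all_models = [m for group in MODELS_BY_CATEGORY.values() for m in group]
    selector = gr.CheckboxGroup(choices=all_models, label="Models")

    btn_recommended = gr.Button("✅ Recommended (6)", size="sm", variant="primary")
    btn_clear = gr.Button("Clear All", size="sm")

    # Each button simply returns the new value for the CheckboxGroup.
    btn_recommended.click(
        lambda: MODELS_BY_CATEGORY["✅ Recommended (Tested)"],
        outputs=selector,
    )
    btn_clear.click(lambda: [], outputs=selector)

demo.launch()
```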
- app.py +12 -15
- requirements.txt +2 -0
```diff
--- a/app.py
+++ b/app.py
@@ -29,8 +29,7 @@ MODELS_BY_CATEGORY = {
     "✅ Recommended (Tested)": [
         "microsoft/Phi-3-mini-4k-instruct",  # Proven stable
         "microsoft/Phi-3.5-mini-instruct",  # Works well
-        "
-        "Qwen/Qwen2.5-3B-Instruct",  # Reliable
+        "Qwen/Qwen2.5-3B-Instruct",  # Just tested 55.6%! ⭐
         "Qwen/Qwen2.5-7B-Instruct",  # Good performance
         "deepseek-ai/deepseek-llm-7b-chat",  # Previously tested 55%+
         "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # Previously tested 33%+
@@ -45,19 +44,17 @@ MODELS_BY_CATEGORY = {
     "Small Models (1-4B)": [
         "microsoft/Phi-3-mini-4k-instruct",
         "microsoft/Phi-3.5-mini-instruct",
-        "google/gemma-2-2b-it",
         "Qwen/Qwen2.5-3B-Instruct",
-        "meta-llama/Llama-3.2-3B-Instruct",
-        "stabilityai/stablelm-2-1_6b-chat",
         "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        # Removed gated models: google/gemma-2-2b-it, meta-llama/Llama-3.2-3B-Instruct
+        # Removed: stabilityai/stablelm-2-1_6b-chat (0% accuracy)
     ],
     "Medium Models (7-12B)": [
         "mistralai/Mistral-7B-Instruct-v0.3",
         "Qwen/Qwen2.5-7B-Instruct",
-        "meta-llama/Llama-3.1-8B-Instruct",
-        "google/gemma-2-9b-it",
         "mistralai/Mistral-Nemo-Instruct-2407",
         "01-ai/Yi-1.5-9B-Chat",
+        # Removed gated models: meta-llama/Llama-3.1-8B-Instruct, google/gemma-2-9b-it
     ],
     "Reasoning & Analysis": [
         "deepseek-ai/deepseek-llm-7b-chat",
@@ -545,12 +542,12 @@ with gr.Blocks(title="AusCyberBench Evaluation Dashboard", theme=gr.themes.Soft(
     gr.Markdown("""
     # 🇦🇺 AusCyberBench Evaluation Dashboard
 
-    **Australia's First LLM Cybersecurity Benchmark** • 13,449 Tasks •
+    **Australia's First LLM Cybersecurity Benchmark** • 13,449 Tasks • 25 Open Models
 
-    Evaluate proven language models on Australian cybersecurity knowledge including
+    Evaluate proven open language models on Australian cybersecurity knowledge including
     Essential Eight, ISM Controls, Privacy Act, SOCI Act, and ACSC Threat Intelligence.
 
-    ✅ **Recommended models** have been tested
+    ✅ **Recommended models** have been tested: Qwen2.5-3B (55.6%), DeepSeek (55%), TinyLlama (33%)
     """)
 
     with gr.Row():
@@ -559,13 +556,13 @@ with gr.Blocks(title="AusCyberBench Evaluation Dashboard", theme=gr.themes.Soft(
 
     # Quick selection buttons
     with gr.Row():
-        btn_recommended = gr.Button("✅ Recommended (7)", size="sm", variant="primary")
+        btn_recommended = gr.Button("✅ Recommended (6)", size="sm", variant="primary")
         btn_security = gr.Button("🛡️ Security (5)", size="sm", variant="secondary")
     with gr.Row():
-        btn_small = gr.Button("Small (7)", size="sm")
-        btn_medium = gr.Button("Medium (6)", size="sm")
+        btn_small = gr.Button("Small (4)", size="sm")
+        btn_medium = gr.Button("Medium (4)", size="sm")
     with gr.Row():
-        btn_all = gr.Button("Select All (32)", size="sm")
+        btn_all = gr.Button("Select All (25)", size="sm")
         btn_clear = gr.Button("Clear All", size="sm")
 
     # Model checkboxes by category
@@ -653,7 +650,7 @@ with gr.Blocks(title="AusCyberBench Evaluation Dashboard", theme=gr.themes.Soft(
     gr.Markdown("""
     ---
     **Dataset:** [Zen0/AusCyberBench](https://huggingface.co/datasets/Zen0/AusCyberBench) • 13,449 tasks |
-    **Models:**
+    **Models:** 25 open LLMs (no gated models) |
     **License:** MIT
     """)
```
```diff
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,5 @@ datasets>=2.18.0
 pandas>=2.0.0
 matplotlib>=3.7.0
 seaborn>=0.13.0
+sentencepiece>=0.1.99
+protobuf>=3.20.0
```