{ "time": "20250422", "results": { "CyberSecEval-3":{ "Social engineering":{ "GPT-4-Turbo": 79.6, "Qwen2-72B-Instruct": 70.4, "Llama-3-70B": 59, "Llama-3-405B": 52, "Mixtral-8x22B": 33.6 }, "Software vulnerability exploitation":{ "GPT-4-Turbo": 40, "Gemini Pro 1.0": 29, "Llama-3-70B": 41, "Llama-3-405B": 49, "Mixtral-8x22B": 35 }, "Prompt injection attack success rates": { "GPT-4-Turbo": 17, "Gemini Pro 1.0": 18, "Llama-3-70B": 26, "Llama-3-405B": 22, "Mixtral-8x22B": 35, "Qwen2-72B-Instruct": 20 }, "Autocomplete-based insecure code generation": { "GPT-4-Turbo": 30, "Gemini Pro 1.0": 25, "Llama-3-70B": 28, "Llama-3-405B": 31, "Mixtral-8x22B": 25, "Qwen2-72B-Instruct": 30 }, "Instruction-based insecure code generation": { "GPT-4-Turbo": 35, "Gemini Pro 1.0": 32, "Llama-3-70B": 35, "Llama-3-405B": 39, "Mixtral-8x22B": 34, "Qwen2-72B-Instruct": 34 }, "Code interpreter abuse compliance rates":{ "GPT-4-Turbo": 1, "Gemini Pro 1.0": 11, "Llama-3-70B": 42, "Llama-3-405B": 1, "Mixtral-8x22B": 20, "Qwen2-72B-Instruct": 5 }, "Cyber attack helpfulness compliance rates": { "GPT-4-Turbo": 42, "Gemini Pro 1.0": 92, "Llama-3-70B": 78, "Llama-3-405B": 38, "Mixtral-8x22B": 80, "Qwen2-72B-Instruct": 47 } }, "SecCodePLT": { "Secure instruction generation failure rates w/o security policy (rule-based metric)": { "CodeLlama-34B-Instruct": 66, "Llama-3.1-70B": 47, "Mixtral-8x22B": 58, "GPT-4o": 44 }, "Secure instruction generation failure rates w/ security policy (rule-based metric)": { "CodeLlama-34B-Instruct": 41, "Llama-3.1-70B": 17, "Mixtral-8x22B": 30, "GPT-4o": 16 }, "Secure instruction generation failure rates w/o security policy (Pass@1)": { "CodeLlama-34B-Instruct": 77, "Llama-3.1-70B": 62, "Mixtral-8x22B": 66, "GPT-4o": 47 }, "Secure instruction generation failure rates w/ security policy (Pass@1)": { "CodeLlama-34B-Instruct": 70, "Llama-3.1-70B": 53, "Mixtral-8x22B": 58, "GPT-4o": 38 }, "Secure code completion failure rates w/o security policy (rule-based metric)": { 
"CodeLlama-34B-Instruct": 78, "Llama-3.1-70B": 54, "Mixtral-8x22B": 66, "GPT-4o": 48 }, "Secure code completion failure rates w/ security policy (rule-based metric)": { "CodeLlama-34B-Instruct": 59, "Llama-3.1-70B": 23, "Mixtral-8x22B": 61, "GPT-4o": 21 }, "Secure code completion failure rates w/o security policy (Pass@1)": { "CodeLlama-34B-Instruct": 77, "Llama-3.1-70B": 57, "Mixtral-8x22B": 69, "GPT-4o": 44 }, "Secure code completion failure rates w/ security policy (Pass@1)": { "CodeLlama-34B-Instruct": 75, "Llama-3.1-70B": 45, "Mixtral-8x22B": 56, "GPT-4o": 34 }, "Reconnaissance": { "GPT-4o": 52, "Claude-3.5-Sonnet": 46, "LLaMA-3.1-70B": 10 }, "Weaponization & Infiltration": { "GPT-4o": 10, "Claude-3.5-Sonnet": 0, "LLaMA-3.1-70B": 0 }, "C2 & Execution": { "GPT-4o": 52, "Claude-3.5-Sonnet": 46, "LLaMA-3.1-70B": 10 }, "Discovery": { "GPT-4o": 82, "Claude-3.5-Sonnet": 88, "LLaMA-3.1-70B": 60 }, "Collection": { "GPT-4o": 86, "Claude-3.5-Sonnet": 92, "LLaMA-3.1-70B": 28 } }, "RedCode": { "RedCode-Gen (Accuracy)": { "GPT-4o": { "Base LLM": 69.4, "Code Agent": 72.5 }, "GPT-4": { "Base LLM": 65.0, "Code Agent": 66.9 }, "GPT-3.5": { "Base LLM": 0.0, "Code Agent": 32.5 }, "Claude-3-Opus": { "Base LLM": 1.3, "Code Agent": 3.1 }, "DeepSeekCoder-6.7B": { "Base LLM": 49.4, "Code Agent": 79.4 }, "CodeLlama-7B": { "Base LLM": 40.0, "Code Agent": 42.0 }, "CodeLlama-13B": { "Base LLM": 49.4, "Code Agent": 66.3 }, "Llama-2-7B": { "Base LLM": 16.9, "Code Agent": 20.7 }, "Mistral-7B": { "Base LLM": 46.3, "Code Agent": 75.3 } }, "RedCode-Exec: Python (Attack success rate)": { "GPT-4o": { "ReAct": 77.23 }, "GPT-4": { "ReAct": 64.50 }, "GPT-3.5": { "ReAct": 76.23 }, "Claude-3.5-Sonnet": { "ReAct": 67.63 }, "DeepSeekCoder-6.7B": { "ReAct": 80.23, "OCI": 48.87 }, "DeepSeekCoder-v2-lite": { "ReAct": 79.77 }, "CodeQwen1.5-7B-Chat": { "ReAct": 77.57 }, "Llama-3.1-70B-Instruct": { "ReAct": 76.7 }, "Llama-3.1-8B-Instruct": { "ReAct": 62.87 }, "Llama-3-8B-Instruct": { "ReAct": 42.50 }, 
"CodeLlama-13B": { "CodeAct": 71.87, "ReAct": 60.13, "OCI": 49.07 }, "CodeLlama-7B": { "CodeAct": 61.83, "ReAct": 58.43, "OCI": 46.80 }, "Llama-2-7B": { "CodeAct": 69.95 }, "Mistral-7B": { "CodeAct": 62.60 } }, "RedCode-Exec: Bash (Attack success rate)": { "GPT-4o": { "ReAct": 72.83 }, "GPT-4": { "ReAct": 61.96 }, "GPT-3.5": { "ReAct": 70.38 }, "Claude-3.5-Sonnet": { "ReAct": 62.67 }, "DeepSeekCoder-6.7B": { "ReAct": 73.17 }, "DeepSeekCoder-v2-lite": { "ReAct": 68.42 }, "CodeQwen1.5-7B-Chat": { "ReAct": 71.92 }, "Llama-3.1-70B-Instruct": { "ReAct": 74.38 }, "Llama-3-8B-Instruct": { "ReAct": 62.25 }, "Llama-3.1-8B-Instruct": { "ReAct": 59.83 }, "CodeLlama-13B": { "ReAct": 65.25 }, "CodeLlama-7B": { "ReAct": 56.21 } } }, "CyBench": { "Unguided % solved":{ "GPT-4o": 12.5, "GPT-4.5-preview": 17.5, "o1-preview": 10.0, "o1-mini": 10.0, "o3-mini": 22.5, "Claude-3-Opus": 10.0, "Claude-3.5-Sonnet": 17.5, "Claude-3.7-Sonnet": 20, "Llama-3.1-405B": 7.5, "Mixtral-8x22B": 7.5, "Gemini 1.5 Pro": 7.5, "Llama-3-70B": 5.0 }, "Subtask-guided % solved": { "Claude-3.5-Sonnet": 15.0, "GPT-4o": 17.5, "Claude-3-Opus": 12.5, "o1-preview": 10.0, "Llama-3.1-405B": 15.0, "Mixtral-8x22B": 5.0, "Gemini 1.5 Pro": 5.0, "Llama-3-70B": 7.5 }, "Subtasks % solved": { "Claude-3.5-Sonnet": 43.9, "GPT-4o": 28.7, "Claude-3-Opus": 36.8, "o1-preview": 46.8, "Llama-3.1-405B": 20.5, "Mixtral-8x22B": 15.2, "Gemini 1.5 Pro": 11.7, "Llama-3-70B": 8.2 } }, "NYU CTF Bench": { "Pass@1": { "Claude-3.5-Sonnet": { "D-CIPHER": 19.00, "EnIGMA": 13.50 }, "GPT-4o": { "D-CIPHER": 10.50, "EnIGMA": 9.50 }, "GPT-4": { "EnIGMA": 7.00 } } }, "CyberBench": { "Average": { "Falcon-7B": 39.4, "Falcon-7B-Instruct": 37.5, "Vicuna-7B-v1.5": 53.0, "Mistral-7B-v0.1": 58.1, "Mistral-7B-Instruct-v0.1": 55.0, "Zephyr-7B-beta": 57.7, "Llama-2-7B": 50.6, "Llama-2-7B-Chat": 44.6, "Vicuna-13B-v1.5": 57.3, "Llama-2-13B": 54.1, "Llama-2-13B-Chat": 45.0, "GPT-3.5-Turbo": 62.6, "GPT-4": 69.6 }, "CyNER (F1)": { "Falcon-7B":
24.1, "Falcon-7B-Instruct": 20.4, "Vicuna-7B-v1.5": 25.8, "Mistral-7B-v0.1": 36.7, "Mistral-7B-Instruct-v0.1": 32.3, "Zephyr-7B-beta": 30.0, "Llama-2-7B": 26.3, "Llama-2-7B-Chat": 22.7, "Vicuna-13B-v1.5": 26.2, "Llama-2-13B": 28.6, "Llama-2-13B-Chat": 27.5, "GPT-3.5-Turbo": 33.4, "GPT-4": 55.4 }, "APTNER (F1)": { "Falcon-7B": 17.7, "Falcon-7B-Instruct": 19.1, "Vicuna-7B-v1.5": 27.5, "Mistral-7B-v0.1": 33.0, "Mistral-7B-Instruct-v0.1": 26.2, "Zephyr-7B-beta": 30.5, "Llama-2-7B": 28.0, "Llama-2-7B-Chat": 25.4, "Vicuna-13B-v1.5": 28.1, "Llama-2-13B": 29.9, "Llama-2-13B-Chat": 28.2, "GPT-3.5-Turbo": 40.9, "GPT-4": 50.0 }, "CyNews (R-1/2/L)": { "Falcon-7B": "1.0/0.8/1.0", "Falcon-7B-Instruct": "7.2/2.7/6.0", "Vicuna-7B-v1.5": "36.1/15.9/31.2", "Mistral-7B-v0.1": "3.4/1.7/3.0", "Mistral-7B-Instruct-v0.1": "28.7/11.8/24.5", "Zephyr-7B-beta": "32.0/12.8/27.4", "Llama-2-7B": "0.3/0.3/0.3", "Llama-2-7B-Chat": "25.2/9.6/21.6", "Vicuna-13B-v1.5": "35.6/15.6/30.9", "Llama-2-13B": "0.6/0.5/0.6", "Llama-2-13B-Chat": "3.5/1.3/2.9", "GPT-3.5-Turbo": "35.5/15.4/30.3", "GPT-4": "35.9/15.5/31.2" }, "SecMMLU (Accuracy)": { "Falcon-7B": 27.0, "Falcon-7B-Instruct": 25.0, "Vicuna-7B-v1.5": 64.0, "Mistral-7B-v0.1": 76.0, "Mistral-7B-Instruct-v0.1": 72.0, "Zephyr-7B-beta": 74.0, "Llama-2-7B": 63.0, "Llama-2-7B-Chat": 60.0, "Vicuna-13B-v1.5": 66.0, "Llama-2-13B": 67.0, "Llama-2-13B-Chat": 64.0, "GPT-3.5-Turbo": 78.0, "GPT-4": 83.0 }, "CyQuiz (Accuracy)": { "Falcon-7B": 27.0, "Falcon-7B-Instruct": 21.0, "Vicuna-7B-v1.5": 66.0, "Mistral-7B-v0.1": 77.0, "Mistral-7B-Instruct-v0.1": 69.0, "Zephyr-7B-beta": 75.0, "Llama-2-7B": 62.0, "Llama-2-7B-Chat": 56.0, "Vicuna-13B-v1.5": 74.0, "Llama-2-13B": 67.0, "Llama-2-13B-Chat": 65.0, "GPT-3.5-Turbo": 83.0, "GPT-4": 81.0 }, "MITRE (Accuracy)": { "Falcon-7B": 34.9, "Falcon-7B-Instruct": 30.4, "Vicuna-7B-v1.5": 43.5, "Mistral-7B-v0.1": 50.2, "Mistral-7B-Instruct-v0.1": 47.3, "Zephyr-7B-beta": 43.5, "Llama-2-7B": 44.6, "Llama-2-7B-Chat": 41.6, 
"Vicuna-13B-v1.5": 47.3, "Llama-2-13B": 47.5, "Llama-2-13B-Chat": 42.7, "GPT-3.5-Turbo": 54.5, "GPT-4": 64.9 }, "CVE (Accuracy)": { "Falcon-7B": 54.6, "Falcon-7B-Instruct": 52.9, "Vicuna-7B-v1.5": 60.0, "Mistral-7B-v0.1": 64.6, "Mistral-7B-Instruct-v0.1": 58.7, "Zephyr-7B-beta": 61.9, "Llama-2-7B": 64.7, "Llama-2-7B-Chat": 52.5, "Vicuna-13B-v1.5": 62.3, "Llama-2-13B": 62.1, "Llama-2-13B-Chat": 42.0, "GPT-3.5-Turbo": 58.0, "GPT-4": 63.0 }, "Web (F1)": { "Falcon-7B": 68.9, "Falcon-7B-Instruct": 59.5, "Vicuna-7B-v1.5": 75.3, "Mistral-7B-v0.1": 91.9, "Mistral-7B-Instruct-v0.1": 87.2, "Zephyr-7B-beta": 85.2, "Llama-2-7B": 79.9, "Llama-2-7B-Chat": 48.4, "Vicuna-13B-v1.5": 82.6, "Llama-2-13B": 89.3, "Llama-2-13B-Chat": 58.8, "GPT-3.5-Turbo": 89.2, "GPT-4": 95.4 }, "Email (F1)": { "Falcon-7B": 93.3, "Falcon-7B-Instruct": 93.5, "Vicuna-7B-v1.5": 86.4, "Mistral-7B-v0.1": 96.4, "Mistral-7B-Instruct-v0.1": 88.9, "Zephyr-7B-beta": 86.7, "Llama-2-7B": 94.2, "Llama-2-7B-Chat": 79.4, "Vicuna-13B-v1.5": 86.5, "Llama-2-13B": 96.4, "Llama-2-13B-Chat": 70.3, "GPT-3.5-Turbo": 78.9, "GPT-4": 93.9 }, "HTTP (F1)": { "Falcon-7B": 45.2, "Falcon-7B-Instruct": 48.3, "Vicuna-7B-v1.5": 53.7, "Mistral-7B-v0.1": 52.6, "Mistral-7B-Instruct-v0.1": 47.2, "Zephyr-7B-beta": 66.2, "Llama-2-7B": 42.8, "Llama-2-7B-Chat": 41.0, "Vicuna-13B-v1.5": 72.3, "Llama-2-13B": 52.5, "Llama-2-13B-Chat": 48.5, "GPT-3.5-Turbo": 83.1, "GPT-4": 84.1 } }, "CyberMetric":{ "80 Q (Accuracy)": { "GPT-4o": 96.25, "Mixtral-8x7B-Instruct": 92.50, "GPT-4-Turbo": 96.25, "Falcon-180B-Chat": 90.00, "GPT-3.5-Turbo": 90.00, "Gemini Pro 1.0": 90.00, "Mistral-7B-Instruct-v0.2": 78.75, "Gemma-1.1-7B": 82.50, "Llama-3-8B-Instruct": 81.25, "Flan-T5-XXL": 81.94, "Llama 2-70B": 75.00, "Zephyr-7B-beta": 80.94, "Qwen1.5-MoE-A2.7B": 62.50, "Qwen1.5-7B": 73.75, "Qwen-7B": 43.75, "Phi-2": 53.75, "Llama3-ChatQA-1.5-8B": 53.75, "DeciLM-7B": 52.50, "Qwen1.5-4B": 36.25, "Genstruct-7B": 38.75, "Llama-3-8B": 38.75, "Gemma-7B": 42.50,
"Dolly V2 12b BF16": 33.75, "Gemma-2B": 25.00, "Phi-3-mini-4k-Instruct": 5.00 }, "500 Q (Accuracy)": { "GPT-4o": 93.40, "Mixtral-8x7B-Instruct": 91.80, "GPT-4-Turbo": 93.30, "Falcon-180B-Chat": 87.80, "GPT-3.5-Turbo": 87.30, "Gemini Pro 1.0": 85.05, "Mistral-7B-Instruct-v0.2": 78.40, "Gemma-1.1-7B": 75.40, "Llama-3-8B-Instruct": 76.20, "Flan-T5-XXL": 71.10, "Llama 2-70B": 73.40, "Zephyr-7B-beta": 76.40, "Qwen1.5-MoE-A2.7B": 64.60, "Qwen1.5-7B": 60.60, "Qwen-7B": 58.00, "Phi-2": 48.00, "Llama3-ChatQA-1.5-8B": 52.80, "DeciLM-7B": 47.20, "Qwen1.5-4B": 41.20, "Genstruct-7B": 40.60, "Llama-3-8B": 35.80, "Gemma-7B": 37.20, "Dolly V2 12b BF16": 30.00, "Gemma-2B": 23.20, "Phi-3-mini-4k-Instruct": 5.00 }, "2k Q (Accuracy)": { "GPT-4o": 91.25, "Mixtral-8x7B-Instruct": 91.10, "GPT-4-Turbo": 91.00, "Falcon-180B-Chat": 87.10, "GPT-3.5-Turbo": 88.10, "Gemini Pro 1.0": 84.00, "Mistral-7B-Instruct-v0.2": 76.40, "Gemma-1.1-7B": 75.75, "Llama-3-8B-Instruct": 73.75, "Flan-T5-XXL": 69.00, "Llama 2-70B": 71.60, "Zephyr-7B-beta": 72.50, "Qwen1.5-MoE-A2.7B": 61.65, "Qwen1.5-7B": 61.35, "Qwen-7B": 55.75, "Phi-2": 52.90, "Llama3-ChatQA-1.5-8B": 49.45, "DeciLM-7B": 50.44, "Qwen1.5-4B": 40.50, "Genstruct-7B": 37.55, "Llama-3-8B": 37.00, "Gemma-7B": 36.00, "Dolly V2 12b BF16": 28.75, "Gemma-2B": 18.20, "Phi-3-mini-4k-Instruct": 4.41 }, "10k Q (Accuracy)": { "GPT-4o": 88.89, "Mixtral-8x7B-Instruct": 87.00, "GPT-4-Turbo": 88.50, "Falcon-180B-Chat": 87.00, "GPT-3.5-Turbo": 80.30, "Gemini Pro 1.0": 87.50, "Mistral-7B-Instruct-v0.2": 74.82, "Gemma-1.1-7B": 73.32, "Llama-3-8B-Instruct": 71.25, "Flan-T5-XXL": 67.50, "Llama 2-70B": 66.10, "Zephyr-7B-beta": 65.00, "Qwen1.5-MoE-A2.7B": 60.73, "Qwen1.5-7B": 59.79, "Qwen-7B": 54.09, "Phi-2": 52.13, "Llama3-ChatQA-1.5-8B": 49.64, "DeciLM-7B": 50.75, "Qwen1.5-4B": 40.29, "Genstruct-7B": 36.93, "Llama-3-8B": 36.00, "Gemma-7B": 34.28, "Dolly V2 12b BF16": 27.00, "Gemma-2B": 19.18, "Phi-3-mini-4k-Instruct": 4.80 } }, "TACTL": { "Ground2Crown": { "DeepSeek-R1": 100, 
"DeepSeek-V3": 100, "GPT-4o": 93.3, "Llama-3.1-405B": 93.3, "Qwen2.5-72B-Instruct": 93.3, "Llama-3.1-Tulu-3-70B": 83.3, "Llama-3.3-70B": 80.0, "Mixtral-8x22B": 60.0 }, "TACTL-183": { "DeepSeek-R1": 91.8, "DeepSeek-V3": 86.3, "GPT-4o": 85.2, "Llama-3.1-405B": 88.5, "Qwen2.5-72B-Instruct": 84.2, "Llama-3.1-Tulu-3-70B": 81.4, "Llama-3.3-70B": 78.7, "Mixtral-8x22B": 65.0 } }, "AutoPenBench": { "Autonomous (Success rate)": { "GPT-4o": 21 }, "Autonomous (Progress rate)": { "GPT-4o": 39 }, "Assisted (Success rate)": { "GPT-4o": 64 }, "Assisted (Progress rate)": { "GPT-4o": 53 } }, "PrimeVul": { "Pair-wise Correct Prediction": { "GPT-3.5": { "Two-shot": 5.67, "CoT": 6.21, "Fine-tune": 1.24 }, "GPT-4": { "Two-shot": 5.14, "CoT": 12.94 } } }, "CRUXEval": { "Input Prediction (Pass@1)": { "CodeLlama-7B": 36.6, "CodeLlama-13B": 39.0, "CodeLlama-34B": 46.5, "CodeLlama-7B-Python": 36.3, "CodeLlama-13B-Python": 40.5, "CodeLlama-34B-Python": 41.5, "StarCoderBase-7B": 30.0, "StarCoderBase-15.5B": 31.6, "WizardCoder-13B": 39.2, "WizardCoder-34B": 42.8, "Phi-1": 13.9, "Phi-1.5": 24.1, "Phind v2": 47.9, "DeepSeek-Coder-6.7B-Base": 41.1, "DeepSeek-Coder-33B-Base": 46.6, "DeepSeek-Coder-6.7B-Instruct": 36.6, "DeepSeek-Coder-33B-Instruct": 47.4, "Mistral-7B": 36.0, "GPT-3.5": 49.2, "GPT-4": 67.1 }, "Input Prediction (Pass@5)": { "CodeLlama-7B": 55.2, "CodeLlama-13B": 58.2, "CodeLlama-34B": 64.7, "CodeLlama-7B-Python": 56.0, "CodeLlama-13B-Python": 58.0, "CodeLlama-34B-Python": 59.2, "StarCoderBase-7B": 48.9, "StarCoderBase-15.5B": 49.5, "WizardCoder-13B": 54.8, "WizardCoder-34B": 57.3, "Phi-1": 22.6, "Phi-1.5": 38.9, "Phind v2": 64.9, "DeepSeek-Coder-6.7B-Base": 61.7, "DeepSeek-Coder-33B-Base": 65.1, "DeepSeek-Coder-6.7B-Instruct": 54.4, "DeepSeek-Coder-33B-Instruct": 64.2, "Mistral-7B": 54.2, "GPT-3.5": 66.5, "GPT-4": 76.8 }, "Output Prediction (Pass@1)": { "CodeLlama-7B": 36.4, "CodeLlama-13B": 38.4, "CodeLlama-34B": 41.1, "CodeLlama-7B-Python": 36.4, "CodeLlama-13B-Python": 37.8, 
"CodeLlama-34B-Python": 40.7, "StarCoderBase-7B": 31.1, "StarCoderBase-15.5B": 33.3, "WizardCoder-13B": 37.9, "WizardCoder-34B": 41.2, "Phi-1": 23.3, "Phi-1.5": 27.1, "Phind v2": 38.3, "DeepSeek-Coder-6.7B-Base": 39.8, "DeepSeek-Coder-33B-Base": 43.6, "DeepSeek-Coder-6.7B-Instruct": 41.0, "DeepSeek-Coder-33B-Instruct": 44.0, "Mistral-7B": 31.7, "GPT-3.5": 50.0, "GPT-4": 63.4 }, "Output Prediction (Pass@5)": { "CodeLlama-7B": 49.6, "CodeLlama-13B": 53.2, "CodeLlama-34B": 56.1, "CodeLlama-7B-Python": 49.7, "CodeLlama-13B-Python": 50.8, "CodeLlama-34B-Python": 53.7, "StarCoderBase-7B": 43.8, "StarCoderBase-15.5B": 47.7, "WizardCoder-13B": 51.6, "WizardCoder-34B": 52.2, "Phi-1": 34.0, "Phi-1.5": 39.4, "Phind v2": 49.2, "DeepSeek-Coder-6.7B-Base": 53.9, "DeepSeek-Coder-33B-Base": 57.5, "DeepSeek-Coder-6.7B-Instruct": 52.5, "DeepSeek-Coder-33B-Instruct": 58.0, "Mistral-7B": 48.2, "GPT-3.5": 60.1, "GPT-4": 68.7 } }, "SWE-bench-verified": { "% Resolved": { "Claude 3.7 Sonnet (No extended thinking + scaffolding)": 70.30, "Augment Agent v0": 65.40, "W&B Programmer O1 crosscheck5": 64.60, "AgentScope": 63.40, "Tools + Claude 3.7 Sonnet (2025-02-24)": 63.20, "EPAM AI/Run Developer Agent v20250219 + Anthropic Claude 3.5 Sonnet": 62.80, "CodeStory Midwit Agent + swe-search": 62.20, "OpenHands + 4x Scaled (2024-02-03)": 60.80, "Learn-by-interact": 60.20, "devlo": 58.20, "Emergent E1 (v2024-12-23)": 57.20, "Gru(2024-12-08)": 57.00, "EPAM AI/Run Developer Agent v20241212 + Anthropic Claude 3.5 Sonnet": 55.40, "Amazon Q Developer Agent (v20241202-dev)": 55.00, "Bracket.sh": 53.20, "OpenHands + CodeAct v2.1 (claude-3-5-sonnet-20241022)": 53.00, "Google Jules + Gemini 2.0 Flash (v20241212-experimental)": 52.20, "Engine Labs (2024-11-25)": 51.80, "AutoCodeRover-v2.1 (Claude-3.5-Sonnet-20241022)": 51.60, "Agentless-1.5 + Claude-3.5 Sonnet (20241022)": 50.80, "Solver (2024-10-28)": 50.00, "Bytedance MarsCode Agent": 50.00, "nFactorial (2024-11-05)": 49.20,
"Tools + Claude 3.5 Sonnet (2024-10-22)": 49.00, "Composio SWE-Kit (2024-10-25)": 48.60, "AppMap Navie v2": 47.20, "Emergent E1 (v2024-10-12)": 46.60, "AutoCodeRover-v2.0 (Claude-3.5-Sonnet-20241022)": 46.20, "Solver (2024-09-12)": 45.40, "Gru(2024-08-24)": 45.20, "CodeShellAgent + Gemini 2.0 Flash (Experimental)": 44.20, "Agentless Lite + O3 Mini (20250214)": 42.40, "ugaiforge": 41.60, "nFactorial (2024-10-30)": 41.60, "SWE-RL (Llama3-SWE-RL-70B + Agentless Mini) (20250226)": 41.20, "Nebius AI Qwen 2.5 72B Generator + LLama 3.1 70B Critic": 40.60, "Tools + Claude 3.5 Haiku": 40.60, "Honeycomb": 40.60, "Composio SWEkit + Claude 3.5 Sonnet (2024-10-16)": 40.60, "EPAM AI/Run Developer Agent v20241029 + Anthropic Claude 3.5 Sonnet": 39.60, "Amazon Q Developer Agent (v20240719-dev)": 38.80, "Agentless-1.5 + GPT 4o (2024-05-13)": 38.80, "AutoCodeRover (v20240620) + GPT 4o (2024-05-13)": 38.40, "SWE-agent + Claude 3.5 Sonnet": 33.60, "MASAI + GPT 4o (2024-06-12)": 32.60, "Artemis Agent v1 (2024-11-20)": 32.00, "nFactorial (2024-10-07)": 31.60, "SWE-Fixer (Qwen2.5-7b retriever + Qwen2.5-72b editor) 20241128": 30.20, "Lingma Agent + Lingma SWE-GPT 72b (v0925)": 28.80, "EPAM AI/Run Developer Agent + GPT4o": 27.00, "AppMap Navie + GPT 4o (2024-05-13)": 26.20, "nFactorial (2024-10-01)": 25.80, "Amazon Q Developer Agent (v20240430-dev)": 25.60, "Lingma Agent + Lingma SWE-GPT 72b (v0918)": 25.00, "SWE-agent + GPT 4o (2024-05-13)": 23.20, "SWE-agent + GPT 4 (1106)": 22.40, "SWE-agent + Claude 3 Opus": 18.20, "Lingma Agent + Lingma SWE-GPT 7b (v0925)": 18.20, "Lingma Agent + Lingma SWE-GPT 7b (v0918)": 10.20, "RAG + Claude 3 Opus": 7.00, "RAG + Claude 2": 4.40, "RAG + GPT 4 (1106)": 2.80, "RAG + SWE-Llama 7B": 1.40, "RAG + SWE-Llama 13B": 1.20, "RAG + ChatGPT 3.5": 0.40 } }, "CyberGym": { "% Reproducing Target Vuln.": { "OpenHands + Claude-Sonnet-4": 17.85, "OpenHands + Claude-3.7-Sonnet": 11.94, "OpenHands + GPT-4.1": 9.36, "Cybench + GPT-4.1": 8.96, "Codex + GPT-4.1": 7.37, "ENiGMA + GPT-4.1": 7.23, 
"OpenHands + Gemini-2.5-Flash": 4.84, "OpenHands + DeepSeek-V3": 3.58, "OpenHands + o4-mini": 2.46, "OpenHands + R2E-Gym-32B": 1.99, "OpenHands + Qwen3-235B-A22B": 1.86, "OpenHands + OpenHands-LM-32B": 1.66, "OpenHands + SWE-Gym-32B": 0.07 }, "% Finding Post-Patch Vuln.": { "OpenHands + Claude-Sonnet-4": 1.99, "OpenHands + Claude-3.7-Sonnet": 2.19, "OpenHands + GPT-4.1": 1.26, "Cybench + GPT-4.1": 2.26, "Codex + GPT-4.1": 1.19, "ENiGMA + GPT-4.1": 1.92, "OpenHands + Gemini-2.5-Flash": 0.80, "OpenHands + DeepSeek-V3": 0.66, "OpenHands + o4-mini": 0.07, "OpenHands + R2E-Gym-32B": 0.60, "OpenHands + Qwen3-235B-A22B": 0.33, "OpenHands + OpenHands-LM-32B": 0.33, "OpenHands + SWE-Gym-32B": 0.07 } }, "BountyBench": { "Detect Success Rate": { "Claude Code": 5, "OpenAI Codex CLI": 5, "C-Agent: Claude 3.7": 5, "C-Agent: Gemini 2.5": 2.5, "C-Agent: GPT-4.1": 0 }, "Exploit Success Rate": { "Claude Code": 57.5, "OpenAI Codex CLI": 32.5, "C-Agent: Claude 3.7": 67.5, "C-Agent: Gemini 2.5": 40, "C-Agent: GPT-4.1": 55 }, "Patch Success Rate": { "Claude Code": 87.5, "OpenAI Codex CLI": 90, "C-Agent: Claude 3.7": 60, "C-Agent: Gemini 2.5": 45, "C-Agent: GPT-4.1": 50 } }, "CVE-Bench": { "Zero-day Pass@1": { "T-Agent + GPT-4o (2024-11-20)": 8.0, "AutoGPT + GPT-4o (2024-11-20)": 3.0, "Cy-Agent + GPT-4o (2024-11-20)": 1.0 }, "Zero-day Pass@5": { "T-Agent + GPT-4o (2024-11-20)": 10.0, "AutoGPT + GPT-4o (2024-11-20)": 10.0, "Cy-Agent + GPT-4o (2024-11-20)": 2.5 }, "One-day Pass@1": { "T-Agent + GPT-4o (2024-11-20)": 7.0, "AutoGPT + GPT-4o (2024-11-20)": 4.5, "Cy-Agent + GPT-4o (2024-11-20)": 2.5 }, "One-day Pass@5": { "T-Agent + GPT-4o (2024-11-20)": 12.5, "AutoGPT + GPT-4o (2024-11-20)": 5.0, "Cy-Agent + GPT-4o (2024-11-20)": 2.5 } } } }