Spaces:

abhinavthedev
/

sql-debug

Sleeping

sql-debug / tasks /task_hard.py

Upload folder using huggingface_hub

5db060f verified 29 days ago

1.78 kB

	# tasks/task_hard.py
	import random

	def generate_schema(n_rows=5000, seed=42):
	"""Generates schema + INSERT statements for n_rows transactions."""
	rng = random.Random(seed)
	# statuses = ['completed', 'pending', 'failed']
	inserts = []
	for i in range(1, n_rows + 1):
	user_id = rng.randint(1, 100)
	amount = round(rng.uniform(10, 1000), 2)
	# status = rng.choice(statuses)
	inserts.append(f"INSERT INTO transactions VALUES ({i}, {user_id}, {amount}, 'completed');")
	return (
	"CREATE TABLE transactions (id INTEGER, user_id INTEGER, amount REAL, ts TEXT, status TEXT);\n"
	+ "\n".join(inserts[:200]) # Keep it fast for demo (200 rows)
	)

	TASK = {
	"task_id": "optimize_001",
	"difficulty": "hard",
	"max_steps": 10,

	"schema_sql": generate_schema(200), # Use 200 rows for speed in hackathon

	# Slow: correlated subquery — runs inner SELECT once per outer row
	"broken_query": """
	SELECT *
	FROM transactions t1
	WHERE amount > (
	SELECT AVG(amount)
	FROM transactions t2
	WHERE t2.user_id = t1.user_id
	)
	AND t1.status = 'completed'
	""",

	"target_description": (
	"Return all completed transactions where the amount exceeds that user's average. "
	"Optimize it — avoid correlated subqueries. Use a CTE or subquery with GROUP BY."
	),

	# For hard task we grade differently — no fixed expected_rows
	"expected_rows": None,

	# We check that the query plan is efficient (no per-row correlated scans)
	"check_plan": True,

	# Keywords we look for in the agent's solution
	"good_patterns": ["WITH", "GROUP BY", "AVG("],
	}