# tasks/task_hard.py import random def generate_schema(n_rows=5000, seed=42): """Generates schema + INSERT statements for n_rows transactions.""" rng = random.Random(seed) # statuses = ['completed', 'pending', 'failed'] inserts = [] for i in range(1, n_rows + 1): user_id = rng.randint(1, 100) amount = round(rng.uniform(10, 1000), 2) # status = rng.choice(statuses) inserts.append(f"INSERT INTO transactions VALUES ({i}, {user_id}, {amount}, 'completed');") return ( "CREATE TABLE transactions (id INTEGER, user_id INTEGER, amount REAL, ts TEXT, status TEXT);\n" + "\n".join(inserts[:200]) # Keep it fast for demo (200 rows) ) TASK = { "task_id": "optimize_001", "difficulty": "hard", "max_steps": 10, "schema_sql": generate_schema(200), # Use 200 rows for speed in hackathon # Slow: correlated subquery — runs inner SELECT once per outer row "broken_query": """ SELECT * FROM transactions t1 WHERE amount > ( SELECT AVG(amount) FROM transactions t2 WHERE t2.user_id = t1.user_id ) AND t1.status = 'completed' """, "target_description": ( "Return all completed transactions where the amount exceeds that user's average. " "Optimize it — avoid correlated subqueries. Use a CTE or subquery with GROUP BY." ), # For hard task we grade differently — no fixed expected_rows "expected_rows": None, # We check that the query plan is efficient (no per-row correlated scans) "check_plan": True, # Keywords we look for in the agent's solution "good_patterns": ["WITH", "GROUP BY", "AVG("], }