from tavily import TavilyClient
import subprocess, tempfile, time, os
from pathlib import Path
from typing import Dict, Any, List, Literal, Optional
import shutil, zipfile
from uuid import uuid4
from smolagents import tool, CodeAgent, InferenceClientModel, ToolCallingAgent
# For Data Analysis Agent
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
import joblib
import io
# Initialize Tavily client for web search
tavily_client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])
os.environ["HF_TOKEN"] = ""
# ----------------- Tools -----------------
@tool
def internet_search(
query: str,
max_results: int = 5,
topic: Literal["general", "news", "finance","science","technology","economy"] = "general",
include_raw_content: bool = False,
) -> Dict[str, Any]:
"""
Tool to perform an internet search using the Tavily API.
This tool allows the agent to gather information from the web
based on a query and a specified topic. It returns a list of
search results, optionally including the raw content of the
webpages.
Args:
query (str): The search query or keywords to look up on the web.
max_results (int, optional): Maximum number of search results to return.
Defaults to 5.
topic (Literal["general", "news", "finance", "science", "technology", "economy"], optional):
Category of the search to prioritize relevant content. Defaults to "general".
include_raw_content (bool, optional): If True, include the full raw content of the results;
otherwise, only metadata is returned. Defaults to False.
    Returns:
        Dict[str, Any]: The raw Tavily search response; its "results" key holds the list of hits,
            each containing fields such as title, URL, snippet, and optionally raw content.
"""
result1 = tavily_client.search(
query,
max_results=max_results,
include_raw_content=include_raw_content,
topic=topic,
)
return result1
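# Example usage (illustrative sketch; assumes a valid Tavily API key is configured in the environment):
#   hits = internet_search("latest LLM agent frameworks", max_results=3, topic="technology")
#   for item in hits.get("results", []):
#       print(item["title"], item["url"])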
@tool
def code_executor(
image: str,
cmds: List[str],
mounts: Dict[str, str] = None,
host_workspace: str = None,
container_workdir: str = "/workspace",
timeout: int = 60,
allow_network: bool = False,
) -> Dict[str, Any]:
"""
Executes a sequence of shell commands inside a Docker container.
This tool allows safe and isolated execution of code or scripts
using a specified Docker image. It supports mounting host directories,
custom working directories, timeout handling, and optional network access.
Args:
image (str): The Docker image to use for execution (e.g., "python:3.11-slim").
cmds (List[str]): A list of shell commands to run inside the container.
mounts (Dict[str, str], optional): Dictionary mapping host paths to container paths
for volume mounting. Defaults to None.
host_workspace (str, optional): Path on the host machine to use as workspace.
If None, a temporary directory is created. Defaults to None.
container_workdir (str, optional): Working directory inside the container. Defaults to "/workspace".
timeout (int, optional): Maximum execution time in seconds before terminating the process. Defaults to 60.
allow_network (bool, optional): Whether to allow network access inside the container.
Defaults to False (safe default).
Returns:
Dict[str, Any]: A dictionary containing execution results:
- stdout (str): Standard output from the container.
- stderr (str): Standard error output.
- exit_code (int): Exit code of the executed commands.
- runtime_s (float): Execution time in seconds.
- files (List[str]): List of files created in the host workspace (relative paths).
- host_workspace (str): Path to the host workspace used for execution.
Notes:
- Ensures that the host workspace is always mounted to the container.
- Normalizes Windows paths for Docker volume mounting.
- Safely handles subprocess timeouts and captures output.
"""
if host_workspace is None:
host_workspace = tempfile.mkdtemp(prefix="mini_manus_ws_")
# Ensure mounts include host_workspace -> container_workdir
mounts = dict(mounts or {})
if host_workspace not in mounts:
mounts[host_workspace] = container_workdir
docker_cmd = ["docker", "run", "--rm", "--memory", "512m", "--cpus", "1"]
if not allow_network:
docker_cmd += ["--network", "none"]
# Normalize Windows backslashes -> forward slashes for docker -v on some setups
def _norm(p: str) -> str:
return p.replace("\\", "/")
for host, cont in mounts.items():
docker_cmd += ["-v", f"{_norm(host)}:{cont}"]
docker_cmd += ["-w", container_workdir, image]
joined = " && ".join(cmds) if cmds else "echo 'No commands provided'"
docker_cmd += ["sh", "-lc", joined]
start = time.time()
try:
proc = subprocess.run(docker_cmd, capture_output=True, text=True, timeout=timeout)
runtime = time.time() - start
# Gather files from the host workspace (NOT container path)
files = []
try:
for p in Path(host_workspace).rglob("*"):
if p.is_file():
files.append(str(p.relative_to(host_workspace)))
except Exception:
files = []
return {
"stdout": proc.stdout,
"stderr": proc.stderr,
"exit_code": proc.returncode,
"runtime_s": round(runtime, 3),
"files": files,
"host_workspace": host_workspace,
}
except subprocess.TimeoutExpired as te:
return {
"stdout": te.stdout or "",
"stderr": (te.stderr or "") + f"\n[Timed out after {timeout}s]",
"exit_code": -1,
"runtime_s": round(time.time() - start, 3),
"files": [],
"host_workspace": host_workspace,
}
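# Example usage (illustrative sketch; requires a running Docker daemon and a pullable image):
#   res = code_executor(
#       image="python:3.11-slim",
#       cmds=["python -c 'print(2 + 2)'"],
#       timeout=30,
#   )
#   print(res["exit_code"], res["stdout"])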
@tool
def save_files(manifest_files: List[Dict[str,str]], workspace: str = None) -> str:
"""
Saves a list of files to a host workspace directory.
This tool creates the specified files with their content on the host system.
Each file is defined by a dictionary containing a relative path and content.
If no workspace path is provided, a temporary directory is created automatically.
Args:
manifest_files (List[Dict[str, str]]): A list of file descriptors,
where each descriptor is a dictionary with:
- "path" (str): Relative file path (e.g., "app.py" or "src/module.py").
- "content" (str): The content to write into the file.
workspace (str, optional): Path to the host directory where files should be saved.
If None, a temporary directory is created. Defaults to None.
Returns:
str: The path to the host workspace directory where the files were saved.
Notes:
- Automatically creates parent directories if they do not exist.
- Overwrites files if they already exist at the same path.
- Useful for preparing workspaces for code execution in sandboxed environments.
"""
if workspace is None:
workspace = tempfile.mkdtemp(prefix="mini_manus_ws_")
ws = Path(workspace)
ws.mkdir(parents=True, exist_ok=True)
for f in manifest_files:
p = ws / f["path"]
p.parent.mkdir(parents=True, exist_ok=True)
p.write_text(f["content"], encoding="utf-8")
return str(ws)
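# Example usage (illustrative sketch; the file paths and contents below are placeholders):
#   ws = save_files([
#       {"path": "app.py", "content": "print('hello')"},
#       {"path": "requirements.txt", "content": "requests\n"},
#   ])
#   # ws is the host directory that now contains app.py and requirements.txt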
# 2) List files in a workspace (relative)
@tool
def list_workspace_files(workspace: str) -> List[str]:
"""
Recursively list all files in a given workspace directory.
This tool traverses the workspace directory and collects all file paths,
returning them relative to the workspace root. It is useful for inspecting
the contents of a workspace, packaging artifacts, or tracking generated files.
Args:
workspace (str): Path to the workspace directory to list.
Returns:
List[str]: A list of file paths relative to the workspace root.
Notes:
- Only files are included; directories themselves are ignored.
- If the workspace path is invalid or an error occurs during traversal,
an empty list is returned.
- Paths are returned as strings using forward slashes.
"""
files = []
try:
for p in Path(workspace).rglob("*"):
if p.is_file():
files.append(str(p.relative_to(workspace)))
except Exception:
pass
return files
# 3) Package artifact (zip) and return path
@tool
def package_artifact(workspace: str, out_dir: str = None) -> str:
"""
Package the contents of a workspace directory into a ZIP archive.
This tool collects all files within a given workspace and compresses
them into a single ZIP file, which can be used as an artifact for
deployment, sharing, or backup purposes.
Args:
workspace (str): Path to the workspace directory to package.
out_dir (str, optional): Directory to save the generated ZIP file.
If None, a temporary directory will be created.
Returns:
str: Absolute file path of the created ZIP archive.
Notes:
- Only files are included in the ZIP archive; directories themselves
are not stored.
- The ZIP filename is automatically generated using a UUID to ensure
uniqueness.
- If `out_dir` does not exist, it will be created.
- Useful for packaging code, data, or other artifacts generated
during automated workflows.
"""
if out_dir is None:
out_dir = tempfile.mkdtemp(prefix="mini_manus_artifacts_")
Path(out_dir).mkdir(parents=True, exist_ok=True)
zip_name = Path(out_dir) / f"artifact_{uuid4().hex}.zip"
with zipfile.ZipFile(zip_name, "w", zipfile.ZIP_DEFLATED) as z:
for p in Path(workspace).rglob("*"):
if p.is_file():
z.write(p, p.relative_to(workspace))
return str(zip_name)
# 4) Cleanup workspace
@tool
def cleanup_workspace(workspace: str, keep: bool = False) -> None:
"""
Safely removes a workspace directory and all its contents.
This tool is used to clean up temporary directories created during
code execution, testing, or file manipulation. It ensures that the
workspace is deleted unless explicitly preserved.
Args:
workspace (str): Path to the workspace directory to delete.
keep (bool, optional): If True, the workspace will not be deleted.
Defaults to False.
Returns:
None
Notes:
- Any errors during deletion (e.g., non-existent directory, permission issues)
are silently ignored.
- Use `keep=True` to preserve the workspace, for example, when artifacts
need to be inspected after execution.
- Intended for host-side cleanup of temporary directories used in containerized
or local code execution workflows.
"""
if keep:
return
try:
shutil.rmtree(workspace)
except Exception:
pass
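# Example usage (illustrative sketch; chains the workspace helpers above on a workspace path ws):
#   created = list_workspace_files(ws)   # relative paths inside the workspace
#   zip_path = package_artifact(ws)      # ZIP archive of everything in ws
#   cleanup_workspace(ws, keep=False)    # delete ws once the artifact is saved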
# 5) Run a manifest end-to-end using your code_executor (uses Docker image + run_commands)
@tool
def run_manifest(manifest: Dict[str, Any], base_image: str = "python:3.11-slim", timeout: int = 120, keep_workspace: bool = False) -> Dict[str, Any]:
"""
Executes a manifest of files and commands inside a Docker container and optionally packages the workspace.
This tool automates the process of:
1. Saving provided files to a host workspace.
2. Installing dependencies (if a `requirements.txt` is present or if `install_libs` is specified).
3. Running commands and optional test commands inside a Docker container.
- Commands referencing workspace files are automatically adjusted to point to the container workspace.
4. Collecting outputs, listing files, and optionally packaging the workspace into a ZIP artifact.
5. Cleaning up the workspace unless `keep_workspace=True`.
Args:
manifest (Dict[str, Any]): A dictionary describing the manifest, with the following keys:
- "files" (List[Dict[str,str]]): List of files to save, each with "path" and "content".
- "run_commands" (List[str], optional): Commands to execute inside the container.
- "test_command" (str, optional): A command for testing/verifying the execution.
- "install_libs" (List[str], optional): A list of Python packages to install dynamically
(e.g., ["crewai", "transformers"]). Installed before any run/test commands.
base_image (str, optional): Docker image to use for execution. Defaults to "python:3.11-slim".
timeout (int, optional): Maximum time in seconds for container execution. Defaults to 120.
keep_workspace (bool, optional): If True, preserves the host workspace after execution. Defaults to False.
Returns:
Dict[str, Any]: A dictionary containing execution results and metadata:
- "stdout" (str): Standard output from the execution.
- "stderr" (str): Standard error from the execution.
- "exit_code" (int): Exit code of the executed commands.
- "runtime_s" (float): Total runtime in seconds.
- "files" (List[str]): List of files present in the workspace after execution.
- "artifact" (str or None): Path to a ZIP file of the workspace, if packaging succeeded.
- "workspace" (str): Path to the host workspace.
Notes:
- If `requirements.txt` exists, dependencies are installed automatically inside the container.
- If `install_libs` is provided, those packages are installed dynamically via pip.
- Commands that reference workspace files are automatically adjusted to point to the container workspace.
- Network access is enabled briefly during dependency installation.
- Commands are executed sequentially inside the container.
- Workspace cleanup is automatic unless `keep_workspace=True`.
- Useful for safely running and testing code in isolated, reproducible environments.
"""
files = manifest.get("files", [])
run_cmds = manifest.get("run_commands", [])
test_cmd = manifest.get("test_command")
    install_libs = manifest.get("install_libs", [])  # NEW: optional dynamic installs
host_workspace = save_files(files) # this returns a host path
# Map host workspace -> container path
mounts = {host_workspace: "/workspace"}
# Pre-install step if requirements.txt exists
install_cmds = []
if install_libs:
# install arbitrary packages inside container
libs = " ".join(install_libs)
install_cmds.append(f"pip install {libs}")
if (Path(host_workspace) / "requirements.txt").exists():
install_cmds.append("pip install -r requirements.txt")
    # NEW: rewrite "python <script>" commands so the script path points at the container workspace
    def fix_file_paths(cmds: List[str]) -> List[str]:
        fixed = []
        for c in cmds:
            parts = c.split()
            if parts and parts[0] == "python" and len(parts) > 1 and not parts[1].startswith(("-", "/")):
                parts[1] = f"/workspace/{parts[1]}"
            fixed.append(" ".join(parts))
        return fixed
# Build the full command sequence (run installs first if present)
run_cmds = fix_file_paths(run_cmds)
if test_cmd:
test_cmd = fix_file_paths([test_cmd])[0]
# Build full command list
cmds = install_cmds + [f"cd /workspace && {c}" for c in run_cmds]
if test_cmd:
cmds.append(f"cd /workspace && {test_cmd}")
if not cmds:
cmds = ["cd /workspace && echo 'No commands provided'"]
# If we're installing requirements, allow network briefly (set allow_network=True)
allow_network = bool(install_cmds)
exec_res = code_executor(
image=base_image,
cmds=cmds,
mounts=mounts,
host_workspace=host_workspace,
container_workdir="/workspace",
timeout=timeout,
allow_network=allow_network,
)
# gather host-side file list (relative)
files_list = list_workspace_files(host_workspace)
# package artifact (optional)
artifact = None
try:
artifact = package_artifact(host_workspace)
except Exception:
artifact = None
result = {
"stdout": exec_res.get("stdout", ""),
"stderr": exec_res.get("stderr", ""),
"exit_code": exec_res.get("exit_code", 1),
"runtime_s": exec_res.get("runtime_s", None),
"files": files_list,
"artifact": artifact,
"workspace": host_workspace,
}
# decide whether to cleanup workspace
cleanup_workspace(host_workspace, keep=keep_workspace)
return result
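# Example manifest (illustrative sketch; file contents, commands, and libraries are placeholders):
#   demo_manifest = {
#       "files": [{"path": "main.py", "content": "print('hello from the sandbox')"}],
#       "run_commands": ["python main.py"],
#       "install_libs": [],  # e.g. ["requests"] to pip-install inside the container
#   }
#   out = run_manifest(demo_manifest, base_image="python:3.11-slim", timeout=120)
#   print(out["exit_code"], out["stdout"])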
def detect_target_column(df: pd.DataFrame) -> Optional[str]:
"""
Heuristically detect the most likely target column based on naming, cardinality, and type.
"""
if df.empty or len(df.columns) < 2:
return None
scores = {}
for col in df.columns:
score = 0.0
name_lower = col.lower()
# Rule 1: Name matches common target keywords
keywords = ["target", "label", "class", "outcome", "result", "y", "output", "flag", "status", "churn", "survived", "price", "sale"]
if any(kw in name_lower for kw in keywords):
score += 3.0
if name_lower in ["target", "label", "class", "y"]:
score += 2.0
        # Rule 2: Binary or low-cardinality categorical → likely classification
nunique = df[col].nunique()
total = len(df)
unique_ratio = nunique / total
if nunique == 2 and df[col].dtype in ["int64", "object", "category"]:
score += 4.0 # Strong signal
elif nunique <= 20 and df[col].dtype in ["int64", "object", "category"]:
score += 3.0
        # Rule 3: High unique ratio + numeric → likely regression target
if unique_ratio > 0.8 and df[col].dtype in ["int64", "float64"]:
score += 2.5
# Rule 4: Avoid ID-like or high-cardinality text
id_keywords = ["id", "name", "email", "phone", "address", "username", "url", "link"]
if any(kw in name_lower for kw in id_keywords):
score -= 10.0
if nunique == total and df[col].dtype == "object":
score -= 10.0 # Likely unique identifier
scores[col] = score
# Return best candidate if score > 0
best_col = max(scores, key=scores.get)
return best_col if scores[best_col] > 0 else None
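# Example (illustrative sketch): on a toy frame the binary "churn" column should score highest,
# while the ID-like column is penalized.
#   toy = pd.DataFrame({
#       "customer_id": range(100),
#       "age": [25 + i % 40 for i in range(100)],
#       "churn": [i % 2 for i in range(100)],
#   })
#   print(detect_target_column(toy))  # expected: "churn"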
# ────────────────────────────────
# Tool 1: LoadData
# ────────────────────────────────
@tool
def LoadData(filepath: str) -> dict:
"""
Loads data from a CSV file and returns it as a dictionary.
Args:
filepath (str): Path to the CSV file.
Returns:
dict: Data as dictionary (from DataFrame.to_dict()).
"""
df = pd.read_csv(filepath)
return df.to_dict()
# ────────────────────────────────
# Tool 2: CleanData (Enhanced)
# ────────────────────────────────
@tool
def CleanData(data: dict, handle_outliers: bool = True, impute_strategy: str = "median_mode") -> pd.DataFrame:
"""
Cleans dataset with smart imputation, encoding, and optional outlier removal.
Args:
data (dict): Dataset in dictionary format.
handle_outliers (bool): Whether to remove outliers using IQR.
impute_strategy (str): "median_mode" or "mean_mode"
Returns:
pd.DataFrame: Cleaned dataset.
"""
df = pd.DataFrame.from_dict(data)
# Drop duplicates
df = df.drop_duplicates().reset_index(drop=True)
# Handle missing values
for col in df.columns:
if df[col].dtype in ["int64", "float64"]:
if impute_strategy == "median_mode" or df[col].skew() > 1:
fill_val = df[col].median()
else:
fill_val = df[col].mean()
df[col] = df[col].fillna(fill_val)
else:
mode = df[col].mode()
fill_val = mode[0] if len(mode) > 0 else "Unknown"
df[col] = df[col].fillna(fill_val)
# Parse datetime
for col in df.columns:
if "date" in col.lower() or "time" in col.lower():
try:
                df[col] = pd.to_datetime(df[col], errors="coerce")
            except Exception:
pass
# Encode categorical variables (only if not too many unique values)
for col in df.select_dtypes(include="object").columns:
if df[col].nunique() / len(df) < 0.5:
df[col] = df[col].astype("category").cat.codes
# else: leave as object (e.g., free text)
# Outlier removal (optional)
if handle_outliers:
for col in df.select_dtypes(include=["float64", "int64"]).columns:
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
count_before = len(df)
df = df[(df[col] >= lower) & (df[col] <= upper)]
if len(df) == 0:
# Avoid empty df
df = pd.DataFrame.from_dict(data) # Revert
break
return df.reset_index(drop=True)
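# Example usage (illustrative sketch; "data.csv" is a hypothetical file path):
#   raw = LoadData("data.csv")
#   cleaned = CleanData(raw, handle_outliers=True, impute_strategy="median_mode")
#   print(cleaned.shape, cleaned.dtypes)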
# ────────────────────────────────
# Tool 3: EDA (Enhanced)
# ────────────────────────────────
@tool
def EDA(data: dict, max_cat_plots: int = 3, max_num_plots: int = 3) -> dict:
"""
Performs advanced EDA with smart visualizations and insights.
Args:
data (dict): Dataset in dictionary format.
max_cat_plots (int): Max number of categorical distribution plots.
max_num_plots (int): Max number of numeric vs target plots.
Returns:
dict: EDA results including text, plots, and recommendations.
"""
df = pd.DataFrame.from_dict(data)
results = {}
# 1. Summary Stats
results["summary"] = df.describe(include="all").to_string()
# 2. Missing Values
missing = df.isnull().sum()
results["missing_values"] = missing[missing > 0].to_dict()
# Missingness heatmap
if missing.sum() > 0:
plt.figure(figsize=(8, 4))
sns.heatmap(df.isnull(), cbar=True, cmap="viridis", yticklabels=False)
buf = io.BytesIO()
plt.savefig(buf, format="png", bbox_inches="tight")
plt.close()
buf.seek(0)
img = Image.open(buf)
results["missingness_plot"] = img #buf
# 3. Correlation Heatmap
corr = df.corr(numeric_only=True)
if not corr.empty and len(corr.columns) > 1:
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True)
buf = io.BytesIO()
plt.savefig(buf, format="png", bbox_inches="tight")
plt.close()
buf.seek(0)
img = Image.open(buf)
results["correlation_plot"] = img #buf
# Top 5 absolute correlations
unstacked = corr.abs().unstack()
unstacked = unstacked[unstacked < 1.0]
top_corr = unstacked.sort_values(ascending=False).head(5).to_dict()
results["top_correlations"] = top_corr
# 4. Skewness & Kurtosis
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
skew_kurt = {}
for col in numeric_cols:
skew_kurt[col] = {"skew": df[col].skew(), "kurtosis": df[col].kurtosis()}
results["skew_kurtosis"] = skew_kurt
# 5. Numeric Distributions
if len(numeric_cols) > 0:
df[numeric_cols].hist(bins=20, figsize=(12, 8), layout=(2, -1))
buf = io.BytesIO()
plt.savefig(buf, format="png", bbox_inches="tight")
plt.close()
buf.seek(0)
img = Image.open(buf)
results["numeric_distributions"] = img #buf
# 6. Categorical Distributions
cat_cols = df.select_dtypes(include=["object", "category"]).columns
for col in cat_cols[:max_cat_plots]:
plt.figure(figsize=(6, 4))
top_vals = df[col].value_counts().head(10)
sns.barplot(x=top_vals.index, y=top_vals.values)
plt.xticks(rotation=45)
buf = io.BytesIO()
plt.savefig(buf, format="png", bbox_inches="tight")
plt.close()
buf.seek(0)
img = Image.open(buf)
results[f"dist_{col}"] = img #buf
# 7. Target Relationships
target_col = detect_target_column(df)
if target_col:
results["detected_target"] = target_col
for col in numeric_cols[:max_num_plots]:
plt.figure(figsize=(6, 4))
if df[target_col].nunique() <= 20:
sns.boxplot(data=df, x=target_col, y=col)
else:
sns.scatterplot(data=df, x=col, y=target_col)
buf = io.BytesIO()
plt.savefig(buf, format="png", bbox_inches="tight")
plt.close()
buf.seek(0)
img = Image.open(buf)
results[f"{col}_vs_{target_col}"] = img #buf
# 8. Recommendations
recs = []
for col, sk in skew_kurt.items():
if abs(sk["skew"]) > 1:
recs.append(f"Feature '{col}' is skewed ({sk['skew']:.2f}) β consider log transform.")
if results["missing_values"]:
recs.append("Missing data detected β consider KNN or iterative imputation.")
if results.get("top_correlations"):
recs.append("High correlations found β consider PCA or feature selection.")
if target_col:
recs.append(f"Target variable '{target_col}' detected automatically.")
results["recommendations"] = recs
return results
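# Example usage (illustrative sketch; reuses the `cleaned` frame from the CleanData example above):
#   eda_out = EDA(cleaned.to_dict(), max_cat_plots=2, max_num_plots=2)
#   print(eda_out["recommendations"])
#   # plot entries such as eda_out.get("correlation_plot") are PIL images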
# ────────────────────────────────
# Tool 4: AutoML (Enhanced)
# ────────────────────────────────
@tool
def AutoML(data: dict, task_hint: str = None) -> dict:
"""
Enhanced AutoML with multiple models and robust evaluation.
Args:
data (dict): Cleaned dataset.
task_hint (str): "classification", "regression", or None.
Returns:
dict: Model results and metrics.
"""
df = pd.DataFrame.from_dict(data)
results = {}
target_col = detect_target_column(df)
if not target_col:
results["note"] = "No target column detected. Check column names and data."
return results
X = df.drop(columns=[target_col])
y = df[target_col]
# One-hot encode X
X = pd.get_dummies(X, drop_first=True)
if X.shape[1] == 0:
results["error"] = "No valid features after encoding."
return results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Detect task
if task_hint:
task = task_hint
elif y.dtype in ["object", "category"] or y.nunique() <= 20:
task = "classification"
else:
task = "regression"
    best_model = None  # Track the best estimator across both branches so it can be saved below
    try:
if task == "classification":
models = {
"RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
"LogisticRegression": LogisticRegression(max_iter=1000, random_state=42)
}
results["task"] = "classification"
best_acc = 0
for name, model in models.items():
model.fit(X_train, y_train)
preds = model.predict(X_test)
acc = accuracy_score(y_test, preds)
                if acc > best_acc:
                    best_acc = acc
                    best_model = model  # Keep best model for saving below
                    results["accuracy"] = acc
                    results["best_model"] = name
                    results["report"] = classification_report(y_test, preds, zero_division=0)
                    if hasattr(model, "feature_importances_"):
                        results["feature_importance"] = dict(zip(X.columns, model.feature_importances_))
else:
models = {
"RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
"LinearRegression": LinearRegression()
}
results["task"] = "regression"
best_r2 = -float("inf")
for name, model in models.items():
model.fit(X_train, y_train)
preds = model.predict(X_test)
r2 = r2_score(y_test, preds)
if r2 > best_r2:
best_r2 = r2
results["r2_score"] = r2
results["mse"] = mean_squared_error(y_test, preds)
results["best_model"] = name
best_model = model # Keep best model
if hasattr(model, "feature_importances_"):
results["feature_importance"] = dict(zip(X.columns, model.feature_importances_))
        # Save the best model to a temporary file
model_dir = tempfile.mkdtemp()
model_path = os.path.join(model_dir, f"trained_model_{task}.pkl")
joblib.dump({
"model": best_model,
"task": task,
"target_column": target_col,
"features": X.columns.tolist()
}, model_path)
results["model_download_path"] = model_path
results["model_info"] = f"Best model: {results['best_model']} | Task: {task} | Target: {target_col}"
except Exception as e:
results["error"] = f"Model training failed: {str(e)}"
return results
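# Example usage (illustrative sketch; reuses `cleaned` and loads the persisted model bundle back):
#   automl_out = AutoML(cleaned.to_dict())
#   if "model_download_path" in automl_out:
#       bundle = joblib.load(automl_out["model_download_path"])
#       print(bundle["task"], bundle["target_column"], bundle["features"][:5])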
model = InferenceClientModel(
model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
token=os.environ["HF_TOKEN"],
provider="together",
max_tokens=8048
)
planner = ToolCallingAgent(
tools=[],
model=model,
name="PlannerAgent",
max_steps=10,
planning_interval=5,
description= "Breaks down complex tasks and orchestrates tools for execution",
)
# Research agent
researcher = ToolCallingAgent(
tools=[internet_search],
model=model,
name="ResearchAgent",
max_steps=10,
description = "Conducts deep research using internet_search",
)
# Coding agent
coder = CodeAgent(
tools=[
code_executor,
save_files,
list_workspace_files,
package_artifact,
cleanup_workspace,
run_manifest,
],
model=model,
name="CodingAgent",
max_steps=20,
additional_authorized_imports=[
"subprocess", "tempfile", "time", "os", "pathlib", "typing","shutil", "zipfile","uuid"
],
description = "Executes Python code safely in a sandboxed Docker container."
"If a library is missing, add it to install_libs in run_manifest."
)
analyst = CodeAgent(
tools=[LoadData, CleanData, EDA, AutoML],
model=model,
max_steps=20,
name="DataScienceAgent",
additional_authorized_imports=[
"pandas", "matplotlib.pyplot", "seaborn", "PIL", "sklearn", "io", "os","joblib","tempfile"
],
description = "Loads datasets, cleans and preprocesses data, performs exploratory data analysis (EDA) with visualizations, and builds predictive models when a target variable is specified."
)
manager_agent = ToolCallingAgent(
tools=[],
model=model,
managed_agents=[planner, researcher, coder, analyst],
max_steps=20,
description= "Routes user queries to the right agent (Planner, Researcher, Coder or Data Scientist) and assembles results",
)
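# Minimal entry point (illustrative sketch; the task below is only an example prompt, and running it
# requires valid HF_TOKEN / Tavily credentials plus a local Docker daemon for the coding tools).
if __name__ == "__main__":
    demo_task = "Research the top three Python web frameworks and summarize their trade-offs."
    print(manager_agent.run(demo_task))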