from tavily import TavilyClient 
import subprocess, tempfile, time, os
from pathlib import Path
from typing import Dict, Any, List, Literal
import shutil, zipfile
from uuid import uuid4
from smolagents import tool, CodeAgent, InferenceClientModel, ToolCallingAgent
# For Data Analysis Agent
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
import joblib
import io



# Initialize the Tavily client for web search. The API key (and the HF_TOKEN used
# for the model further below) should come from environment variables rather than
# being hardcoded in source.
tavily_client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])

# ----------------- Tools -----------------
@tool
def internet_search(
    query: str,
    max_results: int = 5,
    topic: Literal["general", "news", "finance", "science", "technology", "economy"] = "general",
    include_raw_content: bool = False,
) -> List[Dict[str, Any]]:
    """
    Perform an internet search using the Tavily API.

    This tool lets the agent gather information from the web based on a query
    and a topic. It returns a list of search results, optionally including the
    raw content of the matched webpages.

    Args:
        query (str): The search query or keywords to look up on the web.
        max_results (int, optional): Maximum number of search results to return. Defaults to 5.
        topic (Literal["general", "news", "finance", "science", "technology", "economy"], optional):
            Category of the search to prioritize relevant content. Defaults to "general".
        include_raw_content (bool, optional): If True, include the full raw content of the results;
            otherwise only metadata is returned. Defaults to False.

    Returns:
        List[Dict[str, Any]]: Search results from Tavily, each containing information
            such as title, URL, snippet, and optionally raw content.
    """

    result1 = tavily_client.search(
        query,
        max_results=max_results,
        include_raw_content=include_raw_content,
        topic=topic,
    )
    return result1
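
# Illustrative, not called anywhere: a minimal sketch of exercising the search tool
# directly (assumes a valid TAVILY_API_KEY; the Tavily client typically returns a dict
# whose "results" key holds the individual hits, so adjust if your client version differs).
def _example_internet_search() -> None:
    response = internet_search(query="latest pandas release", max_results=3, topic="technology")
    for hit in response.get("results", []):
        print(hit.get("title"), "-", hit.get("url"))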

@tool
def code_executor(
    image: str,
    cmds: List[str],
    mounts: Dict[str, str] = None,
    host_workspace: str = None,
    container_workdir: str = "/workspace",
    timeout: int = 60,
    allow_network: bool = False,
) -> Dict[str, Any]:
    """
    Execute a sequence of shell commands inside a Docker container.

    This tool allows safe, isolated execution of code or scripts using a
    specified Docker image. It supports mounting host directories, custom
    working directories, timeout handling, and optional network access.

    Args:
        image (str): The Docker image to use for execution (e.g., "python:3.11-slim").
        cmds (List[str]): A list of shell commands to run inside the container.
        mounts (Dict[str, str], optional): Mapping of host paths to container paths for
            volume mounting. Defaults to None.
        host_workspace (str, optional): Path on the host machine to use as the workspace.
            If None, a temporary directory is created. Defaults to None.
        container_workdir (str, optional): Working directory inside the container. Defaults to "/workspace".
        timeout (int, optional): Maximum execution time in seconds before the process is terminated. Defaults to 60.
        allow_network (bool, optional): Whether to allow network access inside the container.
            Defaults to False (safe default).

    Returns:
        Dict[str, Any]: Execution results:
            - stdout (str): Standard output from the container.
            - stderr (str): Standard error output.
            - exit_code (int): Exit code of the executed commands.
            - runtime_s (float): Execution time in seconds.
            - files (List[str]): Files created in the host workspace (relative paths).
            - host_workspace (str): Path to the host workspace used for execution.

    Notes:
        - Ensures the host workspace is always mounted into the container.
        - Normalizes Windows paths for Docker volume mounting.
        - Safely handles subprocess timeouts and captures output.
    """

    if host_workspace is None:
        host_workspace = tempfile.mkdtemp(prefix="mini_manus_ws_")
    # Ensure mounts include host_workspace -> container_workdir
    mounts = dict(mounts or {})
    if host_workspace not in mounts:
        mounts[host_workspace] = container_workdir

    docker_cmd = ["docker", "run", "--rm", "--memory", "512m", "--cpus", "1"]
    if not allow_network:
        docker_cmd += ["--network", "none"]

    # Normalize Windows backslashes -> forward slashes for docker -v on some setups
    def _norm(p: str) -> str:
        return p.replace("\\", "/")

    for host, cont in mounts.items():
        docker_cmd += ["-v", f"{_norm(host)}:{cont}"]

    docker_cmd += ["-w", container_workdir, image]
    joined = " && ".join(cmds) if cmds else "echo 'No commands provided'"
    docker_cmd += ["sh", "-lc", joined]

    start = time.time()
    try:
        proc = subprocess.run(docker_cmd, capture_output=True, text=True, timeout=timeout)
        runtime = time.time() - start

        # Gather files from the host workspace (NOT container path)
        files = []
        try:
            for p in Path(host_workspace).rglob("*"):
                if p.is_file():
                    files.append(str(p.relative_to(host_workspace)))
        except Exception:
            files = []

        return {
            "stdout": proc.stdout,
            "stderr": proc.stderr,
            "exit_code": proc.returncode,
            "runtime_s": round(runtime, 3),
            "files": files,
            "host_workspace": host_workspace,
        }
    except subprocess.TimeoutExpired as te:
        return {
            "stdout": te.stdout or "",
            "stderr": (te.stderr or "") + f"\n[Timed out after {timeout}s]",
            "exit_code": -1,
            "runtime_s": round(time.time() - start, 3),
            "files": [],
            "host_workspace": host_workspace,
        }
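
# Illustrative, not called anywhere: a minimal sketch of running a one-off command in the
# sandbox (assumes Docker is installed and the python:3.11-slim image is cached or can be pulled).
def _example_code_executor() -> None:
    res = code_executor(
        image="python:3.11-slim",
        cmds=["python -c \"print('hello from the sandbox')\""],
        timeout=30,
    )
    print(res["exit_code"], res["stdout"])
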
@tool
def save_files(manifest_files: List[Dict[str,str]], workspace: str = None) -> str:
    
    """

    Saves a list of files to a host workspace directory.



    This tool creates the specified files with their content on the host system.

    Each file is defined by a dictionary containing a relative path and content.

    If no workspace path is provided, a temporary directory is created automatically.



    Args:

        manifest_files (List[Dict[str, str]]): A list of file descriptors, 

            where each descriptor is a dictionary with:

            - "path" (str): Relative file path (e.g., "app.py" or "src/module.py").

            - "content" (str): The content to write into the file.

        workspace (str, optional): Path to the host directory where files should be saved.

                                   If None, a temporary directory is created. Defaults to None.



    Returns:

        str: The path to the host workspace directory where the files were saved.



    Notes:

        - Automatically creates parent directories if they do not exist.

        - Overwrites files if they already exist at the same path.

        - Useful for preparing workspaces for code execution in sandboxed environments.

    """

    if workspace is None:
        workspace = tempfile.mkdtemp(prefix="mini_manus_ws_")
    ws = Path(workspace)
    ws.mkdir(parents=True, exist_ok=True)
    for f in manifest_files:
        p = ws / f["path"]
        p.parent.mkdir(parents=True, exist_ok=True)
        p.write_text(f["content"], encoding="utf-8")
    return str(ws)

# 2) List files in a workspace (relative)
@tool
def list_workspace_files(workspace: str) -> List[str]:

    """

    Recursively list all files in a given workspace directory.



    This tool traverses the workspace directory and collects all file paths,

    returning them relative to the workspace root. It is useful for inspecting 

    the contents of a workspace, packaging artifacts, or tracking generated files.



    Args:

        workspace (str): Path to the workspace directory to list.



    Returns:

        List[str]: A list of file paths relative to the workspace root.



    Notes:

        - Only files are included; directories themselves are ignored.

        - If the workspace path is invalid or an error occurs during traversal,

          an empty list is returned.

        - Paths are returned as strings using forward slashes.

    """

    files = []
    try:
        for p in Path(workspace).rglob("*"):
            if p.is_file():
                files.append(str(p.relative_to(workspace)))
    except Exception:
        pass
    return files

# 3) Package artifact (zip) and return path
@tool
def package_artifact(workspace: str, out_dir: str = None) -> str:

    """

    Package the contents of a workspace directory into a ZIP archive.



    This tool collects all files within a given workspace and compresses 

    them into a single ZIP file, which can be used as an artifact for 

    deployment, sharing, or backup purposes.



    Args:

        workspace (str): Path to the workspace directory to package.

        out_dir (str, optional): Directory to save the generated ZIP file. 

            If None, a temporary directory will be created.



    Returns:

        str: Absolute file path of the created ZIP archive.



    Notes:

        - Only files are included in the ZIP archive; directories themselves 

          are not stored.

        - The ZIP filename is automatically generated using a UUID to ensure 

          uniqueness.

        - If `out_dir` does not exist, it will be created.

        - Useful for packaging code, data, or other artifacts generated 

          during automated workflows.

    """

    if out_dir is None:
        out_dir = tempfile.mkdtemp(prefix="mini_manus_artifacts_")
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    zip_name = Path(out_dir) / f"artifact_{uuid4().hex}.zip"
    with zipfile.ZipFile(zip_name, "w", zipfile.ZIP_DEFLATED) as z:
        for p in Path(workspace).rglob("*"):
            if p.is_file():
                z.write(p, p.relative_to(workspace))
    return str(zip_name)

# 4) Cleanup workspace
@tool
def cleanup_workspace(workspace: str, keep: bool = False) -> None:

    """

    Safely removes a workspace directory and all its contents.



    This tool is used to clean up temporary directories created during 

    code execution, testing, or file manipulation. It ensures that the 

    workspace is deleted unless explicitly preserved.



    Args:

        workspace (str): Path to the workspace directory to delete.

        keep (bool, optional): If True, the workspace will not be deleted.

            Defaults to False.



    Returns:

        None



    Notes:

        - Any errors during deletion (e.g., non-existent directory, permission issues) 

          are silently ignored.

        - Use `keep=True` to preserve the workspace, for example, when artifacts 

          need to be inspected after execution.

        - Intended for host-side cleanup of temporary directories used in containerized 

          or local code execution workflows.

    """

    if keep:
        return
    try:
        shutil.rmtree(workspace)
    except Exception:
        pass

# 5) Run a manifest end-to-end using your code_executor (uses Docker image + run_commands)
@tool
def run_manifest(manifest: Dict[str, Any], base_image: str = "python:3.11-slim", timeout: int = 120, keep_workspace: bool = False) -> Dict[str, Any]:
    
    """

    Executes a manifest of files and commands inside a Docker container and optionally packages the workspace.



    This tool automates the process of:

    1. Saving provided files to a host workspace.

    2. Installing dependencies (if a `requirements.txt` is present or if `install_libs` is specified).

    3. Running commands and optional test commands inside a Docker container.

       - Commands referencing workspace files are automatically adjusted to point to the container workspace.

    4. Collecting outputs, listing files, and optionally packaging the workspace into a ZIP artifact.

    5. Cleaning up the workspace unless `keep_workspace=True`.



    Args:

        manifest (Dict[str, Any]): A dictionary describing the manifest, with the following keys:

            - "files" (List[Dict[str,str]]): List of files to save, each with "path" and "content".

            - "run_commands" (List[str], optional): Commands to execute inside the container.

            - "test_command" (str, optional): A command for testing/verifying the execution.

            - "install_libs" (List[str], optional): A list of Python packages to install dynamically

              (e.g., ["crewai", "transformers"]). Installed before any run/test commands.

        base_image (str, optional): Docker image to use for execution. Defaults to "python:3.11-slim".

        timeout (int, optional): Maximum time in seconds for container execution. Defaults to 120.

        keep_workspace (bool, optional): If True, preserves the host workspace after execution. Defaults to False.



    Returns:

        Dict[str, Any]: A dictionary containing execution results and metadata:

            - "stdout" (str): Standard output from the execution.

            - "stderr" (str): Standard error from the execution.

            - "exit_code" (int): Exit code of the executed commands.

            - "runtime_s" (float): Total runtime in seconds.

            - "files" (List[str]): List of files present in the workspace after execution.

            - "artifact" (str or None): Path to a ZIP file of the workspace, if packaging succeeded.

            - "workspace" (str): Path to the host workspace.



    Notes:

        - If `requirements.txt` exists, dependencies are installed automatically inside the container.

        - If `install_libs` is provided, those packages are installed dynamically via pip.

        - Commands that reference workspace files are automatically adjusted to point to the container workspace.

        - Network access is enabled briefly during dependency installation.

        - Commands are executed sequentially inside the container.

        - Workspace cleanup is automatic unless `keep_workspace=True`.

        - Useful for safely running and testing code in isolated, reproducible environments.

    """

    files = manifest.get("files", [])
    run_cmds = manifest.get("run_commands", [])
    test_cmd = manifest.get("test_command")
    install_libs = manifest.get("install_libs", [])  # optional extra packages to pip-install
    host_workspace = save_files(files)  # this returns a host path

    # Map host workspace -> container path
    mounts = {host_workspace: "/workspace"}

    # Pre-install step if requirements.txt exists
    install_cmds = []
    if install_libs:
        # install arbitrary packages inside container
        libs = " ".join(install_libs)
        install_cmds.append(f"pip install {libs}")

    if (Path(host_workspace) / "requirements.txt").exists():
        install_cmds.append("pip install -r requirements.txt")

    # Rewrite "python <script>" commands so the script path points into the
    # container workspace; flags such as "python -c ..." are left untouched.
    def fix_file_paths(cmds: List[str]) -> List[str]:
        fixed = []
        for c in cmds:
            parts = c.split()
            if parts and parts[0] == "python" and len(parts) > 1 and not parts[1].startswith("-"):
                parts[1] = f"/workspace/{parts[1]}"
            fixed.append(" ".join(parts))
        return fixed
    

    # Build the full command sequence (run installs first if present)

    run_cmds = fix_file_paths(run_cmds)
    if test_cmd:
        test_cmd = fix_file_paths([test_cmd])[0]

    # Build full command list
    cmds = install_cmds + [f"cd /workspace && {c}" for c in run_cmds]
    if test_cmd:
        cmds.append(f"cd /workspace && {test_cmd}")

    if not cmds:
        cmds = ["cd /workspace && echo 'No commands provided'"]


    # If we're installing requirements, allow network briefly (set allow_network=True)
    allow_network = bool(install_cmds)

    exec_res = code_executor(
        image=base_image,
        cmds=cmds,
        mounts=mounts,
        host_workspace=host_workspace,
        container_workdir="/workspace",
        timeout=timeout,
        allow_network=allow_network,
    )

    # gather host-side file list (relative)
    files_list = list_workspace_files(host_workspace)

    # package artifact (optional)
    artifact = None
    try:
        artifact = package_artifact(host_workspace)
    except Exception:
        artifact = None

    result = {
        "stdout": exec_res.get("stdout", ""),
        "stderr": exec_res.get("stderr", ""),
        "exit_code": exec_res.get("exit_code", 1),
        "runtime_s": exec_res.get("runtime_s", None),
        "files": files_list,
        "artifact": artifact,
        "workspace": host_workspace,
    }

    # decide whether to cleanup workspace
    cleanup_workspace(host_workspace, keep=keep_workspace)
    return result
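
# Illustrative, not called anywhere: a minimal manifest a caller might pass to run_manifest
# (the file name and contents below are made up for the example; assumes Docker is available
# and that the smolagents @tool wrappers remain directly callable, as run_manifest itself assumes).
def _example_run_manifest() -> None:
    manifest = {
        "files": [{"path": "main.py", "content": "print('manifest ran')\n"}],
        "run_commands": ["python main.py"],
    }
    result = run_manifest(manifest, base_image="python:3.11-slim", timeout=120)
    print(result["exit_code"])
    print(result["stdout"])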

def detect_target_column(df: pd.DataFrame) -> str:
    """

    Heuristically detect the most likely target column based on naming, cardinality, and type.

    """
    if df.empty or len(df.columns) < 2:
        return None

    scores = {}

    for col in df.columns:
        score = 0.0
        name_lower = col.lower()

        # Rule 1: Name matches common target keywords
        keywords = ["target", "label", "class", "outcome", "result", "y", "output", "flag", "status", "churn", "survived", "price", "sale"]
        if any(kw in name_lower for kw in keywords):
            score += 3.0
        if name_lower in ["target", "label", "class", "y"]:
            score += 2.0

        # Rule 2: Binary or low-cardinality categorical → likely classification
        nunique = df[col].nunique()
        total = len(df)
        unique_ratio = nunique / total

        if nunique == 2 and df[col].dtype in ["int64", "object", "category"]:
            score += 4.0  # Strong signal
        elif nunique <= 20 and df[col].dtype in ["int64", "object", "category"]:
            score += 3.0

        # Rule 3: High unique ratio + numeric → likely regression target
        if unique_ratio > 0.8 and df[col].dtype in ["int64", "float64"]:
            score += 2.5

        # Rule 4: Avoid ID-like or high-cardinality text
        id_keywords = ["id", "name", "email", "phone", "address", "username", "url", "link"]
        if any(kw in name_lower for kw in id_keywords):
            score -= 10.0
        if nunique == total and df[col].dtype == "object":
            score -= 10.0  # Likely unique identifier

        scores[col] = score

    # Return best candidate if score > 0
    best_col = max(scores, key=scores.get)
    return best_col if scores[best_col] > 0 else None
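
# Illustrative, not called anywhere: a tiny frame on which the heuristic is expected to pick
# "churn" (a binary column with a target-like name) over the id-like column.
def _example_detect_target_column() -> None:
    toy = pd.DataFrame({
        "customer_id": ["a1", "a2", "a3", "a4"],
        "monthly_spend": [10.0, 25.5, 7.2, 40.1],
        "churn": [0, 1, 0, 1],
    })
    print(detect_target_column(toy))  # expected: "churn"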



















# ------------------------------------------------
# 🛠️ Tool 1: LoadData
# ------------------------------------------------

@tool
def LoadData(filepath: str) -> dict:
    """

    Loads data from a CSV file and returns it as a dictionary.



    Args:

        filepath (str): Path to the CSV file.



    Returns:

        dict: Data as dictionary (from DataFrame.to_dict()).

    """
    df = pd.read_csv(filepath)
    return df.to_dict()


# ------------------------------------------------
# 🛠️ Tool 2: CleanData (Enhanced)
# ------------------------------------------------

@tool
def CleanData(data: dict, handle_outliers: bool = True, impute_strategy: str = "median_mode") -> pd.DataFrame:
    """

    Cleans dataset with smart imputation, encoding, and optional outlier removal.



    Args:

        data (dict): Dataset in dictionary format.

        handle_outliers (bool): Whether to remove outliers using IQR.

        impute_strategy (str): "median_mode" or "mean_mode"



    Returns:

        pd.DataFrame: Cleaned dataset.

    """
    df = pd.DataFrame.from_dict(data)

    # Drop duplicates
    df = df.drop_duplicates().reset_index(drop=True)

    # Handle missing values
    for col in df.columns:
        if df[col].dtype in ["int64", "float64"]:
            if impute_strategy == "median_mode" or df[col].skew() > 1:
                fill_val = df[col].median()
            else:
                fill_val = df[col].mean()
            df[col] = df[col].fillna(fill_val)
        else:
            mode = df[col].mode()
            fill_val = mode[0] if len(mode) > 0 else "Unknown"
            df[col] = df[col].fillna(fill_val)

    # Parse datetime
    for col in df.columns:
        if "date" in col.lower() or "time" in col.lower():
            try:
                df[col] = pd.to_datetime(df[col], errors="coerce")
            except Exception:
                pass

    # Encode categorical variables (only if not too many unique values)
    for col in df.select_dtypes(include="object").columns:
        if df[col].nunique() / len(df) < 0.5:
            df[col] = df[col].astype("category").cat.codes
        # else: leave as object (e.g., free text)

    # Outlier removal (optional)
    if handle_outliers:
        for col in df.select_dtypes(include=["float64", "int64"]).columns:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
            count_before = len(df)
            df = df[(df[col] >= lower) & (df[col] <= upper)]
            if len(df) == 0:
                # Avoid empty df
                df = pd.DataFrame.from_dict(data)  # Revert
                break

    return df.reset_index(drop=True)


# ------------------------------------------------
# 📊 Tool 3: EDA (Enhanced)
# ------------------------------------------------

@tool
def EDA(data: dict, max_cat_plots: int = 3, max_num_plots: int = 3) -> dict:
    """

    Performs advanced EDA with smart visualizations and insights.



    Args:

        data (dict): Dataset in dictionary format.

        max_cat_plots (int): Max number of categorical distribution plots.

        max_num_plots (int): Max number of numeric vs target plots.



    Returns:

        dict: EDA results including text, plots, and recommendations.

    """
    df = pd.DataFrame.from_dict(data)
    results = {}

    # 1. Summary Stats
    results["summary"] = df.describe(include="all").to_string()

    # 2. Missing Values
    missing = df.isnull().sum()
    results["missing_values"] = missing[missing > 0].to_dict()

    # Missingness heatmap
    if missing.sum() > 0:
        plt.figure(figsize=(8, 4))
        sns.heatmap(df.isnull(), cbar=True, cmap="viridis", yticklabels=False)
        buf = io.BytesIO()
        plt.savefig(buf, format="png", bbox_inches="tight")
        plt.close()
        buf.seek(0)
        img = Image.open(buf)
        results["missingness_plot"] = img #buf

    # 3. Correlation Heatmap
    corr = df.corr(numeric_only=True)
    if not corr.empty and len(corr.columns) > 1:
        plt.figure(figsize=(8, 6))
        sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True)
        buf = io.BytesIO()
        plt.savefig(buf, format="png", bbox_inches="tight")
        plt.close()
        buf.seek(0)
        img = Image.open(buf)
        results["correlation_plot"] = img #buf

        # Top 5 absolute correlations
        unstacked = corr.abs().unstack()
        unstacked = unstacked[unstacked < 1.0]
        top_corr = unstacked.sort_values(ascending=False).head(5).to_dict()
        results["top_correlations"] = top_corr

    # 4. Skewness & Kurtosis
    numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
    skew_kurt = {}
    for col in numeric_cols:
        skew_kurt[col] = {"skew": df[col].skew(), "kurtosis": df[col].kurtosis()}
    results["skew_kurtosis"] = skew_kurt

    # 5. Numeric Distributions
    if len(numeric_cols) > 0:
        df[numeric_cols].hist(bins=20, figsize=(12, 8), layout=(2, -1))
        buf = io.BytesIO()
        plt.savefig(buf, format="png", bbox_inches="tight")
        plt.close()
        buf.seek(0)
        img = Image.open(buf)
        results["numeric_distributions"] = img #buf

    # 6. Categorical Distributions
    cat_cols = df.select_dtypes(include=["object", "category"]).columns
    for col in cat_cols[:max_cat_plots]:
        plt.figure(figsize=(6, 4))
        top_vals = df[col].value_counts().head(10)
        sns.barplot(x=top_vals.index, y=top_vals.values)
        plt.xticks(rotation=45)
        buf = io.BytesIO()
        plt.savefig(buf, format="png", bbox_inches="tight")
        plt.close()
        buf.seek(0)
        img = Image.open(buf)
        results[f"dist_{col}"] = img #buf

    # 7. Target Relationships
    target_col = detect_target_column(df)
    if target_col:
        results["detected_target"] = target_col
        for col in numeric_cols[:max_num_plots]:
            plt.figure(figsize=(6, 4))
            if df[target_col].nunique() <= 20:
                sns.boxplot(data=df, x=target_col, y=col)
            else:
                sns.scatterplot(data=df, x=col, y=target_col)
            buf = io.BytesIO()
            plt.savefig(buf, format="png", bbox_inches="tight")
            plt.close()
            buf.seek(0)
            img = Image.open(buf)
            results[f"{col}_vs_{target_col}"] = img #buf

    # 8. Recommendations
    recs = []
    for col, sk in skew_kurt.items():
        if abs(sk["skew"]) > 1:
            recs.append(f"Feature '{col}' is skewed ({sk['skew']:.2f}) β†’ consider log transform.")
    if results["missing_values"]:
        recs.append("Missing data detected β†’ consider KNN or iterative imputation.")
    if results.get("top_correlations"):
        recs.append("High correlations found β†’ consider PCA or feature selection.")
    if target_col:
        recs.append(f"Target variable '{target_col}' detected automatically.")
    results["recommendations"] = recs

    return results
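
# Illustrative, not called anywhere: the EDA tool stores plots as PIL Image objects,
# so a caller could persist them roughly like this (the output directory name is made up).
def _example_save_eda_plots(eda_results: dict, out_dir: str = "eda_plots") -> None:
    os.makedirs(out_dir, exist_ok=True)
    for key, value in eda_results.items():
        if isinstance(value, Image.Image):
            value.save(os.path.join(out_dir, f"{key}.png"))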


# ------------------------------------------------
# 🤖 Tool 4: AutoML (Enhanced)
# ------------------------------------------------

@tool
def AutoML(data: dict, task_hint: str = None) -> dict:
    """

    Enhanced AutoML with multiple models and robust evaluation.



    Args:

        data (dict): Cleaned dataset.

        task_hint (str): "classification", "regression", or None.



    Returns:

        dict: Model results and metrics.

    """
    df = pd.DataFrame.from_dict(data)
    results = {}

    target_col = detect_target_column(df)
    if not target_col:
        results["note"] = "No target column detected. Check column names and data."
        return results

    X = df.drop(columns=[target_col])
    y = df[target_col]

    # One-hot encode X
    X = pd.get_dummies(X, drop_first=True)

    if X.shape[1] == 0:
        results["error"] = "No valid features after encoding."
        return results

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Detect task
    if task_hint:
        task = task_hint
    elif y.dtype in ["object", "category"] or y.nunique() <= 20:
        task = "classification"
    else:
        task = "regression"

    try:
        if task == "classification":
            models = {
                "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
                "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42)
            }
            results["task"] = "classification"
            best_acc = 0
            best_model = None
            for name, model in models.items():
                model.fit(X_train, y_train)
                preds = model.predict(X_test)
                acc = accuracy_score(y_test, preds)
                if acc > best_acc:
                    best_acc = acc
                    best_model = model  # keep the best classifier for saving below
                    results["accuracy"] = acc
                    results["best_model"] = name
                    results["report"] = classification_report(y_test, preds, zero_division=0)
                    if hasattr(model, "feature_importances_"):
                        results["feature_importance"] = dict(zip(X.columns, model.feature_importances_))

        else:
            models = {
                "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
                "LinearRegression": LinearRegression()
            }
            results["task"] = "regression"
            best_r2 = -float("inf")
            for name, model in models.items():
                model.fit(X_train, y_train)
                preds = model.predict(X_test)
                r2 = r2_score(y_test, preds)
                if r2 > best_r2:
                    best_r2 = r2
                    results["r2_score"] = r2
                    results["mse"] = mean_squared_error(y_test, preds)
                    results["best_model"] = name
                    best_model = model  # Keep best model
                    if hasattr(model, "feature_importances_"):
                        results["feature_importance"] = dict(zip(X.columns, model.feature_importances_))
        # ✅ Save the best model to a temporary file
        model_dir = tempfile.mkdtemp()
        model_path = os.path.join(model_dir, f"trained_model_{task}.pkl")
        joblib.dump({
            "model": best_model,
            "task": task,
            "target_column": target_col,
            "features": X.columns.tolist()
        }, model_path)

        results["model_download_path"] = model_path
        results["model_info"] = f"Best model: {results['best_model']} | Task: {task} | Target: {target_col}"

    except Exception as e:
        results["error"] = f"Model training failed: {str(e)}"

    return results
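
# Illustrative, not called anywhere: a minimal sketch of chaining the four data tools directly,
# without going through the agent (assumes "data.csv" exists and that the smolagents @tool
# wrappers remain directly callable).
def _example_data_pipeline(csv_path: str = "data.csv") -> dict:
    raw = LoadData(csv_path)
    cleaned = CleanData(raw, handle_outliers=True)
    return {
        "eda": EDA(cleaned.to_dict()),
        "automl": AutoML(cleaned.to_dict()),
    }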



model = InferenceClientModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    token=os.environ.get("HF_TOKEN"),  # expects HF_TOKEN to be set in the environment
    provider="together",
    max_tokens=8048
)

planner = ToolCallingAgent(
    tools=[],
    model=model,
    name="PlannerAgent",
    max_steps=10,
    planning_interval=5,
    description= "Breaks down complex tasks and orchestrates tools for execution",
)

# Research agent
researcher = ToolCallingAgent(
    tools=[internet_search],
    model=model,
    name="ResearchAgent",
    max_steps=10,
    description = "Conducts deep research using internet_search",
)

# Coding agent
coder = CodeAgent(
    tools=[
        code_executor,
        save_files,
        list_workspace_files,
        package_artifact,
        cleanup_workspace,
        run_manifest,
    ],
    model=model,
    name="CodingAgent",
    max_steps=20,
    additional_authorized_imports=[
        "subprocess", "tempfile", "time", "os", "pathlib", "typing","shutil", "zipfile","uuid"
    ],
    description = "Executes Python code safely in a sandboxed Docker container."
                  "If a library is missing, add it to install_libs in run_manifest."
)


analyst = CodeAgent(
    tools=[LoadData, CleanData, EDA, AutoML],
    model=model,
    max_steps=20,
    name="DataScienceAgent",
    additional_authorized_imports=[
        "pandas", "matplotlib.pyplot", "seaborn", "PIL", "sklearn", "io", "os","joblib","tempfile"
    ],
    description = "Loads datasets, cleans and preprocesses data, performs exploratory data analysis (EDA) with visualizations, and builds predictive models when a target variable is specified."
)


manager_agent = ToolCallingAgent(
    tools=[],
    model=model,
    managed_agents=[planner, researcher, coder, analyst],
    max_steps=20,
    description= "Routes user queries to the right agent (Planner, Researcher, Coder or Data Scientist) and assembles results",
)