import random
import re
import sys
import time
from pathlib import Path
import huggingface_hub
import numpy as np
import pandas as pd
from huggingface_hub.constants import HF_HOME
RESERVED_KEYS = ["project", "run", "timestamp", "step", "time"]
TRACKIO_DIR = Path(HF_HOME) / "trackio"  # e.g. ~/.cache/huggingface/trackio by default
TRACKIO_LOGO_PATH = str(Path(__file__).parent.joinpath("trackio_logo.png"))
def generate_readable_name():
"""
    Generates a random, human-readable run name like "dainty-sunset-1".
"""
adjectives = [
"dainty",
"brave",
"calm",
"eager",
"fancy",
"gentle",
"happy",
"jolly",
"kind",
"lively",
"merry",
"nice",
"proud",
"quick",
"silly",
"tidy",
"witty",
"zealous",
"bright",
"shy",
"bold",
"clever",
"daring",
"elegant",
"faithful",
"graceful",
"honest",
"inventive",
"jovial",
"keen",
"lucky",
"modest",
"noble",
"optimistic",
"patient",
"quirky",
"resourceful",
"sincere",
"thoughtful",
"upbeat",
"valiant",
"warm",
"youthful",
"zesty",
"adventurous",
"breezy",
"cheerful",
"delightful",
"energetic",
"fearless",
"glad",
"hopeful",
"imaginative",
"joyful",
"kindly",
"luminous",
"mysterious",
"neat",
"outgoing",
"playful",
"radiant",
"spirited",
"tranquil",
"unique",
"vivid",
"wise",
"zany",
"artful",
"bubbly",
"charming",
"dazzling",
"earnest",
"festive",
"gentlemanly",
"hearty",
"intrepid",
"jubilant",
"knightly",
"lively",
"magnetic",
"nimble",
"orderly",
"peaceful",
"quick-witted",
"robust",
"sturdy",
"trusty",
"upstanding",
"vibrant",
"whimsical",
]
nouns = [
"sunset",
"forest",
"river",
"mountain",
"breeze",
"meadow",
"ocean",
"valley",
"sky",
"field",
"cloud",
"star",
"rain",
"leaf",
"stone",
"flower",
"bird",
"tree",
"wave",
"trail",
"island",
"desert",
"hill",
"lake",
"pond",
"grove",
"canyon",
"reef",
"bay",
"peak",
"glade",
"marsh",
"cliff",
"dune",
"spring",
"brook",
"cave",
"plain",
"ridge",
"wood",
"blossom",
"petal",
"root",
"branch",
"seed",
"acorn",
"pine",
"willow",
"cedar",
"elm",
"falcon",
"eagle",
"sparrow",
"robin",
"owl",
"finch",
"heron",
"crane",
"duck",
"swan",
"fox",
"wolf",
"bear",
"deer",
"moose",
"otter",
"beaver",
"lynx",
"hare",
"badger",
"butterfly",
"bee",
"ant",
"beetle",
"dragonfly",
"firefly",
"ladybug",
"moth",
"spider",
"worm",
"coral",
"kelp",
"shell",
"pebble",
"boulder",
"cobble",
"sand",
"wavelet",
"tide",
"current",
]
adjective = random.choice(adjectives)
noun = random.choice(nouns)
number = random.randint(1, 99)
return f"{adjective}-{noun}-{number}"
def block_except_in_notebook():
    """Blocks until interrupted, unless the session is interactive (REPL/notebook)."""
    # sys.ps1 is set in interactive sessions (the REPL, IPython, notebooks).
    in_notebook = bool(getattr(sys, "ps1", sys.flags.interactive))
    if in_notebook:
        return
try:
while True:
time.sleep(0.1)
except (KeyboardInterrupt, OSError):
print("Keyboard interruption in main thread... closing dashboard.")
def simplify_column_names(columns: list[str]) -> dict[str, str]:
"""
    Simplifies column names to their first 10 alphanumeric (or "/") characters,
    appending numeric suffixes to keep the simplified names unique.
Args:
columns: List of original column names
Returns:
Dictionary mapping original column names to simplified names
"""
simplified_names = {}
used_names = set()
for col in columns:
alphanumeric = re.sub(r"[^a-zA-Z0-9/]", "", col)
base_name = alphanumeric[:10] if alphanumeric else f"col_{len(used_names)}"
final_name = base_name
suffix = 1
while final_name in used_names:
final_name = f"{base_name}_{suffix}"
suffix += 1
simplified_names[col] = final_name
used_names.add(final_name)
return simplified_names
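
# Worked example (illustrative): names are truncated to 10 kept characters,
# and collisions get numeric suffixes.
#   simplify_column_names(["train/loss", "metric one", "metric-one"])
#   -> {"train/loss": "train/loss",
#       "metric one": "metricone",
#       "metric-one": "metricone_1"}
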
def print_dashboard_instructions(project: str) -> None:
"""
Prints instructions for viewing the Trackio dashboard.
Args:
        project: The name of the project whose dashboard should be shown.
"""
YELLOW = "\033[93m"
BOLD = "\033[1m"
RESET = "\033[0m"
print("* View dashboard by running in your terminal:")
print(f'{BOLD}{YELLOW}trackio show --project "{project}"{RESET}')
print(f'* or by running in Python: trackio.show(project="{project}")')
def preprocess_space_and_dataset_ids(
    space_id: str | None, dataset_id: str | None
) -> tuple[str | None, str | None]:
    """
    Prefixes bare Space/Dataset ids with the current user's namespace and,
    when only a Space id is given, derives a default dataset id from it.
    """
if space_id is not None and "/" not in space_id:
username = huggingface_hub.whoami()["name"]
space_id = f"{username}/{space_id}"
if dataset_id is not None and "/" not in dataset_id:
username = huggingface_hub.whoami()["name"]
dataset_id = f"{username}/{dataset_id}"
if space_id is not None and dataset_id is None:
dataset_id = f"{space_id}_dataset"
return space_id, dataset_id
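
# Illustrative behavior (assumes a logged-in user named "alice"):
#   preprocess_space_and_dataset_ids("my-space", None)
#   -> ("alice/my-space", "alice/my-space_dataset")
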
def fibo():
"""Generator for Fibonacci backoff: 1, 1, 2, 3, 5, 8, ..."""
a, b = 1, 1
while True:
yield a
a, b = b, a + b
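
# Sketch of how the generator can drive a retry loop (`attempt_upload` is a
# hypothetical helper, not part of this module):
#   for wait_seconds in fibo():
#       if attempt_upload():
#           break
#       time.sleep(wait_seconds)  # waits 1, 1, 2, 3, 5, ... seconds
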
COLOR_PALETTE = [
    "#3B82F6",  # blue
    "#EF4444",  # red
    "#10B981",  # green
    "#F59E0B",  # amber
    "#8B5CF6",  # violet
    "#EC4899",  # pink
    "#06B6D4",  # cyan
    "#84CC16",  # lime
    "#F97316",  # orange
    "#6366F1",  # indigo
]
def get_color_mapping(runs: list[str], smoothing: bool) -> dict[str, str]:
"""Generate color mapping for runs, with transparency for original data when smoothing is enabled."""
color_map = {}
for i, run in enumerate(runs):
base_color = COLOR_PALETTE[i % len(COLOR_PALETTE)]
if smoothing:
color_map[f"{run}_smoothed"] = base_color
color_map[f"{run}_original"] = base_color + "4D"
else:
color_map[run] = base_color
return color_map
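
# Example output (illustrative): with smoothing, each run gets an opaque color
# for the smoothed line and the same color with "4D" appended for the raw data.
#   get_color_mapping(["run-a"], smoothing=True)
#   -> {"run-a_smoothed": "#3B82F6", "run-a_original": "#3B82F64D"}
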
def downsample(
    df: pd.DataFrame,
    x: str,
    y: str,
    color: str | None,
    x_lim: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """
    Downsamples a DataFrame for plotting by keeping, per color group, the min
    and max y values within each of up to 100 x-axis bins, so extremes such as
    spikes survive. Points just outside x_lim are kept so lines reach the
    plot edges; groups with fewer than 500 points are returned whole.
    """
    if df.empty:
        return df
columns_to_keep = [x, y]
if color is not None and color in df.columns:
columns_to_keep.append(color)
df = df[columns_to_keep].copy()
n_bins = 100
if color is not None and color in df.columns:
groups = df.groupby(color)
else:
groups = [(None, df)]
downsampled_indices = []
for _, group_df in groups:
if group_df.empty:
continue
group_df = group_df.sort_values(x)
        if x_lim is not None:
            x_min, x_max = x_lim
            # Keep one point on each side of the visible range so the plotted
            # line reaches the plot edges instead of stopping at the first
            # in-range point.
            before_point = group_df[group_df[x] < x_min].tail(1)
            after_point = group_df[group_df[x] > x_max].head(1)
            # Copy so the "bin" column can be added later without a
            # SettingWithCopyWarning.
            group_df = group_df[(group_df[x] >= x_min) & (group_df[x] <= x_max)].copy()
        else:
            before_point = after_point = None
x_min = group_df[x].min()
x_max = group_df[x].max()
if before_point is not None and not before_point.empty:
downsampled_indices.extend(before_point.index.tolist())
if after_point is not None and not after_point.empty:
downsampled_indices.extend(after_point.index.tolist())
if group_df.empty:
continue
        if x_min == x_max:
            # Degenerate range: all points share one x value, so binning is
            # impossible; keep just the y extremes.
            min_y_idx = group_df[y].idxmin()
            max_y_idx = group_df[y].idxmax()
            if min_y_idx != max_y_idx:
                downsampled_indices.extend([min_y_idx, max_y_idx])
            else:
                downsampled_indices.append(min_y_idx)
            continue
        if len(group_df) < 500:
            # Small groups are cheap to render; keep every point.
            downsampled_indices.extend(group_df.index.tolist())
            continue
        # Assign each point to one of n_bins equal-width bins along x.
        bins = np.linspace(x_min, x_max, n_bins + 1)
        group_df["bin"] = pd.cut(
            group_df[x], bins=bins, labels=False, include_lowest=True
        )
        for bin_idx in group_df["bin"].dropna().unique():
            bin_data = group_df[group_df["bin"] == bin_idx]
            if bin_data.empty:
                continue
            # Keep the vertical extremes of each bin so spikes are preserved.
            min_y_idx = bin_data[y].idxmin()
            max_y_idx = bin_data[y].idxmax()
            downsampled_indices.append(min_y_idx)
            if min_y_idx != max_y_idx:
                downsampled_indices.append(max_y_idx)
    # A point can be both a bin's min and max; deduplicate, then restore x
    # order for plotting.
    unique_indices = list(set(downsampled_indices))
    downsampled_df = df.loc[unique_indices].copy()
    downsampled_df = downsampled_df.sort_values(x).reset_index(drop=True)
    downsampled_df = downsampled_df.drop(columns=["bin"], errors="ignore")
return downsampled_df
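
# Minimal usage sketch (illustrative data): groups with fewer than 500 points
# are kept whole, so only larger series are actually thinned.
#   df = pd.DataFrame({"step": range(10_000), "loss": np.random.rand(10_000)})
#   small = downsample(df, x="step", y="loss", color=None)
#   assert len(small) <= 200  # at most min and max y per bin (n_bins = 100)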