import os
from pathlib import Path

import pandas as pd

from trackio import deploy, utils
from trackio.sqlite_storage import SQLiteStorage


def import_csv(
    csv_path: str | Path,
    project: str,
    name: str | None = None,
    space_id: str | None = None,
    dataset_id: str | None = None,
) -> None:
    """
    Imports a CSV file into a Trackio project. The CSV file must contain a `"step"`
    column and may optionally contain a `"timestamp"` column; every other column is
    treated as a metric. The file must also include a header row with the column names.

    TODO: call init() and return a Run object so that the user can continue to log metrics to it.

    Args:
        csv_path (`str` or `Path`):
            The path to the CSV file to import.
        project (`str`):
            The name of the project to import the CSV file into. Must not be an existing
            project.
        name (`str` or `None`, *optional*, defaults to `None`):
            The name of the Run to import the CSV file into. If not provided, the CSV
            filename (without its extension) will be used as the run name.
        space_id (`str` or `None`, *optional*, defaults to `None`):
            If provided, the project will be logged to a Hugging Face Space instead of a
            local directory. Should be a complete Space name like `"username/reponame"`
            or `"orgname/reponame"`, or just `"reponame"` in which case the Space will
            be created in the currently-logged-in Hugging Face user's namespace. If the
            Space does not exist, it will be created. If the Space already exists, the
            project will be logged to it.
        dataset_id (`str` or `None`, *optional*, defaults to `None`):
            If provided, a persistent Hugging Face Dataset will be created and the
            metrics will be synced to it every 5 minutes. Should be a complete Dataset
            name like `"username/datasetname"` or `"orgname/datasetname"`, or just
            `"datasetname"` in which case the Dataset will be created in the
            currently-logged-in Hugging Face user's namespace. If the Dataset does not
            exist, it will be created. If the Dataset already exists, the project will
            be appended to it. If not provided, the metrics will be logged to a local
            SQLite database, unless a `space_id` is provided, in which case a Dataset
            will be automatically created with the same name as the Space but with the
            `"_dataset"` suffix.
    """
    if SQLiteStorage.get_runs(project):
        raise ValueError(
            f"Project '{project}' already exists. Cannot import CSV into existing project."
        )

    csv_path = Path(csv_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"CSV file not found: {csv_path}")

    df = pd.read_csv(csv_path)
    if df.empty:
        raise ValueError("CSV file is empty")

    column_mapping = utils.simplify_column_names(df.columns.tolist())
    df = df.rename(columns=column_mapping)

    step_column = None
    for col in df.columns:
        if col.lower() == "step":
            step_column = col
            break

    if step_column is None:
        raise ValueError("CSV file must contain a 'step' or 'Step' column")

    if name is None:
        name = csv_path.stem

    metrics_list = []
    steps = []
    timestamps = []

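    # Keep only the columns whose values can all be parsed as numbers; any other
    # column is skipped and never logged as a metric.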
    numeric_columns = []
    for column in df.columns:
        if column == step_column:
            continue
        if column == "timestamp":
            continue

        try:
            pd.to_numeric(df[column], errors="raise")
            numeric_columns.append(column)
        except (ValueError, TypeError):
            continue

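    # Build one metrics dict per row, skipping NaN cells so missing values are not
    # logged as metrics.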
    for _, row in df.iterrows():
        metrics = {}
        for column in numeric_columns:
            value = row[column]
            if bool(pd.notna(value)):
                metrics[column] = float(value)

        if metrics:
            metrics_list.append(metrics)
            steps.append(int(row[step_column]))

            if "timestamp" in df.columns and bool(pd.notna(row["timestamp"])):
                timestamps.append(str(row["timestamp"]))
            else:
                timestamps.append("")

    if metrics_list:
        SQLiteStorage.bulk_log(
            project=project,
            run=name,
            metrics_list=metrics_list,
            steps=steps,
            timestamps=timestamps,
        )

    print(
        f"* Imported {len(metrics_list)} rows from {csv_path} into project '{project}' as run '{name}'"
    )
    print(f"* Metrics found: {', '.join(metrics_list[0].keys())}")

    space_id, dataset_id = utils.preprocess_space_and_dataset_ids(space_id, dataset_id)
    if dataset_id is not None:
        os.environ["TRACKIO_DATASET_ID"] = dataset_id
        print(f"* Trackio metrics will be synced to Hugging Face Dataset: {dataset_id}")

    if space_id is None:
        utils.print_dashboard_instructions(project)
    else:
        deploy.create_space_if_not_exists(space_id, dataset_id)
        deploy.wait_until_space_exists(space_id)
        deploy.upload_db_to_space(project, space_id)
        print(
            f"* View dashboard by going to: {deploy.SPACE_URL.format(space_id=space_id)}"
        )


def import_tf_events(
    log_dir: str | Path,
    project: str,
    name: str | None = None,
    space_id: str | None = None,
    dataset_id: str | None = None,
) -> None:
    """
    Imports TensorFlow Events files from a directory into a Trackio project. Each
    subdirectory in the log directory will be imported as a separate run.

    Args:
        log_dir (`str` or `Path`):
            The path to the directory containing TensorFlow Events files.
        project (`str`):
            The name of the project to import the TensorFlow Events files into. Must not
            be an existing project.
        name (`str` or `None`, *optional*, defaults to `None`):
            The name prefix for runs (if not provided, will use directory names). Each
            subdirectory will create a separate run.
        space_id (`str` or `None`, *optional*, defaults to `None`):
            If provided, the project will be logged to a Hugging Face Space instead of a
            local directory. Should be a complete Space name like `"username/reponame"`
            or `"orgname/reponame"`, or just `"reponame"` in which case the Space will
            be created in the currently-logged-in Hugging Face user's namespace. If the
            Space does not exist, it will be created. If the Space already exists, the
            project will be logged to it.
        dataset_id (`str` or `None`, *optional*, defaults to `None`):
            If provided, a persistent Hugging Face Dataset will be created and the
            metrics will be synced to it every 5 minutes. Should be a complete Dataset
            name like `"username/datasetname"` or `"orgname/datasetname"`, or just
            `"datasetname"` in which case the Dataset will be created in the
            currently-logged-in Hugging Face user's namespace. If the Dataset does not
            exist, it will be created. If the Dataset already exists, the project will
            be appended to it. If not provided, the metrics will be logged to a local
            SQLite database, unless a `space_id` is provided, in which case a Dataset
            will be automatically created with the same name as the Space but with the
            `"_dataset"` suffix.
    """
    try:
        from tbparse import SummaryReader
    except ImportError:
        raise ImportError(
            "The `tbparse` package is not installed but is required for `import_tf_events`. Please install trackio with the `tensorboard` extra: `pip install trackio[tensorboard]`."
        )

    if SQLiteStorage.get_runs(project):
        raise ValueError(
            f"Project '{project}' already exists. Cannot import TF events into existing project."
        )

    path = Path(log_dir)
    if not path.exists():
        raise FileNotFoundError(f"TF events directory not found: {path}")

    # Use tbparse to read all tfevents files in the directory structure
    reader = SummaryReader(str(path), extra_columns={"dir_name"})
    df = reader.scalars

    if df.empty:
        raise ValueError(f"No TensorFlow events data found in {path}")

    total_imported = 0
    imported_runs = []

    # Group by dir_name to create separate runs
    for dir_name, group_df in df.groupby("dir_name"):
        try:
            # Determine run name based on directory name
            if dir_name == "":
                run_name = "main"  # For files in the root directory
            else:
                run_name = dir_name  # Use directory name

            if name:
                run_name = f"{name}_{run_name}"

            if group_df.empty:
                print(f"* Skipping directory {dir_name}: no scalar data found")
                continue

            metrics_list = []
            steps = []
            timestamps = []

            for _, row in group_df.iterrows():
                # Convert row values to appropriate types
                tag = str(row["tag"])
                value = float(row["value"])
                step = int(row["step"])

                metrics = {tag: value}
                metrics_list.append(metrics)
                steps.append(step)

                # Use wall_time if present, else fallback
                if "wall_time" in group_df.columns and not bool(
                    pd.isna(row["wall_time"])
                ):
                    timestamps.append(str(row["wall_time"]))
                else:
                    timestamps.append("")

            if metrics_list:
                SQLiteStorage.bulk_log(
                    project=project,
                    run=str(run_name),
                    metrics_list=metrics_list,
                    steps=steps,
                    timestamps=timestamps,
                )

                total_imported += len(metrics_list)
                imported_runs.append(run_name)

                print(
                    f"* Imported {len(metrics_list)} scalar events from directory '{dir_name}' as run '{run_name}'"
                )
                print(f"* Metrics in this run: {', '.join(set(group_df['tag']))}")

        except Exception as e:
            print(f"* Error processing directory {dir_name}: {e}")
            continue

    if not imported_runs:
        raise ValueError("No valid TensorFlow events data could be imported")

    print(f"* Total imported events: {total_imported}")
    print(f"* Created runs: {', '.join(imported_runs)}")

    space_id, dataset_id = utils.preprocess_space_and_dataset_ids(space_id, dataset_id)
    if dataset_id is not None:
        os.environ["TRACKIO_DATASET_ID"] = dataset_id
        print(f"* Trackio metrics will be synced to Hugging Face Dataset: {dataset_id}")

    if space_id is None:
        utils.print_dashboard_instructions(project)
    else:
        deploy.create_space_if_not_exists(space_id, dataset_id)
        deploy.wait_until_space_exists(space_id)
        deploy.upload_db_to_space(project, space_id)
        print(
            f"* View dashboard by going to: {deploy.SPACE_URL.format(space_id=space_id)}"
        )