#!/bin/bash
# Check the backup-related environment variables
if [ -z "$HF_TOKEN" ] || [ -z "$DATASET_ID" ]; then
    echo "⚠️ HF_TOKEN or DATASET_ID is missing; backup is disabled"
    exec java ${JVM_OPTS} -jar /opt/halo/halo.jar
fi
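# A minimal configuration sketch. The image name and token value below are
# illustrative assumptions, not part of this repo:
#
#   docker run -d \
#     -e HF_TOKEN=hf_xxxxxxxx \
#     -e DATASET_ID=your-user/halo-backup \
#     -e DATASET_N=10 \
#     -e SYNC_INTERVAL=36000 \
#     your-halo-image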
# Set defaults
DATASET_N=${DATASET_N:-10}              # keep the newest 10 backups by default
SYNC_INTERVAL=${SYNC_INTERVAL:-36000}   # default sync interval: 36000 seconds (10 hours)
BACKUP_DIR="$HOME/.halo2"
BACKUP_PREFIX="halo_backup_"
BACKUP_EXT=".tar.gz"
HF_BRANCH="${HF_BRANCH:-main}"
mkdir -p "$BACKUP_DIR"                  # ensure the data directory exists before the first tar/restore
# Print a timestamped message to the console
print_message() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1"
}
# Create the Hugging Face dataset if it does not exist
create_dataset() {
    print_message "Checking whether the Hugging Face dataset '${DATASET_ID}' exists..."
    python3 <<EOF
import os
from huggingface_hub import HfApi

try:
    api = HfApi(token=os.getenv("HF_TOKEN"))
    repo_id = os.getenv("DATASET_ID")
    # List all datasets owned by the current user
    user_datasets = [d.id for d in api.list_datasets(author=repo_id.split('/')[0])]
    if repo_id not in user_datasets:
        api.create_repo(repo_id=repo_id, repo_type="dataset", private=True)
        print(f"✅ Dataset '{repo_id}' did not exist; created it (private).")
    else:
        print(f"✅ Dataset '{repo_id}' already exists.")
except Exception as e:
    print(f"⚠️ Dataset check/creation failed: {e}")
EOF
}
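# A shorter existence check, assuming a recent huggingface_hub release that
# provides HfApi.repo_exists (a sketch, not what the function above uses):
#
#   python3 -c 'import os; from huggingface_hub import HfApi; \
#       print(HfApi(token=os.getenv("HF_TOKEN")).repo_exists(os.getenv("DATASET_ID"), repo_type="dataset"))'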
# Create the backup branch (HF_BRANCH) if it does not exist
create_branch() {
    print_message "Checking the '${HF_BRANCH}' branch of the Hugging Face dataset..."
    python3 <<EOF
import os
from huggingface_hub import HfApi

try:
    api = HfApi(token=os.getenv("HF_TOKEN"))
    repo_id = os.getenv("DATASET_ID")
    hf_branch = "${HF_BRANCH}"
    # List all branches of the dataset
    branches = api.list_repo_refs(repo_id, repo_type="dataset").branches
    branch_names = [b.name for b in branches]
    if hf_branch not in branch_names:
        api.create_branch(repo_id=repo_id, branch=hf_branch, repo_type="dataset")
        print(f"✅ Branch '{hf_branch}' did not exist; created it.")
    else:
        print(f"✅ Branch '{hf_branch}' already exists.")
except Exception as e:
    print(f"⚠️ Branch check/creation failed: {e}")
EOF
}
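# Note: a freshly created dataset already has a 'main' branch, so with the
# default HF_BRANCH=main the check above is effectively a no-op; it only
# creates a branch when HF_BRANCH is overridden.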
# Download and restore the latest backup
download_data() {
    print_message "Downloading the latest backup..."
    python3 <<EOF
import os
import tarfile
import tempfile
from huggingface_hub import HfApi

def download_and_extract(api, repo_id, branch):
    """Download the newest backup archive and extract it."""
    files = api.list_repo_files(repo_id=repo_id, repo_type='dataset', revision=branch)
    backup_files = sorted(f for f in files if f.startswith('${BACKUP_PREFIX}') and f.endswith('${BACKUP_EXT}'))
    if not backup_files:
        print("⚠️ No backup file found")
        return False
    latest_backup = backup_files[-1]
    # Download into a temporary directory
    with tempfile.TemporaryDirectory() as temp_dir:
        filepath = api.hf_hub_download(
            repo_id=repo_id,
            filename=latest_backup,
            repo_type='dataset',
            local_dir=temp_dir,
            revision=branch
        )
        # Extract straight into BACKUP_DIR
        with tarfile.open(filepath, 'r:gz') as tar:
            tar.extractall("${BACKUP_DIR}")
    print("✅ Restored the latest backup into ${BACKUP_DIR}")
    return True

api = HfApi(token=os.getenv("HF_TOKEN"))
restored = download_and_extract(api, os.getenv("DATASET_ID"), "${HF_BRANCH}")
print("RESTORED=1" if restored else "RESTORED=0")
EOF
}
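# To see which backup archives exist without restoring anything, a one-liner
# along these lines should work (a sketch using the same list_repo_files call):
#
#   python3 -c 'import os; from huggingface_hub import HfApi; \
#       print([f for f in HfApi(token=os.getenv("HF_TOKEN")).list_repo_files( \
#           os.getenv("DATASET_ID"), repo_type="dataset") if f.startswith("halo_backup_")])'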
# Back up data to Hugging Face
backup_data() {
    local timestamp=$(date +%Y%m%d_%H%M%S)
    local backup_file="${BACKUP_PREFIX}${timestamp}${BACKUP_EXT}"
    # Create a temporary directory via Python
    local temp_dir=$(python3 -c "import tempfile; print(tempfile.mkdtemp())")
    local backup_path="${temp_dir}/${backup_file}"
    print_message "Starting backup: ${backup_file} (temp dir: ${temp_dir})"
    # Create the backup archive
    tar -czf "$backup_path" -C "$BACKUP_DIR" .
    # Upload via Python and prune old backups
    python3 <<EOF
import os
from huggingface_hub import HfApi

try:
    api = HfApi(token=os.getenv("HF_TOKEN"))
    repo_id = os.getenv("DATASET_ID")
    backup_file = "${backup_path}"
    dataset_n = ${DATASET_N}
    hf_branch = "${HF_BRANCH}"
    # Upload the new backup to the target branch
    api.upload_file(
        path_or_fileobj=backup_file,
        path_in_repo=os.path.basename(backup_file),
        repo_id=repo_id,
        repo_type="dataset",
        revision=hf_branch
    )
    print(f"✅ Backup uploaded (branch: {hf_branch}): {os.path.basename(backup_file)}")
    # List all backup files
    files = api.list_repo_files(repo_id, repo_type="dataset", revision=hf_branch)
    backup_files = sorted(f for f in files if f.startswith("${BACKUP_PREFIX}") and f.endswith("${BACKUP_EXT}"))
    # Delete old backups, keeping only the newest dataset_n
    if len(backup_files) > dataset_n:
        to_delete = backup_files[:-dataset_n]
        for old_file in to_delete:
            api.delete_file(path_in_repo=old_file, repo_id=repo_id, repo_type="dataset", revision=hf_branch)
            print(f"🗑️ Deleted expired backup (branch: {hf_branch}): {old_file}")
except Exception as e:
    print(f"⚠️ Backup failed: {e}")
EOF
    # Clean up the local temporary archive and directory
    rm -f "$backup_path"
    rm -rf "$temp_dir"
    print_message "✅ Backup finished: ${backup_file}"
    # Squash the commit history with super_squash_history
    print_message "Squashing commit history..."
    python3 <<EOF
import os
from huggingface_hub import HfApi

try:
    api = HfApi(token=os.getenv("HF_TOKEN"))
    repo_id = os.getenv("DATASET_ID")
    hf_branch = "${HF_BRANCH}"
    api.super_squash_history(repo_id=repo_id, repo_type="dataset", branch=hf_branch)
    print(f"✅ Commit history squashed (branch: {hf_branch})")
except Exception as e:
    print(f"⚠️ Commit history squash failed: {e}")
EOF
    print_message "Commit history squash finished"
}
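# To pull one specific archive by hand (e.g., for local inspection), a sketch;
# the filename below is illustrative:
#
#   python3 -c 'import os; from huggingface_hub import hf_hub_download; \
#       print(hf_hub_download(repo_id=os.getenv("DATASET_ID"), repo_type="dataset", \
#           filename="halo_backup_20240101_000000.tar.gz", token=os.getenv("HF_TOKEN")))'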
# Run the periodic sync loop
sync_data() {
    while true; do
        backup_data
        print_message "⏳ Next sync in ${SYNC_INTERVAL} seconds..."
        sleep "$SYNC_INTERVAL"
    done
}
# Main entry point
(
    print_message "🚀 Starting up; fetching the latest backup from Hugging Face..."
    create_dataset
    create_branch
    download_data
    sync_data &
    exec java ${JVM_OPTS} -jar /opt/halo/halo.jar
) 2>&1