#!/bin/bash
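# Entrypoint for a Halo container that persists its data directory to a
# Hugging Face dataset: restore the newest halo_backup_*.tar.gz on startup,
# start Halo, and re-archive $HOME/.halo2 to the dataset every SYNC_INTERVAL
# seconds, pruning archives beyond the newest DATASET_N.
#
# Required: HF_TOKEN (a write token) and DATASET_ID (e.g. "user/halo-backup",
# an illustrative value). Optional: DATASET_N, SYNC_INTERVAL, JVM_OPTS.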

# Check the backup prerequisites
if [ -z "$HF_TOKEN" ] || [ -z "$DATASET_ID" ]; then
    echo "⚠️ HF_TOKEN or DATASET_ID is missing; backup is disabled"
    exec java ${JVM_OPTS} -jar /opt/halo/halo.jar
    exit 0
fi

# Default settings
DATASET_N=${DATASET_N:-10}            # keep the newest 10 backups by default
SYNC_INTERVAL=${SYNC_INTERVAL:-36000} # default sync interval: 36000 s (10 hours)
BACKUP_DIR="$HOME/.halo2"
BACKUP_PREFIX="halo_backup_"
BACKUP_EXT=".tar.gz"
HF_BRANCH="main"
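
# Defensive addition: make sure the data directory exists so the first
# tar/extract cannot fail before Halo has created it
mkdir -p "$BACKUP_DIR"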

# Print a timestamped message to the console
print_message() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1"
}

# Create the Hugging Face dataset if it does not already exist
create_dataset() {
    print_message "Checking whether Hugging Face dataset '${DATASET_ID}' exists..."
    python3 <<EOF
import os
from huggingface_hub import HfApi
try:
    api = HfApi(token=os.getenv("HF_TOKEN"))
    repo_id = os.getenv("DATASET_ID")
    # repo_exists checks the repo directly instead of enumerating every
    # dataset owned by the author (the namespace half of repo_id)
    if not api.repo_exists(repo_id, repo_type="dataset"):
        api.create_repo(repo_id=repo_id, repo_type="dataset", private=True)
        print(f"✅ Dataset '{repo_id}' did not exist; created it (private).")
    else:
        print(f"✅ Dataset '{repo_id}' already exists.")
except Exception as e:
    print(f"⚠️ Dataset check/creation failed: {e}")
EOF
}

# Create the backup branch on the dataset if it does not exist
create_branch() {
    print_message "Checking branch '${HF_BRANCH}' of the Hugging Face dataset..."
    python3 <<EOF
import os
from huggingface_hub import HfApi
try:
    api = HfApi(token=os.getenv("HF_TOKEN"))
    repo_id = os.getenv("DATASET_ID")
    hf_branch = "${HF_BRANCH}"
    # List every branch of the dataset repo
    branches = api.list_repo_refs(repo_id, repo_type="dataset").branches
    branch_names = [b.name for b in branches]
    if hf_branch not in branch_names:
        api.create_branch(repo_id=repo_id, branch=hf_branch, repo_type="dataset")
        print(f"✅ Branch '{hf_branch}' did not exist; created it.")
    else:
        print(f"✅ Branch '{hf_branch}' already exists.")
except Exception as e:
    print(f"⚠️ Branch check/creation failed: {e}")
EOF
}

# Restore the latest backup from the dataset
download_data() {
    print_message "Downloading the latest backup..."
    python3 <<EOF
import os
import tarfile
import tempfile
from huggingface_hub import HfApi

def download_and_extract(api, repo_id, branch):
    """Download the newest backup archive and unpack it."""
    files = api.list_repo_files(repo_id=repo_id, repo_type='dataset', revision=branch)
    # Filenames embed a YYYYmmdd_HHMMSS timestamp, so a lexicographic sort
    # puts the newest backup last
    backup_files = sorted(f for f in files if f.startswith('${BACKUP_PREFIX}') and f.endswith('${BACKUP_EXT}'))

    if not backup_files:
        print("⚠️ No backup files found")
        return False

    latest_backup = backup_files[-1]

    # Download into a throwaway directory that is removed afterwards
    with tempfile.TemporaryDirectory() as temp_dir:
        filepath = api.hf_hub_download(
            repo_id=repo_id,
            filename=latest_backup,
            repo_type='dataset',
            local_dir=temp_dir,
            revision=branch
        )

        # Extract straight into BACKUP_DIR
        with tarfile.open(filepath, 'r:gz') as tar:
            tar.extractall("${BACKUP_DIR}")

        print("✅ Restored the latest backup into ${BACKUP_DIR}")
        return True

try:
    api = HfApi(token=os.getenv("HF_TOKEN"))
    restored = download_and_extract(api, os.getenv("DATASET_ID"), "${HF_BRANCH}")
    print("RESTORED=1" if restored else "RESTORED=0")
except Exception as e:
    print(f"⚠️ Restore failed: {e}")
EOF
}

# Back up data to Hugging Face
backup_data() {
    # Timestamped name such as halo_backup_20240101_120000.tar.gz
    local timestamp=$(date +%Y%m%d_%H%M%S)
    local backup_file="${BACKUP_PREFIX}${timestamp}${BACKUP_EXT}"

    # Create a temporary directory via Python
    local temp_dir=$(python3 -c "import tempfile; print(tempfile.mkdtemp())")
    local backup_path="${temp_dir}/${backup_file}"

    print_message "Starting backup: ${backup_file} (temp dir: ${temp_dir})"

    # Archive the contents of BACKUP_DIR (paths stored relative to it)
    tar -czf "$backup_path" -C "$BACKUP_DIR" .

    # Upload with Python, then prune old backups
    python3 <<EOF
import os
from huggingface_hub import HfApi
try:
    api = HfApi(token=os.getenv("HF_TOKEN"))
    repo_id = os.getenv("DATASET_ID")
    backup_file = "${backup_path}"
    dataset_n = ${DATASET_N}
    hf_branch = "${HF_BRANCH}"
    # Upload the new backup to the target branch
    api.upload_file(
        path_or_fileobj=backup_file,
        path_in_repo=os.path.basename(backup_file),
        repo_id=repo_id,
        repo_type="dataset",
        revision=hf_branch
    )
    print(f"✅ Backup uploaded (branch: {hf_branch}): {os.path.basename(backup_file)}")
    # List every backup archive in the repo
    files = api.list_repo_files(repo_id, repo_type="dataset", revision=hf_branch)
    backup_files = sorted(f for f in files if f.startswith("${BACKUP_PREFIX}") and f.endswith("${BACKUP_EXT}"))
    # Delete old backups, keeping only the newest dataset_n
    if len(backup_files) > dataset_n:
        to_delete = backup_files[:-dataset_n]
        for old_file in to_delete:
            api.delete_file(path_in_repo=old_file, repo_id=repo_id, repo_type="dataset", revision=hf_branch)
            print(f"🗑️ Deleted expired backup (branch: {hf_branch}): {old_file}")
except Exception as e:
    print(f"⚠️ Backup failed: {e}")
EOF

    # Remove the local temporary archive and directory
    rm -f "$backup_path"
    rm -rf "$temp_dir"
    print_message "✅ Backup finished: ${backup_file}"

    # Squash the commit history with super_squash_history so the dataset's
    # history does not grow with every upload/delete
    print_message "Squashing commit history..."
    python3 <<EOF
import os
from huggingface_hub import HfApi
try:
    api = HfApi(token=os.getenv("HF_TOKEN"))
    repo_id = os.getenv("DATASET_ID")
    hf_branch = "${HF_BRANCH}"
    api.super_squash_history(repo_id=repo_id, repo_type="dataset", branch=hf_branch)
    print(f"✅ Commit history squashed (branch: {hf_branch})")
except Exception as e:
    print(f"⚠️ Commit history squash failed: {e}")
EOF
    print_message "commit 历史压缩完成"
}

# Periodic sync loop
sync_data() {
    while true; do
        backup_data
        print_message "⏳ 下次同步将在 ${SYNC_INTERVAL} 秒后进行..."
        sleep "$SYNC_INTERVAL"
    done
}

# Main flow
(
    print_message "🚀 系统启动,准备从 Hugging Face 下载最新备份..."
    create_dataset
    create_branch
    download_data
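    # Run the backup loop in the background while Halo runs in the foreground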
    sync_data &
    exec java ${JVM_OPTS} -jar /opt/halo/halo.jar
) 2>&1