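"""Chunked dataset preprocessing pipeline.

Downloads CSV chunks from a source Hugging Face dataset repo, optionally
tokenizes them for fine-tuning and/or converts them to parquet for RAG, and
uploads the results to target dataset repos in buffered batches. A minimal
FastAPI health endpoint runs alongside so the hosting platform can probe it.
"""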
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoConfig
from datetime import datetime
from huggingface_hub import HfApi, create_repo, upload_folder, hf_hub_download
import traceback
import threading
import uvicorn
import time
from fastapi import FastAPI
from fastapi.responses import JSONResponse

# === Constants ===
MODEL_NAME = "TURKCELL/Turkcell-LLM-7b-v1"
HF_TOKEN = os.getenv("HF_TOKEN")
SOURCE_DATASET_ID = "UcsTurkey/turkish-train-chunks"
TRAIN_TARGET_DATASET_ID = "UcsTurkey/turkish-train-tokenized"
RAG_TARGET_DATASET_ID = "UcsTurkey/turkish-train-rag"
BUFFER_SIZE = 5            # parquet files to buffer locally before each upload
START_CHUNK_NUMBER = 0     # index of the first source CSV chunk to process
PROCESS_CHUNK_COUNT = 776  # how many source CSV chunks to process in this run

GENERATE_TRAIN_DATA = False  # write tokenized parquet files for fine-tuning
GENERATE_RAG_DATA = True     # write raw instruction/output parquet files for RAG

CHUNK_FOLDER = "/data/chunks"
TRAIN_FOLDER = "/data/tokenized_chunks"
RAG_FOLDER = "/data/rag_chunks"
CACHE_DIR = "/data/.hf_cache"

os.makedirs(CHUNK_FOLDER, exist_ok=True)
os.makedirs(TRAIN_FOLDER, exist_ok=True)
os.makedirs(RAG_FOLDER, exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)

# ✅ Health check server
app = FastAPI()

@app.get("/")
def health():
    return JSONResponse(content={"status": "ok"})

def run_health_server():
    uvicorn.run(app, host="0.0.0.0", port=7860)

threading.Thread(target=run_health_server, daemon=True).start()
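# The daemon thread keeps "/" responding while the pipeline below runs; port
# 7860 is the port Hugging Face Spaces probes by default, so the platform
# continues to see the job as healthy during long processing runs.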

# 🕒 Timestamped log helper
def log(message):
    timestamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{timestamp}] {message}", flush=True)

# === Tokenizer ===
os.environ["HF_HOME"] = CACHE_DIR
log(f"🔁 Tokenizer yükleniyor: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False, cache_dir=CACHE_DIR)
if tokenizer.pad_token is None:
    log("ℹ️ pad_token tanımlı değil, eos_token atanıyor.")
    tokenizer.pad_token = tokenizer.eos_token

config = AutoConfig.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
MAX_LEN = getattr(config, "max_position_embeddings", 2048)
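# Note: Mistral-derived configs can report max_position_embeddings as high as
# 32768; padding every example to that length is memory-hungry, so a smaller
# explicit cap may be worth considering if the source texts are short.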

# === Hugging Face API ===
api = HfApi()
files = api.list_repo_files(repo_id=SOURCE_DATASET_ID, repo_type="dataset", token=HF_TOKEN)
csv_files = sorted([f for f in files if f.endswith(".csv")])
selected_files = csv_files[START_CHUNK_NUMBER:START_CHUNK_NUMBER + PROCESS_CHUNK_COUNT]
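# Slicing with START_CHUNK_NUMBER/PROCESS_CHUNK_COUNT is safe even past the
# end of the list; Python slices simply return fewer items.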

buffer_counter_train = 0
buffer_counter_rag = 0

def tokenize(example):
    # Alpaca-style "### Instruction / ### Response" prompt (note: this is not
    # the Mistral [INST] chat template)
    prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"
    tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=MAX_LEN)
    tokenized["labels"] = [
        -100 if token_id == tokenizer.pad_token_id else token_id for token_id in tokenized["input_ids"]
    ]
    return tokenized
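
# Worked example of the masking above (hypothetical token ids): for
# input_ids [5, 9, 2, 0, 0] with pad_token_id == 0, labels become
# [5, 9, 2, -100, -100], so the loss ignores padding. Because pad_token was
# aliased to eos_token above, genuine eos tokens are masked as well; a common
# variant also masks the prompt span so only the response is learned.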

def upload_if_ready(folder_path, target_repo):
    """Upload any buffered parquet files to target_repo, then empty the folder.

    Always returns 0 so callers can reset their buffer counter in one assignment.
    """
    if os.listdir(folder_path):
        log(f"⬆️ Buffer full. Uploading to Hugging Face: {target_repo}")
        create_repo(target_repo, repo_type="dataset", token=HF_TOKEN, exist_ok=True)
        upload_folder(repo_id=target_repo, folder_path=folder_path, repo_type="dataset", token=HF_TOKEN)
        log("🧹 Cleaning the folder after upload...")
        for f in os.listdir(folder_path):
            os.remove(os.path.join(folder_path, f))
    return 0
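
# Design note: buffering BUFFER_SIZE files per upload_folder call keeps the
# number of Hub commits low, at the cost of redoing up to BUFFER_SIZE chunks
# of work if the process dies between uploads.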

for idx, filename in enumerate(selected_files):
    log(f"\n📄 {idx+1}/{len(selected_files)}{filename} işleniyor...")
    try:
        # hf_hub_download returns the resolved local path; using it directly
        # avoids mismatches when the repo filename contains subdirectories.
        local_path = hf_hub_download(
            repo_id=SOURCE_DATASET_ID,
            filename=filename,
            local_dir=CHUNK_FOLDER,
            token=HF_TOKEN,
            repo_type="dataset"
        )
        df = pd.read_csv(local_path).dropna()
        df = df[df["question"].str.strip().astype(bool) & df["answer"].str.strip().astype(bool)]
        df = df.rename(columns={"question": "instruction", "answer": "output"})
        log(f"✅ Geçerli satır sayısı: {len(df)}")

        if GENERATE_RAG_DATA:
            rag_dataset = Dataset.from_pandas(df[["instruction", "output"]])
            rag_path = os.path.join(RAG_FOLDER, os.path.basename(filename).replace(".csv", ".parquet"))
            rag_dataset.to_parquet(rag_path, compression="brotli")
            log(f"📦 RAG parquet saved: {rag_path}")
            buffer_counter_rag += 1
            if buffer_counter_rag >= BUFFER_SIZE:
                buffer_counter_rag = upload_if_ready(RAG_FOLDER, RAG_TARGET_DATASET_ID)

        if GENERATE_TRAIN_DATA:
            train_dataset = Dataset.from_pandas(df[["instruction", "output"]])
            tokenized_dataset = train_dataset.map(tokenize)
            parquet_path = os.path.join(TRAIN_FOLDER, os.path.basename(filename).replace(".csv", ".parquet"))
            tokenized_dataset.to_parquet(parquet_path, compression="snappy")
            log(f"🎯 Tokenized parquet saved: {parquet_path}")
            buffer_counter_train += 1
            if buffer_counter_train >= BUFFER_SIZE:
                buffer_counter_train = upload_if_ready(TRAIN_FOLDER, TRAIN_TARGET_DATASET_ID)

    except Exception as e:
        log(f"❌ Hata oluştu: {filename}{e}")
        traceback.print_exc()
        continue

# Flush whatever is still buffered after the loop.
if GENERATE_TRAIN_DATA:
    buffer_counter_train = upload_if_ready(TRAIN_FOLDER, TRAIN_TARGET_DATASET_ID)
if GENERATE_RAG_DATA:
    buffer_counter_rag = upload_if_ready(RAG_FOLDER, RAG_TARGET_DATASET_ID)

log("✅ Tüm işlemler tamamlandı. Servis bekleme modunda...")
while True:
    time.sleep(60)