# Source: qwen-trainer-scripts / prepare_data.py (uploaded by mindchain using huggingface_hub, revision 78a0ca9, verified)
import os
import pandas as pd
from typing import List, Optional, Dict, Any
from dataclasses import dataclass
import data_designer.config as dd
from data_designer.interface import DataDesigner
@dataclass
class SyntheticDataConfig:
    """Settings for one synthetic-data generation run.

    Attributes:
        name: Label for the dataset.
        num_records: Number of records requested from the generator.
        topics: Topic values to sample from; ``None`` (the default) or an
            empty list makes the generator fall back to its built-in topics.
        prompt_template: Prompt used for the instruction column. It contains
            a ``{{ topic }}`` placeholder that refers to the sampled topic
            column.
        model_alias: Alias of the model the LLM columns should use.
        output_path: Path of the JSONL file the generated records are
            written to.
    """

    name: str = "synthetic_dataset"
    num_records: int = 10
    # Optional because the default really is None (no topics supplied).
    topics: Optional[List[str]] = None
    prompt_template: str = "Create a high-quality instruction and response pair for the topic: {{ topic }}."
    model_alias: str = "perplexity-text"
    output_path: str = "synthetic_data.jsonl"
class DataPreparer:
    """Generate synthetic instruction/response pairs and format them for Qwen.

    Holds a ``DataDesigner`` instance in ``self.designer``. When none is
    supplied, one is built with a single OpenAI-compatible provider that
    points at the Perplexity API endpoint.
    """

    # Topics sampled when the config provides none (or an empty list).
    _DEFAULT_TOPICS = ["Python Programming", "Data Science", "Machine Learning"]

    def __init__(self, designer: "Optional[DataDesigner]" = None):
        """Store ``designer``, or create a Perplexity-backed one if it is None.

        Args:
            designer: Pre-configured designer to reuse. Only ``None`` triggers
                creation of the default; any object passed in is kept as-is.
        """
        if designer is None:
            # Configure Perplexity provider (OpenAI-compatible)
            perplexity_provider = dd.ModelProvider(
                name="perplexity",
                provider_type="openai",
                # NOTE(review): presumably the name of the environment
                # variable holding the key rather than the key itself -
                # confirm against the ModelProvider documentation.
                api_key="PERPLEXITY_API_KEY",
                endpoint="https://api.perplexity.ai",
            )
            designer = DataDesigner(model_providers=[perplexity_provider])
        self.designer = designer

    def generate_synthetic_data(self, config: "SyntheticDataConfig") -> pd.DataFrame:
        """Generate records according to ``config`` and save them as JSONL.

        Builds three columns: a sampled ``topic``, an LLM-written
        ``instruction`` and an LLM-written ``output`` that answers the
        instruction. The resulting dataframe is written to
        ``config.output_path`` (one JSON record per line) and returned.

        Args:
            config: Generation settings (record count, topics, prompt
                template, model alias, output path).

        Returns:
            The dataset loaded from the designer's result.
        """
        print(f"Generating {config.num_records} synthetic records for topics: {config.topics}")

        # Register the model under the alias the LLM columns below reference;
        # a hard-coded alias would leave any non-default ``config.model_alias``
        # pointing at a model that was never configured.
        perplexity_model = dd.ModelConfig(
            alias=config.model_alias,
            model="sonar",
            provider="perplexity",
            inference_parameters=dd.ChatCompletionInferenceParams(max_parallel_requests=1),
        )
        builder = dd.DataDesignerConfigBuilder(model_configs=[perplexity_model])

        # Add topic sampler, falling back to the default topics if none provided
        topics = config.topics or list(self._DEFAULT_TOPICS)
        builder.add_column(
            dd.SamplerColumnConfig(
                name="topic",
                sampler_type=dd.SamplerType.CATEGORY,
                params=dd.CategorySamplerParams(values=topics),
            )
        )

        # Add LLM text columns for the instruction/response pair
        builder.add_column(
            dd.LLMTextColumnConfig(
                name="instruction",
                model_alias=config.model_alias,
                prompt=f"{config.prompt_template}\n\nReturn only the instruction part.",
            )
        )
        builder.add_column(
            dd.LLMTextColumnConfig(
                name="output",
                model_alias=config.model_alias,
                prompt="Based on the instruction: {{ instruction }}, provide a detailed and accurate response.",
            )
        )

        # Run generation
        result = self.designer.create(config_builder=builder, num_records=config.num_records)
        df = result.load_dataset()

        # Save to JSONL
        df.to_json(config.output_path, orient="records", lines=True)
        print(f"Synthetic data saved to {config.output_path}")
        return df

    def format_for_qwen(self, df: pd.DataFrame) -> List[Dict[str, str]]:
        """Formats the dataframe into ChatML for Qwen training.

        Args:
            df: Dataframe with ``instruction`` and ``output`` columns.

        Returns:
            One ``{"text": ...}`` dict per row, where the text is a user turn
            holding the instruction followed by an assistant turn holding the
            output, each wrapped in ``<|im_start|>`` / ``<|im_end|>``.

        Raises:
            KeyError: If either required column is missing.
        """
        # Walk the two columns directly: iterrows() builds a Series per row
        # and coerces the row's values to a common dtype.
        return [
            {
                "text": f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{output}<|im_end|>"
            }
            for instruction, output in zip(df["instruction"], df["output"])
        ]
if __name__ == "__main__":
    # Demo run: generate a small two-topic dataset, then convert it to ChatML.
    demo_config = SyntheticDataConfig(
        output_path="test_synthetic.jsonl",
        topics=["Quantum Computing", "Space Exploration"],
        num_records=10,
    )
    data_preparer = DataPreparer()
    generated_frame = data_preparer.generate_synthetic_data(demo_config)
    chatml_records = data_preparer.format_for_qwen(generated_frame)
    record_count = len(chatml_records)
    print(f"Formatted {record_count} records for Qwen.")