| import os |
| import pandas as pd |
| from typing import List, Optional, Dict, Any |
| from dataclasses import dataclass |
| import data_designer.config as dd |
| from data_designer.interface import DataDesigner |
|
|
@dataclass
class SyntheticDataConfig:
    """Settings for one synthetic instruction/response generation run.

    Attributes:
        name: Label for the dataset.
        num_records: Number of records to generate.
        topics: Topic values to sample from. ``None`` (the default) or an
            empty list means "no topics supplied"; the generator then uses
            its own built-in topic list.
        prompt_template: Prompt used for the instruction column. The
            ``{{ topic }}`` placeholder is left for the generation library's
            templating to fill from the sampled ``topic`` column.
        model_alias: Alias of the model that the LLM columns refer to.
        output_path: Path of the JSON Lines file the generated data is
            written to.
    """

    name: str = "synthetic_dataset"
    num_records: int = 10
    # Optional because the default really is None; a mutable list default
    # would be rejected by @dataclass anyway.
    topics: Optional[List[str]] = None
    prompt_template: str = "Create a high-quality instruction and response pair for the topic: {{ topic }}."
    model_alias: str = "perplexity-text"
    output_path: str = "synthetic_data.jsonl"
|
|
class DataPreparer:
    """Generate synthetic instruction/response pairs and format them for Qwen.

    Holds a ``DataDesigner`` instance (either supplied by the caller or built
    with a Perplexity provider) and uses it to produce a DataFrame with
    ``topic``, ``instruction`` and ``output`` columns.
    """

    # Topics sampled when the config does not supply any.
    _DEFAULT_TOPICS = ("Python Programming", "Data Science", "Machine Learning")

    def __init__(self, designer: Optional["DataDesigner"] = None):
        """Store ``designer``, building a Perplexity-backed one if omitted.

        Args:
            designer: An existing ``DataDesigner`` to reuse. When ``None``, a
                new one is created with a single provider named
                ``"perplexity"``.
        """
        # Identity check: only a missing designer triggers the default; an
        # explicitly supplied object is always used as given.
        if designer is None:
            provider = dd.ModelProvider(
                name="perplexity",
                provider_type="openai",
                # NOTE(review): presumably the *name* of the environment
                # variable holding the key, not the key itself — confirm
                # against the data_designer ModelProvider documentation.
                api_key="PERPLEXITY_API_KEY",
                endpoint="https://api.perplexity.ai",
            )
            designer = DataDesigner(model_providers=[provider])
        self.designer = designer

    def generate_synthetic_data(self, config: "SyntheticDataConfig") -> pd.DataFrame:
        """Generate ``config.num_records`` records and save them as JSON Lines.

        Builds three columns: a category-sampled ``topic``, an LLM-generated
        ``instruction`` (from ``config.prompt_template``), and an
        LLM-generated ``output`` that answers the instruction.

        Args:
            config: Run settings (record count, topics, prompt, model alias,
                output path).

        Returns:
            The dataset loaded from the designer's result. The same data is
            also written to ``config.output_path``.
        """
        print(f"Generating {config.num_records} synthetic records for topics: {config.topics}")

        # Register the model under the alias the LLM columns below refer to,
        # so a non-default ``config.model_alias`` still resolves to a model.
        model = dd.ModelConfig(
            alias=config.model_alias,
            model="sonar",
            provider="perplexity",
            inference_parameters=dd.ChatCompletionInferenceParams(max_parallel_requests=1),
        )
        builder = dd.DataDesignerConfigBuilder(model_configs=[model])

        # None or an empty list both fall back to the built-in topics.
        topics = config.topics or list(self._DEFAULT_TOPICS)
        builder.add_column(
            dd.SamplerColumnConfig(
                name="topic",
                sampler_type=dd.SamplerType.CATEGORY,
                params=dd.CategorySamplerParams(values=topics),
            )
        )

        builder.add_column(
            dd.LLMTextColumnConfig(
                name="instruction",
                model_alias=config.model_alias,
                prompt=f"{config.prompt_template}\n\nReturn only the instruction part.",
            )
        )

        # The response prompt references the generated instruction column.
        builder.add_column(
            dd.LLMTextColumnConfig(
                name="output",
                model_alias=config.model_alias,
                prompt="Based on the instruction: {{ instruction }}, provide a detailed and accurate response.",
            )
        )

        result = self.designer.create(config_builder=builder, num_records=config.num_records)
        df = result.load_dataset()

        # One JSON object per line.
        df.to_json(config.output_path, orient="records", lines=True)
        print(f"Synthetic data saved to {config.output_path}")

        return df

    def format_for_qwen(self, df: pd.DataFrame) -> List[Dict[str, str]]:
        """Format the dataframe into ChatML records for Qwen training.

        Args:
            df: DataFrame with ``instruction`` and ``output`` columns.

        Returns:
            One ``{"text": ...}`` dict per row, where the text is a user turn
            holding the instruction followed by an assistant turn holding the
            output. An empty DataFrame yields an empty list.
        """
        return [
            {
                "text": f"<|im_start|>user\n{row['instruction']}<|im_end|>\n<|im_start|>assistant\n{row['output']}<|im_end|>"
            }
            for _, row in df.iterrows()
        ]
|
|
if __name__ == "__main__":
    # Demo run: generate a small dataset for two topics, then convert the
    # result into ChatML records and report how many were produced.
    demo_config = SyntheticDataConfig(
        topics=["Quantum Computing", "Space Exploration"],
        num_records=10,
        output_path="test_synthetic.jsonl",
    )
    data_preparer = DataPreparer()
    chatml_records = data_preparer.format_for_qwen(
        data_preparer.generate_synthetic_data(demo_config)
    )
    print(f"Formatted {len(chatml_records)} records for Qwen.")
|
|