"""Generate synthetic instruction/response pairs and format them as ChatML."""

import os
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

import data_designer.config as dd
import pandas as pd
from data_designer.interface import DataDesigner

# Topics sampled when the caller supplies none (or an empty list).
_DEFAULT_TOPICS = ("Python Programming", "Data Science", "Machine Learning")

# Columns that ``generate_synthetic_data`` produces and ``format_for_qwen`` reads.
_REQUIRED_COLUMNS = ("instruction", "output")


@dataclass
class SyntheticDataConfig:
    """Settings for one synthetic-data generation run.

    Attributes:
        name: Label for the dataset. Not read by the code in this module.
        num_records: Number of records requested from the designer.
        topics: Values for the ``topic`` category sampler. ``None`` or an
            empty list selects the built-in default topics.
        prompt_template: Prompt used for the ``instruction`` column. The
            ``{{ topic }}`` placeholder refers to the sampled ``topic``
            column (presumably rendered by Data Designer's templating --
            confirm against its documentation).
        model_alias: Alias under which the model is registered and by which
            both LLM columns reference it.
        output_path: Destination of the JSON Lines file that is written.
    """

    name: str = "synthetic_dataset"
    num_records: int = 10
    topics: Optional[List[str]] = None
    prompt_template: str = "Create a high-quality instruction and response pair for the topic: {{ topic }}."
    model_alias: str = "perplexity-text"
    output_path: str = "synthetic_data.jsonl"


class DataPreparer:
    """Builds synthetic datasets with Data Designer and formats them for training."""

    def __init__(self, designer: Optional[DataDesigner] = None):
        """Store ``designer``, creating a Perplexity-backed one when omitted.

        Args:
            designer: A ready-to-use ``DataDesigner``. When ``None``, a new
                one is created with a single provider named ``"perplexity"``.
        """
        if designer is None:
            # Configure Perplexity provider (OpenAI-compatible)
            # NOTE(review): "PERPLEXITY_API_KEY" is presumably the name of the
            # environment variable holding the key, not the key itself --
            # confirm against the Data Designer ModelProvider documentation.
            perplexity_provider = dd.ModelProvider(
                name="perplexity",
                provider_type="openai",
                api_key="PERPLEXITY_API_KEY",
                endpoint="https://api.perplexity.ai",
            )
            designer = DataDesigner(model_providers=[perplexity_provider])
        self.designer = designer

    def generate_synthetic_data(self, config: SyntheticDataConfig) -> pd.DataFrame:
        """Generate instruction/response records and save them as JSON Lines.

        Builds three columns: a sampled ``topic``, an LLM-generated
        ``instruction`` and an LLM-generated ``output`` that answers the
        instruction. The resulting dataset is written to
        ``config.output_path`` and two progress messages are printed.

        Args:
            config: Generation settings.

        Returns:
            The dataset loaded from the designer's result.
        """
        # Fall back to the defaults for both ``None`` and an empty list, and
        # report the topics that are actually sampled.
        topics = config.topics or list(_DEFAULT_TOPICS)
        print(f"Generating {config.num_records} synthetic records for topics: {topics}")

        # Register the model under the same alias the LLM columns reference;
        # a hard-coded alias would leave a custom ``config.model_alias``
        # pointing at a model that was never registered.
        # NOTE(review): provider="perplexity" must exist on ``self.designer``;
        # a caller-supplied designer needs a provider with that name.
        perplexity_model = dd.ModelConfig(
            alias=config.model_alias,
            model="sonar",
            provider="perplexity",
            inference_parameters=dd.ChatCompletionInferenceParams(max_parallel_requests=1),
        )

        builder = dd.DataDesignerConfigBuilder(model_configs=[perplexity_model])

        # Add topic sampler
        builder.add_column(
            dd.SamplerColumnConfig(
                name="topic",
                sampler_type=dd.SamplerType.CATEGORY,
                params=dd.CategorySamplerParams(values=topics),
            )
        )

        # Add LLM text columns for Instruction/Response pairs
        builder.add_column(
            dd.LLMTextColumnConfig(
                name="instruction",
                model_alias=config.model_alias,
                prompt=f"{config.prompt_template}\n\nReturn only the instruction part.",
            )
        )
        builder.add_column(
            dd.LLMTextColumnConfig(
                name="output",
                model_alias=config.model_alias,
                prompt="Based on the instruction: {{ instruction }}, provide a detailed and accurate response.",
            )
        )

        # Run generation
        result = self.designer.create(config_builder=builder, num_records=config.num_records)
        df = result.load_dataset()

        # Save to JSONL
        df.to_json(config.output_path, orient="records", lines=True)
        print(f"Synthetic data saved to {config.output_path}")
        return df

    def format_for_qwen(self, df: pd.DataFrame) -> List[Dict[str, str]]:
        """Format the dataframe into ChatML for Qwen training.

        Args:
            df: Frame with ``instruction`` and ``output`` columns.

        Returns:
            One ``{"text": ...}`` dict per row, holding a user turn with the
            instruction followed by an assistant turn with the output. An
            empty frame yields an empty list.

        Raises:
            KeyError: If a non-empty frame lacks a required column.
        """
        if df.empty:
            return []

        missing = [column for column in _REQUIRED_COLUMNS if column not in df.columns]
        if missing:
            raise KeyError(f"DataFrame is missing required column(s): {missing}")

        # Column-wise iteration avoids building a Series per row.
        return [
            {
                "text": f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{output}<|im_end|>"
            }
            for instruction, output in zip(df["instruction"], df["output"])
        ]


def main() -> None:
    """Run a small example generation and report how many records were formatted."""
    # Example usage
    config = SyntheticDataConfig(
        num_records=10,
        topics=["Quantum Computing", "Space Exploration"],
        output_path="test_synthetic.jsonl",
    )
    preparer = DataPreparer()
    df = preparer.generate_synthetic_data(config)
    formatted = preparer.format_for_qwen(df)
    print(f"Formatted {len(formatted)} records for Qwen.")


if __name__ == "__main__":
    main()