""" PyPilot Data Preprocessor - Handles massive code datasets """ import json import pickle import multiprocessing as mp from pathlib import Path from datasets import load_dataset import tokenizers from tokenizers import Tokenizer from tokenizers.models import BPE from tokenizers.trainers import BpeTrainer from tokenizers.pre_tokenizers import Whitespace class PyPilotDataPreprocessor: def __init__(self): self.supported_languages = ['python', 'javascript', 'java', 'cpp', 'go', 'rust'] self.processed_data = {} def load_github_dataset(self, language='python', split='train'): """Load massive code dataset from Hugging Face""" print(f"📥 Loading {language} code dataset...") try: dataset = load_dataset("codeparrot/github-code", split=split, languages=[language]) print(f"✅ Loaded {len(dataset)} {language} files") return dataset except Exception as e: print(f"❌ Error loading dataset: {e}") return None def build_tokenizer(self, dataset, vocab_size=50000): """Build custom tokenizer for code""" print("🔤 Building custom code tokenizer...") tokenizer = Tokenizer(BPE()) tokenizer.pre_tokenizer = Whitespace() trainer = BpeTrainer( vocab_size=vocab_size, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "[EOL]"] ) # Train tokenizer on code samples def batch_iterator(batch_size=1000): for i in range(0, len(dataset), batch_size): yield dataset[i:i+batch_size]['code'] tokenizer.train_from_iterator(batch_iterator(), trainer=trainer) tokenizer.save("./pypilot_tokenizer.json") print("✅ Tokenizer built and saved!") return tokenizer def parallel_process_files(self, file_paths, num_processes=8): """Process files in parallel for maximum speed""" print(f"⚡ Processing {len(file_paths)} files with {num_processes} processes...") def process_file(file_path): try: with open(file_path, 'r', encoding='utf-8') as f: content = f.read() return { 'file_path': str(file_path), 'content': content, 'length': len(content), 'language': self.detect_language(file_path) } except Exception as e: return {'error': str(e), 'file_path': str(file_path)} with mp.Pool(num_processes) as pool: results = pool.map(process_file, file_paths) successful = [r for r in results if 'error' not in r] print(f"✅ Processed {len(successful)} files successfully") return successful def detect_language(self, file_path): """Detect programming language from file extension""" extensions = { '.py': 'python', '.js': 'javascript', '.java': 'java', '.cpp': 'cpp', '.cc': 'cpp', '.go': 'go', '.rs': 'rust', '.ts': 'typescript' } return extensions.get(Path(file_path).suffix, 'unknown') def create_training_pairs(self, code_samples, context_size=512): """Create (input, target) pairs for training""" print("🔄 Creating training pairs...") training_pairs = [] for sample in code_samples: code = sample.get('content', '') if len(code) > context_size: # Split code into chunks and create prediction tasks for i in range(0, len(code) - context_size, context_size // 2): input_chunk = code[i:i + context_size] target_chunk = code[i + 1:i + context_size + 1] training_pairs.append({ 'input': input_chunk, 'target': target_chunk, 'language': sample.get('language', 'unknown') }) print(f"✅ Created {len(training_pairs)} training pairs") return training_pairs if __name__ == "__main__": preprocessor = PyPilotDataPreprocessor() # Example usage dataset = preprocessor.load_github_dataset('python') if dataset: tokenizer = preprocessor.build_tokenizer(dataset) training_data = preprocessor.create_training_pairs(dataset) # Save processed data with open('processed_training_data.pkl', 'wb') as f: 
if __name__ == "__main__":
    preprocessor = PyPilotDataPreprocessor()

    # Example usage
    dataset = preprocessor.load_github_dataset('python')
    if dataset:
        tokenizer = preprocessor.build_tokenizer(dataset)
        training_data = preprocessor.create_training_pairs(dataset)

        # Save processed data
        with open('processed_training_data.pkl', 'wb') as f:
            pickle.dump(training_data, f)
        print("💾 Training data saved!")
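
        # Illustrative follow-up (a sketch, not in the original script): reload
        # the tokenizer saved by build_tokenizer() and encode the first pair.
        # Tokenizer.from_file is the standard `tokenizers` loader.
        if training_data:
            reloaded = Tokenizer.from_file("./pypilot_tokenizer.json")
            encoded = encode_pairs(reloaded, training_data[:1])
            print(f"🔎 First pair encodes to {len(encoded[0]['input_ids'])} input tokens")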