"""
PyPilot Data Preprocessor - Handles massive code datasets
"""
import pickle
import multiprocessing as mp
from pathlib import Path

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

class PyPilotDataPreprocessor:
    def __init__(self):
        self.supported_languages = ['python', 'javascript', 'java', 'cpp', 'go', 'rust']
        self.processed_data = {}
    def load_github_dataset(self, language='python', split='train'):
        """Load a large code dataset from the Hugging Face Hub."""
        print(f"πŸ“₯ Loading {language} code dataset...")
        # codeparrot/github-code expects capitalized language names
        # (e.g. "Python", "C++"); map the lowercase identifiers used in this
        # class. Adjust the map if the dataset card lists different names.
        name_map = {'python': 'Python', 'javascript': 'JavaScript',
                    'java': 'Java', 'cpp': 'C++', 'go': 'GO', 'rust': 'Rust'}
        try:
            dataset = load_dataset(
                "codeparrot/github-code",
                split=split,
                languages=[name_map.get(language, language)],
                trust_remote_code=True,  # newer `datasets` releases require this for script-based datasets
            )
            print(f"βœ… Loaded {len(dataset)} {language} files")
            return dataset
        except Exception as e:
            print(f"❌ Error loading dataset: {e}")
            return None
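
    # The full github-code dump is very large, so materializing a split may
    # not fit on disk. A streaming variant (a sketch, assuming the standard
    # `datasets` streaming API; `take` is available on IterableDataset):
    #
    #     stream = load_dataset("codeparrot/github-code", split="train",
    #                           streaming=True, languages=["Python"],
    #                           trust_remote_code=True)
    #     for sample in stream.take(1000):
    #         print(sample["code"][:80])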
    def build_tokenizer(self, dataset, vocab_size=50000):
        """Train a custom BPE tokenizer on the code samples."""
        print("πŸ”€ Building custom code tokenizer...")
        # Wire [UNK] into the BPE model so out-of-vocabulary pieces
        # don't break encoding
        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "[EOL]"]
        )

        # Feed the trainer batches of source strings from the "code" column
        def batch_iterator(batch_size=1000):
            for i in range(0, len(dataset), batch_size):
                yield dataset[i:i + batch_size]['code']

        tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
        tokenizer.save("./pypilot_tokenizer.json")
        print("βœ… Tokenizer built and saved!")
        return tokenizer
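
    # Quick sanity check for the saved tokenizer (a sketch;
    # Tokenizer.from_file is the standard `tokenizers` loader):
    #
    #     tok = Tokenizer.from_file("./pypilot_tokenizer.json")
    #     enc = tok.encode("def add(a, b):\n    return a + b")
    #     print(enc.tokens)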
    def _process_file(self, file_path):
        """Read one file and collect its metadata. Defined as a method
        (not a closure inside parallel_process_files) because
        multiprocessing cannot pickle nested functions."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            return {
                'file_path': str(file_path),
                'content': content,
                'length': len(content),
                'language': self.detect_language(file_path)
            }
        except Exception as e:
            return {'error': str(e), 'file_path': str(file_path)}

    def parallel_process_files(self, file_paths, num_processes=8):
        """Read files in parallel across worker processes."""
        print(f"⚑ Processing {len(file_paths)} files with {num_processes} processes...")
        with mp.Pool(num_processes) as pool:
            results = pool.map(self._process_file, file_paths)
        successful = [r for r in results if 'error' not in r]
        print(f"βœ… Processed {len(successful)} files successfully")
        return successful
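
    # Typical call pattern (a sketch; assumes local repositories checked out
    # under ./repos, which is not part of this file):
    #
    #     paths = list(Path("./repos").rglob("*.py"))
    #     samples = preprocessor.parallel_process_files(paths, num_processes=8)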
    def detect_language(self, file_path):
        """Detect programming language from file extension"""
        extensions = {
            '.py': 'python',
            '.js': 'javascript',
            '.java': 'java',
            '.cpp': 'cpp',
            '.cc': 'cpp',
            '.go': 'go',
            '.rs': 'rust',
            '.ts': 'typescript'
        }
        return extensions.get(Path(file_path).suffix, 'unknown')
    def create_training_pairs(self, code_samples, context_size=512):
        """Create (input, target) pairs for next-character prediction."""
        print("πŸ”„ Creating training pairs...")
        training_pairs = []
        for sample in code_samples:
            # Locally read files carry a 'content' key;
            # codeparrot/github-code rows use 'code'
            code = sample.get('content') or sample.get('code', '')
            if len(code) > context_size:
                # Slide a window with 50% overlap; the target is the input
                # shifted forward by one character
                for i in range(0, len(code) - context_size, context_size // 2):
                    input_chunk = code[i:i + context_size]
                    target_chunk = code[i + 1:i + context_size + 1]
                    training_pairs.append({
                        'input': input_chunk,
                        'target': target_chunk,
                        'language': sample.get('language', 'unknown')
                    })
        print(f"βœ… Created {len(training_pairs)} training pairs")
        return training_pairs
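
    # The pairs above are character-level with a one-character shift. For
    # token-level next-token prediction, encode first and shift the id
    # sequence instead (a sketch, using the tokenizer built above):
    #
    #     ids = tokenizer.encode(code).ids
    #     input_ids, target_ids = ids[:-1], ids[1:]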

if __name__ == "__main__":
    preprocessor = PyPilotDataPreprocessor()

    # Example usage: load a Python subset, train a tokenizer, build pairs
    dataset = preprocessor.load_github_dataset('python')
    # Check against None explicitly: load_github_dataset returns None on
    # failure, while an empty-but-valid dataset would also be falsy
    if dataset is not None:
        tokenizer = preprocessor.build_tokenizer(dataset)
        training_data = preprocessor.create_training_pairs(dataset)

        # Save processed data
        with open('processed_training_data.pkl', 'wb') as f:
            pickle.dump(training_data, f)
        print("πŸ’Ύ Training data saved!")