"""
PyPilot Data Preprocessor - Handles massive code datasets
"""
import json
import pickle
import multiprocessing as mp
from pathlib import Path

from datasets import load_dataset
import tokenizers
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace


class PyPilotDataPreprocessor:
    def __init__(self):
        self.supported_languages = ['python', 'javascript', 'java', 'cpp', 'go', 'rust']
        self.processed_data = {}
    def load_github_dataset(self, language='python', split='train'):
        """Load massive code dataset from Hugging Face"""
        print(f"📥 Loading {language} code dataset...")
        try:
            dataset = load_dataset("codeparrot/github-code", split=split, languages=[language])
            print(f"✅ Loaded {len(dataset)} {language} files")
            return dataset
        except Exception as e:
            print(f"❌ Error loading dataset: {e}")
            return None
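
    # Note (sketch, not used by the pipeline above): for corpora too large to download
    # in full, the `datasets` library also supports streaming, e.g.
    #
    #   ds = load_dataset("codeparrot/github-code", split="train",
    #                     languages=["python"], streaming=True)
    #   first_file = next(iter(ds))
    #
    # Streaming datasets are iterable-only, so the len(dataset) count above would not apply.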
    def build_tokenizer(self, dataset, vocab_size=50000):
        """Build custom tokenizer for code"""
        print("🤖 Building custom code tokenizer...")

        tokenizer = Tokenizer(BPE())
        tokenizer.pre_tokenizer = Whitespace()

        trainer = BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "[EOL]"]
        )

        def batch_iterator(batch_size=1000):
            # Slicing a Hugging Face dataset returns a dict of columns,
            # so each batch yielded here is a list of code strings.
            for i in range(0, len(dataset), batch_size):
                yield dataset[i:i + batch_size]['code']

        tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
        tokenizer.save("./pypilot_tokenizer.json")
        print("✅ Tokenizer built and saved!")
        return tokenizer
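
    # Usage sketch (illustrative, not part of the pipeline): the saved file can be
    # reloaded later with the standard `tokenizers` API, e.g.
    #
    #   from tokenizers import Tokenizer
    #   tok = Tokenizer.from_file("./pypilot_tokenizer.json")
    #   ids = tok.encode("def add(a, b):\n    return a + b").ids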
    def parallel_process_files(self, file_paths, num_processes=8):
        """Process files in parallel for maximum speed"""
        print(f"⚡ Processing {len(file_paths)} files with {num_processes} processes...")

        with mp.Pool(num_processes) as pool:
            results = pool.map(self._process_file, file_paths)

        successful = [r for r in results if 'error' not in r]
        print(f"✅ Processed {len(successful)} files successfully")
        return successful

    def _process_file(self, file_path):
        # Defined as a method (not a nested function) so multiprocessing can pickle it
        # when dispatching work to the pool.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            return {
                'file_path': str(file_path),
                'content': content,
                'length': len(content),
                'language': self.detect_language(file_path)
            }
        except Exception as e:
            return {'error': str(e), 'file_path': str(file_path)}
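
    # Usage sketch (hypothetical local path, for illustration only):
    #
    #   paths = list(Path("./local_repos").rglob("*.py"))
    #   files = PyPilotDataPreprocessor().parallel_process_files(paths, num_processes=4)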
    def detect_language(self, file_path):
        """Detect programming language from file extension"""
        extensions = {
            '.py': 'python',
            '.js': 'javascript',
            '.java': 'java',
            '.cpp': 'cpp',
            '.cc': 'cpp',
            '.go': 'go',
            '.rs': 'rust',
            '.ts': 'typescript'
        }
        return extensions.get(Path(file_path).suffix, 'unknown')
    def create_training_pairs(self, code_samples, context_size=512):
        """Create (input, target) pairs for training"""
        print("🔄 Creating training pairs...")
        training_pairs = []

        for sample in code_samples:
            # Locally processed files store the source under 'content';
            # Hugging Face samples store it under 'code'.
            code = sample.get('content') or sample.get('code', '')
            if len(code) > context_size:
                # Slide a half-overlapping window; each target is the input shifted by one character.
                for i in range(0, len(code) - context_size, context_size // 2):
                    input_chunk = code[i:i + context_size]
                    target_chunk = code[i + 1:i + context_size + 1]
                    training_pairs.append({
                        'input': input_chunk,
                        'target': target_chunk,
                        'language': sample.get('language', 'unknown')
                    })

        print(f"✅ Created {len(training_pairs)} training pairs")
        return training_pairs
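
    # Worked example (illustrative): with context_size=4 and code "return x", the first
    # pair is input="retu", target="etur"; each target is the input shifted one character
    # to the right.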


if __name__ == "__main__":
    preprocessor = PyPilotDataPreprocessor()

    dataset = preprocessor.load_github_dataset('python')
    if dataset is not None:
        tokenizer = preprocessor.build_tokenizer(dataset)
        training_data = preprocessor.create_training_pairs(dataset)

        with open('processed_training_data.pkl', 'wb') as f:
            pickle.dump(training_data, f)
        print("💾 Training data saved!")
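
        # Downstream sketch (belongs in a separate training script, not executed here):
        # reload the artifacts produced above, e.g.
        #
        #   with open('processed_training_data.pkl', 'rb') as f:
        #       pairs = pickle.load(f)
        #   tok = Tokenizer.from_file("./pypilot_tokenizer.json")
        #   encoded = [tok.encode(p['input']).ids for p in pairs[:32]]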