"""
PyPilot Data Preprocessor - Handles massive code datasets
"""
import pickle
import multiprocessing as mp
from pathlib import Path
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

class PyPilotDataPreprocessor:
    def __init__(self):
        self.supported_languages = ['python', 'javascript', 'java', 'cpp', 'go', 'rust']
        self.processed_data = {}
        
    def load_github_dataset(self, language='python', split='train'):
        """Load a large code dataset from the Hugging Face Hub"""
        print(f"πŸ“₯ Loading {language} code dataset...")
        # codeparrot/github-code lists capitalized language names ("Python",
        # "JavaScript", ...); note the full 'train' split is hundreds of GB.
        hub_names = {'python': 'Python', 'javascript': 'JavaScript', 'java': 'Java',
                     'cpp': 'C++', 'go': 'GO', 'rust': 'Rust'}
        try:
            dataset = load_dataset("codeparrot/github-code", split=split,
                                   languages=[hub_names.get(language, language)])
            print(f"βœ… Loaded {len(dataset)} {language} files")
            return dataset
        except Exception as e:
            print(f"❌ Error loading dataset: {e}")
            return None
    
    def build_tokenizer(self, dataset, vocab_size=50000):
        """Build custom tokenizer for code"""
        print("πŸ”€ Building custom code tokenizer...")
        
        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        
        trainer = BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "[EOL]"]
        )
        
        # Train tokenizer on code samples
        def batch_iterator(batch_size=1000):
            for i in range(0, len(dataset), batch_size):
                yield dataset[i:i+batch_size]['code']
        
        tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
        tokenizer.save("./pypilot_tokenizer.json")
        print("βœ… Tokenizer built and saved!")
        return tokenizer
    
    def _process_file(self, file_path):
        """Read one source file; defined as a method (not a closure) so that
        multiprocessing can pickle it."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            return {
                'file_path': str(file_path),
                'content': content,
                'length': len(content),
                'language': self.detect_language(file_path)
            }
        except Exception as e:
            return {'error': str(e), 'file_path': str(file_path)}

    def parallel_process_files(self, file_paths, num_processes=8):
        """Process files in parallel for maximum speed"""
        print(f"⚑ Processing {len(file_paths)} files with {num_processes} processes...")

        with mp.Pool(num_processes) as pool:
            results = pool.map(self._process_file, file_paths)

        successful = [r for r in results if 'error' not in r]
        print(f"βœ… Processed {len(successful)} files successfully")
        return successful
    
    def detect_language(self, file_path):
        """Detect programming language from file extension"""
        extensions = {
            '.py': 'python',
            '.js': 'javascript', 
            '.java': 'java',
            '.cpp': 'cpp',
            '.cc': 'cpp',
            '.go': 'go',
            '.rs': 'rust',
            '.ts': 'typescript'
        }
        return extensions.get(Path(file_path).suffix, 'unknown')
    
    def create_training_pairs(self, code_samples, context_size=512):
        """Create (input, target) pairs for training"""
        print("πŸ”„ Creating training pairs...")
        training_pairs = []
        
        for sample in code_samples:
            # Samples carry 'code' (HF dataset rows) or 'content' (local files)
            code = sample.get('content') or sample.get('code', '')
            if len(code) > context_size:
                # Split code into chunks and create prediction tasks
                for i in range(0, len(code) - context_size, context_size // 2):
                    input_chunk = code[i:i + context_size]
                    target_chunk = code[i + 1:i + context_size + 1]
                    training_pairs.append({
                        'input': input_chunk,
                        'target': target_chunk,
                        'language': sample.get('language', 'unknown')
                    })
        
        print(f"βœ… Created {len(training_pairs)} training pairs")
        return training_pairs

if __name__ == "__main__":
    preprocessor = PyPilotDataPreprocessor()
    
    # Example usage (for a quick test, consider dataset.select(range(1000)))
    dataset = preprocessor.load_github_dataset('python')
    if dataset:
        tokenizer = preprocessor.build_tokenizer(dataset)
        training_data = preprocessor.create_training_pairs(dataset)
        
        # Save processed data
        with open('processed_training_data.pkl', 'wb') as f:
            pickle.dump(training_data, f)
        print("πŸ’Ύ Training data saved!")