File size: 2,565 Bytes
4265aea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""Basic usage example for NexForge Tokenizer Builder."""
from pathlib import Path
import os
import tempfile

from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage

def create_sample_code():
    """Build a throwaway ``sample_code`` directory with two tiny Python files.

    Any previous copy of the directory is removed first so the example
    always starts from a clean slate.

    Returns:
        Path: path to the freshly created ``sample_code`` directory.
    """
    import shutil

    sample_dir = Path("sample_code")

    # Start fresh: drop leftovers from a previous run, if any.
    if sample_dir.exists():
        shutil.rmtree(sample_dir)

    sample_dir.mkdir(exist_ok=True)

    # Filename -> source text for each sample file to write.
    samples = {
        "hello.py": """
def greet(name):
    print(f"Hello, {name}!")

if __name__ == "__main__":
    greet("World")
""",
        "math.py": """
def add(a, b):
    return a + b

def multiply(a, b):
    return a * b

if __name__ == "__main__":
    print(f"2 + 3 = {add(2, 3)}")
    print(f"2 * 3 = {multiply(2, 3)}")
""",
    }
    for filename, source in samples.items():
        (sample_dir / filename).write_text(source)

    return sample_dir

def main():
    """Run the example end to end.

    Creates a small sample code tree, reports detected system resources,
    builds a tokenizer from the samples, and logs memory usage before and
    after the build. Prints progress to stdout; returns None.
    """
    print("NexForge Tokenizer Builder Basic Example")
    print("=======================================\n")
    
    # Create sample code to train on.
    sample_dir = create_sample_code()
    print(f"Created sample code in: {sample_dir}")
    
    # Detect and report the available hardware.
    resources = SystemResources()
    # NOTE: plain strings here — the originals were f-strings with no
    # placeholders (lint F541); the printed text is unchanged.
    print("\nDetected System Resources:")
    print(f"CPU Cores: {resources.cpu_cores}")
    print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
    if resources.has_cuda:
        print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
    else:
        print("No CUDA GPU detected")
    
    # Where the trained tokenizer will be written.
    output_path = "sample_tokenizer.json"
    
    # Baseline memory reading before the build.
    print("\nInitial memory usage:")
    log_memory_usage()
    
    # Build the tokenizer; deliberately small settings keep the example fast.
    print("\nBuilding tokenizer...")
    success = build_tokenizer(
        input_dir=str(sample_dir),
        output_path=output_path,
        vocab_size=1000,  # Small vocabulary for this example
        min_frequency=1,  # Include all tokens
        resources=resources
    )
    
    # Memory reading after the build, for comparison with the baseline.
    print("\nFinal memory usage:")
    log_memory_usage()
    
    if success:
        print(f"\nTokenizer successfully created at: {output_path}")
        print("You can now use this tokenizer with any library that supports the HuggingFace tokenizers format")
    else:
        print("\nFailed to create tokenizer")
    
    print("\nExample completed!")

if __name__ == "__main__":
    main()