"""Basic usage example for NexForge Tokenizer Builder."""
from pathlib import Path
import os
import tempfile
from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage
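
# To try this example end to end (assuming the nexforgetokenizer package and its
# dependencies are installed), run it from the repository root:
#
#   python examples/basic_usage.py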


def create_sample_code():
    """Create a sample directory with Python files for testing."""
    # Create a sample directory with Python files
    sample_dir = Path("sample_code")

    # Clean up if it exists
    if sample_dir.exists():
        import shutil
        shutil.rmtree(sample_dir)

    # Create directory
    sample_dir.mkdir(exist_ok=True)

    # Create some sample Python files
    (sample_dir / "hello.py").write_text("""
def greet(name):
    print(f"Hello, {name}!")


if __name__ == "__main__":
    greet("World")
""")

    (sample_dir / "math.py").write_text("""
def add(a, b):
    return a + b


def multiply(a, b):
    return a * b


if __name__ == "__main__":
    print(f"2 + 3 = {add(2, 3)}")
    print(f"2 * 3 = {multiply(2, 3)}")
""")

    return sample_dir


def main():
    """Run the example."""
    print("NexForge Tokenizer Builder Basic Example")
    print("========================================\n")

    # Create sample code
    sample_dir = create_sample_code()
    print(f"Created sample code in: {sample_dir}")

    # Check system resources
    resources = SystemResources()
    print("\nDetected System Resources:")
    print(f"CPU Cores: {resources.cpu_cores}")
    print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
    if resources.has_cuda:
        print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
    else:
        print("No CUDA GPU detected")

    # Create output path for tokenizer
    output_path = "sample_tokenizer.json"

    # Check initial memory usage
    print("\nInitial memory usage:")
    log_memory_usage()

    # Build the tokenizer
    print("\nBuilding tokenizer...")
    success = build_tokenizer(
        input_dir=str(sample_dir),
        output_path=output_path,
        vocab_size=1000,   # Small vocabulary for this example
        min_frequency=1,   # Include all tokens
        resources=resources
    )

    # Check final memory usage
    print("\nFinal memory usage:")
    log_memory_usage()

    if success:
        print(f"\nTokenizer successfully created at: {output_path}")
        print("You can now use this tokenizer with any library that supports "
              "the HuggingFace tokenizers format.")
    else:
        print("\nFailed to create tokenizer")
print("\nExample completed!")


if __name__ == "__main__":
    main()