"""Basic usage example for NexForge Tokenizer Builder.""" from pathlib import Path import os import tempfile from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage def create_sample_code(): """Create a sample directory with Python files for testing.""" # Create a sample directory with Python files sample_dir = Path("sample_code") # Clean up if it exists if sample_dir.exists(): import shutil shutil.rmtree(sample_dir) # Create directory sample_dir.mkdir(exist_ok=True) # Create some sample Python files (sample_dir / "hello.py").write_text(""" def greet(name): print(f"Hello, {name}!") if __name__ == "__main__": greet("World") """) (sample_dir / "math.py").write_text(""" def add(a, b): return a + b def multiply(a, b): return a * b if __name__ == "__main__": print(f"2 + 3 = {add(2, 3)}") print(f"2 * 3 = {multiply(2, 3)}") """) return sample_dir def main(): """Run the example.""" print("NexForge Tokenizer Builder Basic Example") print("=======================================\n") # Create sample code sample_dir = create_sample_code() print(f"Created sample code in: {sample_dir}") # Check system resources resources = SystemResources() print(f"\nDetected System Resources:") print(f"CPU Cores: {resources.cpu_cores}") print(f"Available RAM: {resources.available_ram_gb:.2f} GB") if resources.has_cuda: print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB") else: print("No CUDA GPU detected") # Create output path for tokenizer output_path = "sample_tokenizer.json" # Check initial memory usage print("\nInitial memory usage:") log_memory_usage() # Build the tokenizer print("\nBuilding tokenizer...") success = build_tokenizer( input_dir=str(sample_dir), output_path=output_path, vocab_size=1000, # Small vocabulary for this example min_frequency=1, # Include all tokens resources=resources ) # Check final memory usage print("\nFinal memory usage:") log_memory_usage() if success: print(f"\nTokenizer successfully created at: {output_path}") print(f"You can now use this tokenizer with any library that supports the HuggingFace tokenizers format") else: print("\nFailed to create tokenizer") print("\nExample completed!") if __name__ == "__main__": main()