|
"""Basic usage example for NexForge Tokenizer Builder.""" |
|
from pathlib import Path |
|
import os |
|
import tempfile |
|
|
|
from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage |
|
|
|
def create_sample_code(base_dir="sample_code"):
    """Create a directory of small sample Python files for tokenizer training.

    Any existing directory at *base_dir* is removed first so the sample
    set is always rebuilt from a clean slate.

    Args:
        base_dir: Directory (str or Path) to create the files in.
            Defaults to ``"sample_code"`` for backward compatibility.

    Returns:
        Path: The directory containing the generated ``.py`` files.
    """
    import shutil

    sample_dir = Path(base_dir)

    # Remove stale files from a previous run so they never pollute the corpus.
    if sample_dir.exists():
        shutil.rmtree(sample_dir)

    # parents=True lets callers pass a nested path that does not exist yet.
    sample_dir.mkdir(parents=True, exist_ok=True)

    (sample_dir / "hello.py").write_text("""
def greet(name):
    print(f"Hello, {name}!")

if __name__ == "__main__":
    greet("World")
""")

    (sample_dir / "math.py").write_text("""
def add(a, b):
    return a + b

def multiply(a, b):
    return a * b

if __name__ == "__main__":
    print(f"2 + 3 = {add(2, 3)}")
    print(f"2 * 3 = {multiply(2, 3)}")
""")

    return sample_dir
|
|
|
def main():
    """Run the end-to-end example: build sample code, then train a tokenizer.

    Prints detected system resources and memory usage before and after the
    build, then reports where the tokenizer file was written.
    """
    print("NexForge Tokenizer Builder Basic Example")
    print("=======================================\n")

    # Generate a small corpus of Python files to train on.
    sample_dir = create_sample_code()
    print(f"Created sample code in: {sample_dir}")

    # Report the hardware the builder auto-detects and will use.
    resources = SystemResources()
    # NOTE: removed extraneous f-prefix (no placeholders in this string).
    print("\nDetected System Resources:")
    print(f"CPU Cores: {resources.cpu_cores}")
    print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
    if resources.has_cuda:
        print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
    else:
        print("No CUDA GPU detected")

    output_path = "sample_tokenizer.json"

    # Baseline memory before training, for comparison with the final reading.
    print("\nInitial memory usage:")
    log_memory_usage()

    print("\nBuilding tokenizer...")
    # min_frequency=1 keeps every token: the sample corpus is tiny, so
    # higher thresholds would discard most of the vocabulary.
    success = build_tokenizer(
        input_dir=str(sample_dir),
        output_path=output_path,
        vocab_size=1000,
        min_frequency=1,
        resources=resources,
    )

    print("\nFinal memory usage:")
    log_memory_usage()

    if success:
        print(f"\nTokenizer successfully created at: {output_path}")
        print("You can now use this tokenizer with any library that supports the HuggingFace tokenizers format")
    else:
        print("\nFailed to create tokenizer")

    print("\nExample completed!")
|
|
|
# Allow running this example directly: ``python basic_usage.py``.
if __name__ == "__main__":
    main()