"""
Advanced usage example for NexForge Tokenizer Builder.
This example demonstrates:
- Custom special tokens
- Batch processing with progress tracking
- Vocabulary inspection and analysis
- Error handling and recovery
- Performance optimization
"""
import time
from pathlib import Path
from tqdm import tqdm
# Import the tokenizer components
from nexforgetokenizer import (
    build_tokenizer,
    SystemResources,
    log_memory_usage,
    TokenizerError
)
def create_large_sample_dataset(num_files: int = 50, base_dir: str = "sample_data") -> Path:
    """Create a larger sample dataset with different file types."""
    base_path = Path(base_dir)

    # Clean up if exists
    if base_path.exists():
        import shutil
        shutil.rmtree(base_path)

    # Create directories
    base_path.mkdir(exist_ok=True)

    # Create Python files
    for i in range(num_files // 2):
        module_content = f"""
# Sample Python module {i}
def process_data(data):
    '''Process sample data.'''
    result = []
    for item in data:
        if item % 2 == 0:
            result.append(item * 2)
    return result
"""
        (base_path / f"module_{i}.py").write_text(module_content)
    # Create text files
    for i in range(num_files // 2):
        doc_content = f"""
This is sample text document {i}.
It contains multiple lines of text with various tokens.
The quick brown fox jumps over the lazy dog.
Special characters: !@#$%^&*()_+-=[]{{}}|;':\",./<>?
"""
        (base_path / f"document_{i}.txt").write_text(doc_content)

    print(f"Created {num_files} sample files in {base_path}")
    return base_path
class DataProcessor:
    """Example data processor class for demonstration."""

    def __init__(self, config: dict):
        self.config = config

    def run(self):
        """Run the processor with the current config."""
        print(f"Processing with config: {self.config}")
class TokenizerAnalyzer:
    """Helper class for analyzing tokenizer performance and vocabulary."""

    def __init__(self, tokenizer_path: str):
        self.tokenizer_path = tokenizer_path
        self.tokenizer = None
        self.vocab = None

    def load(self):
        """Load the tokenizer."""
        from tokenizers import Tokenizer
        self.tokenizer = Tokenizer.from_file(self.tokenizer_path)
        self.vocab = {
            idx: self.tokenizer.id_to_token(idx)
            for idx in range(self.tokenizer.get_vocab_size())
        }

    def analyze_vocab(self, top_n: int = 20):
        """Analyze and print vocabulary statistics."""
        if not self.tokenizer:
            self.load()

        vocab_size = len(self.vocab)
        special_tokens = [
            token for token in self.vocab.values()
            if token.startswith("[") and token.endswith("]")
        ]

        print("\n=== Vocabulary Analysis ===")
        print(f"Total vocabulary size: {vocab_size}")
        print(f"Special tokens ({len(special_tokens)}): {', '.join(special_tokens[:10])}" +
              ("..." if len(special_tokens) > 10 else ""))

        # Show sample of vocabulary
        print("\nSample vocabulary items:")
        for idx in range(min(top_n, vocab_size)):
            print(f"  {idx}: {self.vocab.get(idx, 'N/A')}")
        if vocab_size > top_n:
            print(f"  ... and {vocab_size - top_n} more")
def main():
    """Run the advanced example."""
    print("NexForge Tokenizer Builder - Advanced Example")
    print("=============================================\n")

    # 1. Setup
    output_dir = Path("advanced_output")
    output_dir.mkdir(exist_ok=True)
    tokenizer_path = output_dir / "advanced_tokenizer.json"

    # 2. Check system resources
    resources = SystemResources()
    print("\n=== System Resources ===")
    print(f"CPU Cores: {resources.cpu_cores}")
    print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
    if resources.has_cuda:
        print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
    else:
        print("No CUDA GPU detected")

    # 3. Create sample dataset
    print("\n=== Creating Sample Dataset ===")
    dataset_path = create_large_sample_dataset(num_files=50)

    # 4. Custom special tokens
    special_tokens = [
        "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]",
        "[PYTHON]", "[TEXT]", "[CODE]"
    ]
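
    # The domain tokens ([PYTHON], [TEXT], [CODE]) are passed to the builder as
    # special tokens; nothing in this script tags text with them. A typical
    # (illustrative, library-agnostic) pattern is to prepend a tag yourself
    # before encoding, e.g. "[PYTHON] " + source_code.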
    # 5. Build the tokenizer with advanced options
    print("\n=== Building Tokenizer ===")
    print(f"Input directory: {dataset_path}")
    print(f"Output path: {tokenizer_path}")

    start_time = time.time()
    try:
        success = build_tokenizer(
            input_dir=str(dataset_path),
            output_path=str(tokenizer_path),
            vocab_size=5000,       # Larger vocabulary for better coverage
            min_frequency=2,       # Only include tokens that appear at least twice
            special_tokens=special_tokens,
            resources=resources,
            max_files=50,          # Process all files
            chunk_size=100000,     # Process in 100KB chunks
            n_threads=max(1, resources.cpu_cores - 1)  # Use all but one CPU core
        )

        if success:
            duration = time.time() - start_time
            print(f"\nTokenizer created successfully in {duration:.2f} seconds")
            print(f"Tokenizer saved to: {tokenizer_path}")

            # 6. Analyze the created tokenizer
            print("\n=== Tokenizer Analysis ===")
            analyzer = TokenizerAnalyzer(str(tokenizer_path))
            analyzer.load()
            analyzer.analyze_vocab()

            # 7. Show example encoding/decoding
            print("\n=== Example Encoding/Decoding ===")
            sample_text = "def hello_world():\n    print('Hello, world!')  # Sample Python code"
            encoded = analyzer.tokenizer.encode(sample_text)
            decoded = analyzer.tokenizer.decode(encoded.ids)

            print(f"Original: {sample_text}")
            print(f"Encoded: {encoded.ids}")
            print(f"Tokens: {encoded.tokens}")
            print(f"Decoded: {decoded}")
        else:
            print("\nFailed to create tokenizer")

    except TokenizerError as e:
        print(f"\nError creating tokenizer: {e}")
    except Exception as e:
        print(f"\nUnexpected error: {e}")
    finally:
        # 9. Cleanup (optional)
        # import shutil
        # shutil.rmtree(dataset_path, ignore_errors=True)
        pass

    print("\nExample completed!")


if __name__ == "__main__":
    main()