"""
Advanced usage example for NexForge Tokenizer Builder.

This example demonstrates:
- Custom special tokens
- Batch processing with progress tracking (see the count_corpus_lines sketch below)
- Vocabulary inspection and analysis
- Error handling and recovery
- Performance optimization
"""
import os
import json
import time
from pathlib import Path
from typing import Dict, List, Optional

from tqdm import tqdm

from nexforgetokenizer import (
    build_tokenizer,
    SystemResources,
    log_memory_usage,
    TokenizerError
)


def create_large_sample_dataset(num_files: int = 50, base_dir: str = "sample_data") -> Path:
    """Create a larger sample dataset with different file types."""
    base_path = Path(base_dir)
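
    # Remove any leftover data from a previous run so the dataset is rebuilt from scratch.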
    if base_path.exists():
        import shutil
        shutil.rmtree(base_path)

    base_path.mkdir(exist_ok=True)
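
    # Half of the files are small Python source modules.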
    for i in range(num_files // 2):
        module_content = f"""
# Sample Python module {i}

def process_data(data):
    '''Process sample data.'''
    result = []
    for item in data:
        if item % 2 == 0:
            result.append(item * 2)
    return result
"""
        (base_path / f"module_{i}.py").write_text(module_content)
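
    # The other half are plain-text documents with varied punctuation.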
    for i in range(num_files // 2):
        doc_content = f"""
This is sample text document {i}.
It contains multiple lines of text with various tokens.
The quick brown fox jumps over the lazy dog.
Special characters: !@#$%^&*()_+-=[]{{}}|;':\",./<>?
"""
        (base_path / f"document_{i}.txt").write_text(doc_content)

    print(f"Created {num_files} sample files in {base_path}")
    return base_path
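
# The module docstring promises "batch processing with progress tracking", and tqdm is
# imported above but not used elsewhere in this example. The helper below is a minimal
# sketch of what that tracking could look like; it is not part of the NexForge API, and
# its name and behaviour are illustrative assumptions only.
def count_corpus_lines(input_dir: Path) -> int:
    """Count lines across every file under input_dir, with a tqdm progress bar."""
    files = sorted(p for p in Path(input_dir).iterdir() if p.is_file())
    total = 0
    for path in tqdm(files, desc="Scanning corpus", unit="file"):
        total += len(path.read_text(encoding="utf-8").splitlines())
    return total
# Optional usage: count_corpus_lines(Path("sample_data")) after the dataset has been created.
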
class DataProcessor:
    """Example data processor class for demonstration."""

    def __init__(self, config: dict):
        self.config = config

    def run(self):
        """Run the processor with the current config."""
        print(f"Processing with config: {self.config}")


class TokenizerAnalyzer:
    """Helper class for analyzing tokenizer performance and vocabulary."""

    def __init__(self, tokenizer_path: str):
        self.tokenizer_path = tokenizer_path
        self.tokenizer = None
        self.vocab = None

    def load(self):
        """Load the tokenizer."""
        from tokenizers import Tokenizer
        self.tokenizer = Tokenizer.from_file(self.tokenizer_path)
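        # Build an id -> token mapping so the vocabulary can be inspected directly.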
        self.vocab = {
            idx: self.tokenizer.id_to_token(idx)
            for idx in range(self.tokenizer.get_vocab_size())
        }

    def analyze_vocab(self, top_n: int = 20):
        """Analyze and print vocabulary statistics."""
        if not self.tokenizer:
            self.load()

        vocab_size = len(self.vocab)
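        # Treat tokens wrapped in square brackets (e.g. [PAD]) as special tokens.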
        special_tokens = [
            token for token in self.vocab.values()
            if token.startswith("[") and token.endswith("]")
        ]

        print("\n=== Vocabulary Analysis ===")
        print(f"Total vocabulary size: {vocab_size}")
        print(f"Special tokens ({len(special_tokens)}): {', '.join(special_tokens[:10])}" +
              ("..." if len(special_tokens) > 10 else ""))

        print("\nSample vocabulary items:")
        for idx in range(min(top_n, vocab_size)):
            print(f" {idx}: {self.vocab.get(idx, 'N/A')}")

        if vocab_size > top_n:
            print(f" ... and {vocab_size - top_n} more")


def main():
    """Run the advanced example."""
    print("NexForge Tokenizer Builder - Advanced Example")
    print("=============================================\n")
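
    # All artifacts produced by this example are written to a dedicated directory.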
    output_dir = Path("advanced_output")
    output_dir.mkdir(exist_ok=True)

    tokenizer_path = output_dir / "advanced_tokenizer.json"
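
    # Inspect the available hardware; the same object is passed to build_tokenizer below.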
    resources = SystemResources()
    print("\n=== System Resources ===")
    print(f"CPU Cores: {resources.cpu_cores}")
    print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
    if resources.has_cuda:
        print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
    else:
        print("No CUDA GPU detected")

    print("\n=== Creating Sample Dataset ===")
    dataset_path = create_large_sample_dataset(num_files=50)
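
    # Standard BERT-style special tokens plus custom markers for Python code and plain text.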
    special_tokens = [
        "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]",
        "[PYTHON]", "[TEXT]", "[CODE]"
    ]

    print("\n=== Building Tokenizer ===")
    print(f"Input directory: {dataset_path}")
    print(f"Output path: {tokenizer_path}")

    start_time = time.time()
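
    # Build the tokenizer, leaving one CPU core free for the rest of the system.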
    try:
        success = build_tokenizer(
            input_dir=str(dataset_path),
            output_path=str(tokenizer_path),
            vocab_size=5000,
            min_frequency=2,
            special_tokens=special_tokens,
            resources=resources,
            max_files=50,
            chunk_size=100000,
            n_threads=max(1, resources.cpu_cores - 1)
        )

        if success:
            duration = time.time() - start_time
            print(f"\nTokenizer created successfully in {duration:.2f} seconds")
            print(f"Tokenizer saved to: {tokenizer_path}")
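
            # Reload the trained tokenizer from disk and summarize its vocabulary.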
            print("\n=== Tokenizer Analysis ===")
            analyzer = TokenizerAnalyzer(str(tokenizer_path))
            analyzer.load()
            analyzer.analyze_vocab()

            print("\n=== Example Encoding/Decoding ===")
            sample_text = "def hello_world():\n print('Hello, world!') # Sample Python code"
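
            # encode() returns an Encoding object; its .ids and .tokens are printed below.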
            encoded = analyzer.tokenizer.encode(sample_text)
            decoded = analyzer.tokenizer.decode(encoded.ids)

            print(f"Original: {sample_text}")
            print(f"Encoded: {encoded.ids}")
            print(f"Tokens: {encoded.tokens}")
            print(f"Decoded: {decoded}")

        else:
            print("\nFailed to create tokenizer")

    except TokenizerError as e:
        print(f"\nError creating tokenizer: {e}")
    except Exception as e:
        print(f"\nUnexpected error: {e}")
    finally:
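        # Optional cleanup (for example, removing the sample_data directory) could go here.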
        pass

    print("\nExample completed!")


if __name__ == "__main__":
    main()