# EZ-Tokenizer: tests/test_adaptive_tokenizer.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Simple test script for the NexForge Adaptive Tokenizer.
This script demonstrates the basic usage of the adaptive tokenizer
by creating a small sample Python file and building a tokenizer from it.
"""
import os
import sys
import logging
from pathlib import Path
import tempfile
from tokenizers import Tokenizer
# Add the parent directory to the path so we can import the package
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage
# Send log records both to the console and to a persistent log file.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('tokenizer_test.log'),
    ],
)
# Sample Python code used to exercise the tokenizer.  Kept as valid,
# conventionally indented Python so tokenization statistics (token count,
# chars-per-token) reflect realistic source code rather than broken text.
SAMPLE_CODE = """
# Comprehensive Python code test for tokenizer

def factorial(n):
    \"\"\"Calculate factorial of n.\"\"\"
    if n <= 1:
        return 1
    return n * factorial(n - 1)


class TestClass:
    def __init__(self, value):
        self.value = value

    def process(self):
        \"\"\"Process the value and return result.\"\"\"
        return self.value * 2


def main():
    # Test various Python constructs
    numbers = [1, 2, 3, 4, 5]
    squares = [x**2 for x in numbers]

    # Test string formatting
    name = "NexForge"
    version = 1.0

    # Test control flow
    if version > 0.5:
        print(f"{name} v{version} is stable!")
    else:
        print(f"{name} v{version} is in development")

    # Test function calls
    result = factorial(5)
    print(f"5! = {result}")

    # Test class usage
    test = TestClass(21)
    print(f"Processed value: {test.process()}")

    return 0


if __name__ == "__main__":
    exit(main())
"""
def create_test_file(directory):
    """Write SAMPLE_CODE to 'test_code.py' inside *directory* and return its path."""
    os.makedirs(directory, exist_ok=True)
    target = os.path.join(directory, 'test_code.py')
    with open(target, 'w', encoding='utf-8') as handle:
        handle.write(SAMPLE_CODE)
    return target
def test_tokenizer():
    """Build a small tokenizer from the bundled sample corpus and verify it loads.

    Returns True on success, False when the sample data is missing or the
    build fails.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        # The sample corpus ships inside the package source tree.
        repo_root = os.path.dirname(os.path.dirname(__file__))
        sample_data_path = os.path.join(
            repo_root, 'src', 'nexforgetokenizer', 'data', 'python_code_sample.txt'
        )
        print(f"Using sample data file: {sample_data_path}")

        if not os.path.exists(sample_data_path):
            print(f"ERROR: Sample data file not found at {sample_data_path}")
            return False

        print(f"Sample file size: {os.path.getsize(sample_data_path)} bytes")

        # build_tokenizer consumes a directory, so hand it the sample's parent.
        data_dir = os.path.dirname(sample_data_path)
        print(f"Data directory: {data_dir}")

        # The tokenizer artifact goes into the throwaway temp directory.
        output_path = os.path.join(temp_dir, 'test_tokenizer.json')

        print("\nInitial memory usage:")
        log_memory_usage()

        # Probe the host so build_tokenizer can adapt to available hardware.
        resources = SystemResources()
        print("\nDetected system resources:")
        print(f"CPU Cores: {resources.cpu_cores}")
        print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
        if resources.has_cuda:
            print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
        else:
            print("No CUDA GPU detected")

        print("\nBuilding tokenizer...")
        built = build_tokenizer(
            input_dir=data_dir,
            output_path=output_path,
            vocab_size=1000,   # small vocabulary keeps the test fast
            min_frequency=1,   # keep every token for this smoke test
            resources=resources,
        )
        if not built:
            print("Failed to create tokenizer")
            return False

        print(f"\nTokenizer successfully created at: {output_path}")

        # Round-trip check: reload the artifact and tokenize the sample snippet.
        tokenizer = Tokenizer.from_file(output_path)
        print(f"Vocabulary size: {len(tokenizer.get_vocab())}")

        encoded = tokenizer.encode(SAMPLE_CODE)
        print("\nTokenized sample code:")
        print(f"Number of tokens: {len(encoded.ids)}")
        print(f"Average chars per token: {len(SAMPLE_CODE) / len(encoded.ids):.2f}")

        print("\nFinal memory usage:")
        log_memory_usage()
        return True
def main():
    """Run the tokenizer test and return a process exit code (0 = pass)."""
    print("NexForge Adaptive Tokenizer Test")
    print("==============================\n")
    if test_tokenizer():
        print("\nTest completed successfully!")
        return 0
    print("\nTest failed!")
    return 1
# Allow running this test directly as a script; propagate the exit code.
if __name__ == "__main__":
    sys.exit(main())