# EZ-Tokenizer: tests/test_adaptive_tokenizer.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Simple test script for the NexForge Adaptive Tokenizer.
This script demonstrates the basic usage of the adaptive tokenizer
by creating a small sample Python file and building a tokenizer from it.
"""
import os
import sys
import logging
from pathlib import Path
import tempfile
from tokenizers import Tokenizer
# Add the parent directory to the path so we can import the package
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage
# Send log records both to the console and to a persistent log file.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('tokenizer_test.log'),
    ],
)
# Sample Python code used to exercise the tokenizer.  Kept as valid,
# conventionally indented Python so tokenization statistics (token count,
# chars-per-token) reflect realistic source code rather than broken text.
SAMPLE_CODE = """
# Comprehensive Python code test for tokenizer

def factorial(n):
    \"\"\"Calculate factorial of n.\"\"\"
    if n <= 1:
        return 1
    return n * factorial(n - 1)


class TestClass:
    def __init__(self, value):
        self.value = value

    def process(self):
        \"\"\"Process the value and return result.\"\"\"
        return self.value * 2


def main():
    # Test various Python constructs
    numbers = [1, 2, 3, 4, 5]
    squares = [x**2 for x in numbers]

    # Test string formatting
    name = "NexForge"
    version = 1.0

    # Test control flow
    if version > 0.5:
        print(f"{name} v{version} is stable!")
    else:
        print(f"{name} v{version} is in development")

    # Test function calls
    result = factorial(5)
    print(f"5! = {result}")

    # Test class usage
    test = TestClass(21)
    print(f"Processed value: {test.process()}")

    return 0


if __name__ == "__main__":
    exit(main())
"""
def create_test_file(directory):
    """Write SAMPLE_CODE to 'test_code.py' inside *directory* and return its path."""
    os.makedirs(directory, exist_ok=True)
    target = os.path.join(directory, 'test_code.py')
    with open(target, 'w', encoding='utf-8') as handle:
        handle.write(SAMPLE_CODE)
    return target
def test_tokenizer():
    """Build a small tokenizer from the bundled sample corpus and verify it loads.

    Returns True on success, False when the sample data is missing or the
    build fails.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        # The sample corpus ships inside the package source tree.
        repo_root = os.path.dirname(os.path.dirname(__file__))
        sample_data_path = os.path.join(
            repo_root, 'src', 'nexforgetokenizer', 'data', 'python_code_sample.txt'
        )
        print(f"Using sample data file: {sample_data_path}")

        if not os.path.exists(sample_data_path):
            print(f"ERROR: Sample data file not found at {sample_data_path}")
            return False

        print(f"Sample file size: {os.path.getsize(sample_data_path)} bytes")

        # build_tokenizer consumes a directory, so hand it the sample's parent.
        data_dir = os.path.dirname(sample_data_path)
        print(f"Data directory: {data_dir}")

        # The tokenizer artifact goes into the throwaway temp directory.
        output_path = os.path.join(temp_dir, 'test_tokenizer.json')

        print("\nInitial memory usage:")
        log_memory_usage()

        # Probe the host so build_tokenizer can adapt to available hardware.
        resources = SystemResources()
        print("\nDetected system resources:")
        print(f"CPU Cores: {resources.cpu_cores}")
        print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
        if resources.has_cuda:
            print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
        else:
            print("No CUDA GPU detected")

        print("\nBuilding tokenizer...")
        built = build_tokenizer(
            input_dir=data_dir,
            output_path=output_path,
            vocab_size=1000,   # small vocabulary keeps the test fast
            min_frequency=1,   # keep every token for this smoke test
            resources=resources,
        )
        if not built:
            print("Failed to create tokenizer")
            return False

        print(f"\nTokenizer successfully created at: {output_path}")

        # Round-trip check: reload the artifact and tokenize the sample snippet.
        tokenizer = Tokenizer.from_file(output_path)
        print(f"Vocabulary size: {len(tokenizer.get_vocab())}")

        encoded = tokenizer.encode(SAMPLE_CODE)
        print("\nTokenized sample code:")
        print(f"Number of tokens: {len(encoded.ids)}")
        print(f"Average chars per token: {len(SAMPLE_CODE) / len(encoded.ids):.2f}")

        print("\nFinal memory usage:")
        log_memory_usage()
        return True
def main():
    """Run the tokenizer test and return a process exit code (0 = pass)."""
    print("NexForge Adaptive Tokenizer Test")
    print("==============================\n")
    if test_tokenizer():
        print("\nTest completed successfully!")
        return 0
    print("\nTest failed!")
    return 1
# Allow running this test directly as a script; propagate the exit code.
if __name__ == "__main__":
    sys.exit(main())