#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Simple test script for the NexForge Adaptive Tokenizer.
This script demonstrates the basic usage of the adaptive tokenizer
by creating a small sample Python file and building a tokenizer from it.
"""
import os
import sys
import logging
import tempfile

from tokenizers import Tokenizer

# Add the parent directory to the path so we can import the package
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('tokenizer_test.log')
    ]
)
# Sample Python code for testing
SAMPLE_CODE = """
# Comprehensive Python code test for tokenizer

def factorial(n):
    \"\"\"Calculate factorial of n.\"\"\"
    if n <= 1:
        return 1
    return n * factorial(n - 1)


class TestClass:
    def __init__(self, value):
        self.value = value

    def process(self):
        \"\"\"Process the value and return result.\"\"\"
        return self.value * 2


def main():
    # Test various Python constructs
    numbers = [1, 2, 3, 4, 5]
    squares = [x**2 for x in numbers]

    # Test string formatting
    name = "NexForge"
    version = 1.0

    # Test control flow
    if version > 0.5:
        print(f"{name} v{version} is stable!")
    else:
        print(f"{name} v{version} is in development")

    # Test function calls
    result = factorial(5)
    print(f"5! = {result}")

    # Test class usage
    test = TestClass(21)
    print(f"Processed value: {test.process()}")
    return 0


if __name__ == "__main__":
    exit(main())
"""


def create_test_file(directory):
    """Create a test Python file in the specified directory."""
    os.makedirs(directory, exist_ok=True)
    test_file = os.path.join(directory, 'test_code.py')
    with open(test_file, 'w', encoding='utf-8') as f:
        f.write(SAMPLE_CODE)
    return test_file
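
# Note: create_test_file is a convenience helper; the test below uses the
# bundled sample data instead. A hypothetical ad-hoc invocation:
#
#     path = create_test_file('/tmp/nexforge_scratch')  # assumed scratch dir
#     print(path)  # -> /tmp/nexforge_scratch/test_code.py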


def test_tokenizer():
    """Test the adaptive tokenizer on a sample Python file."""
    # Create a temporary directory for our test output
    with tempfile.TemporaryDirectory() as temp_dir:
        # Use the existing sample data
        sample_data_path = os.path.join(
            os.path.dirname(os.path.dirname(__file__)),
            'src', 'nexforgetokenizer', 'data', 'python_code_sample.txt')
        print(f"Using sample data file: {sample_data_path}")

        # Verify the sample file exists
        if not os.path.exists(sample_data_path):
            print(f"ERROR: Sample data file not found at {sample_data_path}")
            return False
        print(f"Sample file size: {os.path.getsize(sample_data_path)} bytes")

        # Directory containing the sample file
        data_dir = os.path.dirname(sample_data_path)
        print(f"Data directory: {data_dir}")

        # Output path for the tokenizer
        output_path = os.path.join(temp_dir, 'test_tokenizer.json')

        # Log initial memory usage
        print("\nInitial memory usage:")
        log_memory_usage()

        # Detect system resources
        resources = SystemResources()
        print("\nDetected system resources:")
        print(f"CPU Cores: {resources.cpu_cores}")
        print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
        if resources.has_cuda:
            print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
        else:
            print("No CUDA GPU detected")

        # Build the tokenizer using the existing sample data directory
        print("\nBuilding tokenizer...")
        success = build_tokenizer(
            input_dir=data_dir,
            output_path=output_path,
            vocab_size=1000,   # Small vocabulary for quick testing
            min_frequency=1,   # Include all tokens for this test
            resources=resources
        )

        if success:
            print(f"\nTokenizer successfully created at: {output_path}")

            # Load the tokenizer and test it
            tokenizer = Tokenizer.from_file(output_path)
            vocab_size = len(tokenizer.get_vocab())
            print(f"Vocabulary size: {vocab_size}")

            # Test tokenization
            encoded = tokenizer.encode(SAMPLE_CODE)
            print("\nTokenized sample code:")
            print(f"Number of tokens: {len(encoded.ids)}")
            print(f"Average chars per token: {len(SAMPLE_CODE) / len(encoded.ids):.2f}")

            # Log final memory usage
            print("\nFinal memory usage:")
            log_memory_usage()
            return True
        else:
            print("Failed to create tokenizer")
            return False


def main():
    """Main function to run the test."""
    print("NexForge Adaptive Tokenizer Test")
    print("================================\n")
    result = test_tokenizer()
    if result:
        print("\nTest completed successfully!")
        return 0
    else:
        print("\nTest failed!")
        return 1


if __name__ == "__main__":
    sys.exit(main())