|
|
|
|
|
|
|
""" |
|
Simple test script for the NexForge Adaptive Tokenizer. |
|
|
|
This script demonstrates the basic usage of the adaptive tokenizer |
|
by creating a small sample Python file and building a tokenizer from it. |
|
""" |
|
|
|
import os |
|
import sys |
|
import logging |
|
from pathlib import Path |
|
import tempfile |
|
from tokenizers import Tokenizer |
|
|
|
|
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) |
|
|
|
from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage |
|
|
|
|
|
# Module-level logging setup: records at INFO and above are sent both to a
# StreamHandler and to ``tokenizer_test.log`` (a path relative to the current
# working directory).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        # Explicit UTF-8 so non-ASCII log text does not depend on the
        # platform's locale encoding.
        logging.FileHandler('tokenizer_test.log', encoding='utf-8'),
    ],
)
|
|
|
|
|
SAMPLE_CODE = """ |
|
# Comprehensive Python code test for tokenizer |
|
|
|
def factorial(n): |
|
\"\"\"Calculate factorial of n.\"\"\" |
|
if n <= 1: |
|
return 1 |
|
return n * factorial(n - 1) |
|
|
|
class TestClass: |
|
def __init__(self, value): |
|
self.value = value |
|
|
|
def process(self): |
|
\"\"\"Process the value and return result.\"\"\" |
|
return self.value * 2 |
|
|
|
def main(): |
|
# Test various Python constructs |
|
numbers = [1, 2, 3, 4, 5] |
|
squares = [x**2 for x in numbers] |
|
|
|
# Test string formatting |
|
name = "NexForge" |
|
version = 1.0 |
|
|
|
# Test control flow |
|
if version > 0.5: |
|
print(f"{name} v{version} is stable!") |
|
else: |
|
print(f"{name} v{version} is in development") |
|
|
|
# Test function calls |
|
result = factorial(5) |
|
print(f"5! = {result}") |
|
|
|
# Test class usage |
|
test = TestClass(21) |
|
print(f"Processed value: {test.process()}") |
|
|
|
return 0 |
|
|
|
if __name__ == "__main__": |
|
exit(main()) |
|
""" |
|
|
|
def create_test_file(directory):
    """Write ``SAMPLE_CODE`` to ``test_code.py`` inside *directory*.

    The directory (including any missing parents) is created first when it
    does not exist yet.

    Args:
        directory: Path of the folder that receives the file.

    Returns:
        The path of the written file, built with ``os.path.join``.
    """
    target = os.path.join(directory, 'test_code.py')
    os.makedirs(directory, exist_ok=True)
    Path(target).write_text(SAMPLE_CODE, encoding='utf-8')
    return target
|
|
|
def test_tokenizer():
    """Build a tokenizer from the packaged sample data and print statistics.

    The function looks for
    ``src/nexforgetokenizer/data/python_code_sample.txt`` under the parent of
    this script's directory, passes the directory containing that file to
    ``build_tokenizer`` as ``input_dir``, and writes the result into a
    temporary directory. On success the saved tokenizer is loaded again with
    ``Tokenizer.from_file`` and used to encode ``SAMPLE_CODE``. Progress,
    detected system resources and token statistics are printed to stdout.

    Returns:
        bool: ``True`` when ``build_tokenizer`` returned a truthy value and
        the statistics were printed; ``False`` when the sample data file is
        missing or ``build_tokenizer`` returned a falsy value.
    """
    # Resolve against the absolute script location so the lookup does not
    # depend on the current working directory when ``__file__`` is relative.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sample_data_path = os.path.join(
        project_root, 'src', 'nexforgetokenizer', 'data', 'python_code_sample.txt'
    )
    print(f"Using sample data file: {sample_data_path}")

    if not os.path.exists(sample_data_path):
        print(f"ERROR: Sample data file not found at {sample_data_path}")
        return False

    print(f"Sample file size: {os.path.getsize(sample_data_path)} bytes")

    # The builder receives the whole data directory, not the single file.
    data_dir = os.path.dirname(sample_data_path)
    print(f"Data directory: {data_dir}")

    # The tokenizer file only needs to live for the duration of this run.
    with tempfile.TemporaryDirectory() as temp_dir:
        output_path = os.path.join(temp_dir, 'test_tokenizer.json')

        print("\nInitial memory usage:")
        log_memory_usage()

        resources = SystemResources()
        print("\nDetected system resources:")
        print(f"CPU Cores: {resources.cpu_cores}")
        print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
        if resources.has_cuda:
            print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
        else:
            print("No CUDA GPU detected")

        print("\nBuilding tokenizer...")
        success = build_tokenizer(
            input_dir=data_dir,
            output_path=output_path,
            vocab_size=1000,
            min_frequency=1,
            resources=resources,
        )

        if not success:
            print("Failed to create tokenizer")
            return False

        print(f"\nTokenizer successfully created at: {output_path}")

        # Reload from disk so the statistics describe the saved artifact.
        tokenizer = Tokenizer.from_file(output_path)
        vocab_size = len(tokenizer.get_vocab())
        print(f"Vocabulary size: {vocab_size}")

        encoded = tokenizer.encode(SAMPLE_CODE)
        token_count = len(encoded.ids)
        print("\nTokenized sample code:")
        print(f"Number of tokens: {token_count}")
        # Guard the ratio: an encoding without tokens would divide by zero.
        if token_count:
            print(f"Average chars per token: {len(SAMPLE_CODE) / token_count:.2f}")
        else:
            print("Average chars per token: n/a (no tokens produced)")

        print("\nFinal memory usage:")
        log_memory_usage()

        return True
|
|
|
def main():
    """Run the tokenizer demo and turn its outcome into an exit status.

    Returns:
        int: ``0`` when ``test_tokenizer`` returns a truthy value,
        otherwise ``1``.
    """
    print("NexForge Adaptive Tokenizer Test")
    print("==============================\n")

    if not test_tokenizer():
        print("\nTest failed!")
        return 1

    print("\nTest completed successfully!")
    return 0
|
|
|
# Script entry point: the value returned by main() becomes the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|