#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Simple test script for the NexForge Adaptive Tokenizer.

This script demonstrates the basic usage of the adaptive tokenizer by
creating a small sample Python file and building a tokenizer from it.
"""

import os
import sys
import logging
from pathlib import Path
import tempfile

from tokenizers import Tokenizer

# Add the parent directory to the path so we can import the package
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage

# Configure logging: mirror everything to the console and a log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('tokenizer_test.log')
    ]
)

# Sample Python code used both as optional training data (create_test_file)
# and as the round-trip encoding probe in test_tokenizer().
SAMPLE_CODE = """
# Comprehensive Python code test for tokenizer
def factorial(n):
    \"\"\"Calculate factorial of n.\"\"\"
    if n <= 1:
        return 1
    return n * factorial(n - 1)

class TestClass:
    def __init__(self, value):
        self.value = value

    def process(self):
        \"\"\"Process the value and return result.\"\"\"
        return self.value * 2

def main():
    # Test various Python constructs
    numbers = [1, 2, 3, 4, 5]
    squares = [x**2 for x in numbers]

    # Test string formatting
    name = "NexForge"
    version = 1.0

    # Test control flow
    if version > 0.5:
        print(f"{name} v{version} is stable!")
    else:
        print(f"{name} v{version} is in development")

    # Test function calls
    result = factorial(5)
    print(f"5! = {result}")

    # Test class usage
    test = TestClass(21)
    print(f"Processed value: {test.process()}")

    return 0

if __name__ == "__main__":
    exit(main())
"""


def create_test_file(directory):
    """Create a test Python file containing SAMPLE_CODE.

    Args:
        directory: Target directory; created if it does not exist.

    Returns:
        str: Path to the written ``test_code.py`` file.
    """
    os.makedirs(directory, exist_ok=True)
    test_file = os.path.join(directory, 'test_code.py')
    with open(test_file, 'w', encoding='utf-8') as f:
        f.write(SAMPLE_CODE)
    return test_file


def test_tokenizer():
    """Test the adaptive tokenizer on the packaged sample Python file.

    Builds a small tokenizer from the bundled sample data, reloads it,
    and encodes SAMPLE_CODE as a smoke test.

    Returns:
        bool: True when the tokenizer was built and exercised successfully.
    """
    # Temporary directory holds the generated tokenizer JSON; it is
    # removed automatically when the test finishes.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Use the existing sample data shipped with the package.
        sample_data_path = os.path.join(
            os.path.dirname(os.path.dirname(__file__)),
            'src', 'nexforgetokenizer', 'data', 'python_code_sample.txt')
        print(f"Using sample data file: {sample_data_path}")

        # Guard clause: bail out early when the sample data is missing.
        if not os.path.exists(sample_data_path):
            print(f"ERROR: Sample data file not found at {sample_data_path}")
            return False

        print(f"Sample file size: {os.path.getsize(sample_data_path)} bytes")

        # build_tokenizer consumes a directory, not a single file.
        data_dir = os.path.dirname(sample_data_path)
        print(f"Data directory: {data_dir}")

        # Output path for the tokenizer
        output_path = os.path.join(temp_dir, 'test_tokenizer.json')

        # Log initial memory usage
        print("\nInitial memory usage:")
        log_memory_usage()

        # Detect system resources
        resources = SystemResources()
        print("\nDetected system resources:")
        print(f"CPU Cores: {resources.cpu_cores}")
        print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
        if resources.has_cuda:
            print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
        else:
            print("No CUDA GPU detected")

        # Build the tokenizer using the existing sample data directory.
        print("\nBuilding tokenizer...")
        success = build_tokenizer(
            input_dir=data_dir,
            output_path=output_path,
            vocab_size=1000,  # Small vocabulary for quick testing
            min_frequency=1,  # Include all tokens for this test
            resources=resources
        )

        if not success:
            print("Failed to create tokenizer")
            return False

        print(f"\nTokenizer successfully created at: {output_path}")

        # Load the tokenizer back and verify it round-trips.
        tokenizer = Tokenizer.from_file(output_path)
        vocab_size = len(tokenizer.get_vocab())
        print(f"Vocabulary size: {vocab_size}")

        # Smoke-test tokenization on the sample code.
        encoded = tokenizer.encode(SAMPLE_CODE)
        print("\nTokenized sample code:")
        print(f"Number of tokens: {len(encoded.ids)}")
        # Guard: avoid ZeroDivisionError if encoding unexpectedly yields
        # no tokens (e.g. a degenerate vocabulary).
        if encoded.ids:
            print(f"Average chars per token: {len(SAMPLE_CODE) / len(encoded.ids):.2f}")

        # Log final memory usage
        print("\nFinal memory usage:")
        log_memory_usage()
        return True


def main():
    """Run the tokenizer test and report an exit status (0 ok, 1 failed)."""
    print("NexForge Adaptive Tokenizer Test")
    print("==============================\n")
    result = test_tokenizer()
    if result:
        print("\nTest completed successfully!")
        return 0
    else:
        print("\nTest failed!")
        return 1


if __name__ == "__main__":
    sys.exit(main())