File size: 5,384 Bytes

4265aea

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Simple test script for the NexForge Adaptive Tokenizer.

This script demonstrates the basic usage of the adaptive tokenizer by
building a small tokenizer from the existing sample data directory
(src/nexforgetokenizer/data) and using it to encode an embedded Python
snippet.
"""

import os
import sys
import logging
from pathlib import Path
import tempfile
from tokenizers import Tokenizer

# Make the package importable when running from a source checkout.
# The parent directory keeps first priority (as before); its ``src``
# sub-directory is added right behind it because the sample-data path used
# further down in this script places the package under <parent>/src.
_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path[:0] = [_PROJECT_ROOT, os.path.join(_PROJECT_ROOT, 'src')]

from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage

# Root logger setup: INFO and above go both to the console (stderr) and to
# ``tokenizer_test.log`` in the current working directory.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.StreamHandler(), logging.FileHandler('tokenizer_test.log')],
)

# Sample Python source kept as plain text. In this file it is the input that
# test_tokenizer() encodes and the content that create_test_file() writes to
# disk; the code in this file never executes it.
SAMPLE_CODE = """
# Comprehensive Python code test for tokenizer

def factorial(n):
    \"\"\"Calculate factorial of n.\"\"\"
    if n <= 1:
        return 1
    return n * factorial(n - 1)

class TestClass:
    def __init__(self, value):
        self.value = value
    
    def process(self):
        \"\"\"Process the value and return result.\"\"\"
        return self.value * 2

def main():
    # Test various Python constructs
    numbers = [1, 2, 3, 4, 5]
    squares = [x**2 for x in numbers]
    
    # Test string formatting
    name = "NexForge"
    version = 1.0
    
    # Test control flow
    if version > 0.5:
        print(f"{name} v{version} is stable!")
    else:
        print(f"{name} v{version} is in development")
    
    # Test function calls
    result = factorial(5)
    print(f"5! = {result}")
    
    # Test class usage
    test = TestClass(21)
    print(f"Processed value: {test.process()}")
    
    return 0

if __name__ == "__main__":
    exit(main())
"""

def create_test_file(directory):
    """Write ``SAMPLE_CODE`` to ``test_code.py`` inside *directory*.

    The directory (and any missing parents) is created first; an existing
    ``test_code.py`` is overwritten.

    Args:
        directory: Folder that should receive the file.

    Returns:
        The path of the written file, as produced by ``os.path.join``.
    """
    os.makedirs(directory, exist_ok=True)
    target = os.path.join(directory, 'test_code.py')
    # write_text opens in text mode 'w' and closes the handle for us.
    Path(target).write_text(SAMPLE_CODE, encoding='utf-8')
    return target

def test_tokenizer():
    """Build a tokenizer from the existing sample data and exercise it.

    The sample file ``src/nexforgetokenizer/data/python_code_sample.txt`` is
    looked up relative to the parent of this script's directory. The
    directory holding it is passed to ``build_tokenizer``; the resulting
    tokenizer is written to a temporary directory, loaded back and used to
    encode ``SAMPLE_CODE``. Progress and statistics are printed to stdout.

    Returns:
        bool: ``True`` when the tokenizer was built, reloaded and produced
        at least one token for ``SAMPLE_CODE``; ``False`` when the sample
        data file is missing, ``build_tokenizer`` reports failure, or the
        encoding comes back empty.
    """
    # Resolve against the absolute script location so the lookup does not
    # depend on the current working directory (``__file__`` may be a relative
    # path when the script is executed directly on older Python versions).
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sample_data_path = os.path.join(
        project_root, 'src', 'nexforgetokenizer', 'data', 'python_code_sample.txt'
    )
    print(f"Using sample data file: {sample_data_path}")

    # Nothing to train on: report and stop before allocating anything.
    if not os.path.exists(sample_data_path):
        print(f"ERROR: Sample data file not found at {sample_data_path}")
        return False

    print(f"Sample file size: {os.path.getsize(sample_data_path)} bytes")

    # build_tokenizer is given a directory, so hand it the one that holds
    # the sample file.
    data_dir = os.path.dirname(sample_data_path)
    print(f"Data directory: {data_dir}")

    # The tokenizer file is only needed for the duration of this test, so it
    # lives in a temporary directory that is removed on exit.
    with tempfile.TemporaryDirectory() as temp_dir:
        output_path = os.path.join(temp_dir, 'test_tokenizer.json')

        print("\nInitial memory usage:")
        log_memory_usage()

        # Detect system resources and report them
        resources = SystemResources()
        print("\nDetected system resources:")
        print(f"CPU Cores: {resources.cpu_cores}")
        print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
        if resources.has_cuda:
            print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
        else:
            print("No CUDA GPU detected")

        print("\nBuilding tokenizer...")
        success = build_tokenizer(
            input_dir=data_dir,
            output_path=output_path,
            vocab_size=1000,  # Small vocabulary for quick testing
            min_frequency=1,  # Include all tokens for this test
            resources=resources,
        )
        if not success:
            print("Failed to create tokenizer")
            return False

        print(f"\nTokenizer successfully created at: {output_path}")

        # Load the tokenizer back from disk and inspect it
        tokenizer = Tokenizer.from_file(output_path)
        vocab_size = len(tokenizer.get_vocab())
        print(f"Vocabulary size: {vocab_size}")

        # Encode the embedded sample and report basic statistics
        encoded = tokenizer.encode(SAMPLE_CODE)
        token_count = len(encoded.ids)
        print("\nTokenized sample code:")
        print(f"Number of tokens: {token_count}")
        if token_count == 0:
            # An empty encoding means the tokenizer is unusable; report it as
            # a failure instead of crashing on the division below.
            print("ERROR: Tokenizer produced no tokens for the sample code")
            return False
        print(f"Average chars per token: {len(SAMPLE_CODE) / token_count:.2f}")

        print("\nFinal memory usage:")
        log_memory_usage()

        return True

def main():
    """Run the tokenizer test and map its outcome to a process exit code.

    Returns:
        int: 0 when ``test_tokenizer()`` returns a truthy value, 1 otherwise.
    """
    print("NexForge Adaptive Tokenizer Test")
    print("==============================\n")

    # Failure path first; success falls through.
    if not test_tokenizer():
        print("\nTest failed!")
        return 1

    print("\nTest completed successfully!")
    return 0

# Script entry point: exit with main()'s return value as the status code.
if __name__ == "__main__":
    raise SystemExit(main())