# File size: 7,931 Bytes
#
# 4265aea

import os
import sys
from pathlib import Path
from tokenizers import Tokenizer
from typing import Optional, Tuple, List, Dict, Any
import json

def get_project_root() -> Path:
    """Return the directory treated as the project root.

    The process's current working directory is used, so the result depends
    on where the script is launched from.
    """
    working_dir = Path.cwd()
    return working_dir

def setup_paths() -> Tuple[Path, Path, Path]:
    """Resolve and validate the paths required for a test run.

    All three paths live in the *parent* of the directory returned by
    ``get_project_root()``, so the script is expected to be launched from a
    sub-directory one level below the directory that holds ``output/``,
    ``Dataset/`` and ``test_result/``.

    Returns:
        Tuple containing (tokenizer_path, data_dir, output_dir).

    Raises:
        SystemExit: With exit code 1 when the tokenizer file or the data
            directory is missing.
    """
    base_dir = get_project_root().parent

    tokenizer_path = base_dir / 'output' / 'tokenizer.json'
    data_dir = base_dir / 'Dataset'
    output_dir = base_dir / 'test_result'

    # Validate the inputs before touching the filesystem so that a failed run
    # does not leave an empty output directory behind.
    if not tokenizer_path.is_file():
        print(f"Error: Tokenizer not found at {tokenizer_path}", file=sys.stderr)
        sys.exit(1)

    if not data_dir.is_dir():
        print(f"Error: Data directory not found at {data_dir}", file=sys.stderr)
        sys.exit(1)

    # Create the output directory if it doesn't exist
    output_dir.mkdir(parents=True, exist_ok=True)

    return tokenizer_path, data_dir, output_dir

def get_first_chunk_file(data_dir: Path) -> Optional[Path]:
    """Return the first ``.txt`` file in ``data_dir`` in sorted path order.

    Args:
        data_dir: Directory to search (non-recursively) for ``*.txt`` files.

    Returns:
        The smallest matching path, or ``None`` (after printing an error
        message) when the directory contains no ``.txt`` files.
    """
    # glob() also matches directories whose name ends in ".txt"; only regular
    # files can be opened later, so filter them out here.
    first_chunk = min(
        (path for path in data_dir.glob('*.txt') if path.is_file()),
        default=None,
    )
    if first_chunk is None:
        print(f"Error: No .txt files found in {data_dir}")
    return first_chunk

def _count_diff_chars(original: str, decoded: str) -> int:
    """Count position-wise character mismatches plus the length difference."""
    mismatches = sum(1 for a, b in zip(original, decoded) if a != b)
    return mismatches + abs(len(original) - len(decoded))


def test_tokenizer_on_chunk(tokenizer: Tokenizer, chunk_path: Path, max_lines: int = 1000) -> Dict[str, Any]:
    """Round-trip the first ``max_lines`` lines of a chunk file through the tokenizer.

    Each line is stripped of surrounding whitespace, encoded with
    ``tokenizer.encode`` and decoded again from ``encoding.ids``; the decoded
    text is then compared position by position with the stripped line.

    Args:
        tokenizer: Tokenizer providing ``encode`` and ``decode``.
        chunk_path: UTF-8 text file to read.
        max_lines: Maximum number of lines read from the file; blank lines
            count towards this limit.

    Returns:
        Dict with the keys:
            ``total_lines``: lines read from the file (blank ones included),
            ``lines_processed``: non-blank lines that were tokenized,
            ``total_tokens``, ``total_chars``, ``total_diff_chars``,
            ``perfect_matches``: running totals over processed lines,
            ``lines``: detailed records for the first 5 processed lines.
        If an error occurs while processing, the partial results gathered so
        far are returned.
    """
    max_samples = 5
    results = {
        'total_lines': 0,
        'lines_processed': 0,
        'total_tokens': 0,
        'perfect_matches': 0,
        'total_chars': 0,
        'total_diff_chars': 0,
        'lines': []
    }

    try:
        with open(chunk_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= max_lines:
                    break

                # Count every line read, including blank ones skipped below,
                # so this differs from 'lines_processed' when blanks exist.
                results['total_lines'] += 1

                line = line.strip()
                if not line:  # Skip empty lines
                    continue

                # Tokenize and decode
                encoding = tokenizer.encode(line)
                decoded = tokenizer.decode(encoding.ids)

                # Calculate differences
                diff_chars = _count_diff_chars(line, decoded)
                is_perfect = diff_chars == 0

                # Update results
                results['lines_processed'] += 1
                results['total_tokens'] += len(encoding.tokens)
                results['total_chars'] += len(line)
                results['total_diff_chars'] += diff_chars
                results['perfect_matches'] += 1 if is_perfect else 0

                # Keep details for the first few *processed* lines; capping on
                # the sample count keeps blank lines from eating into it.
                if len(results['lines']) < max_samples:
                    results['lines'].append({
                        'original': line[:200] + ('...' if len(line) > 200 else ''),
                        'decoded': decoded[:200] + ('...' if len(decoded) > 200 else ''),
                        'tokens': encoding.tokens[:10],  # First 10 tokens
                        'is_perfect': is_perfect,
                        'diff_chars': diff_chars,
                        'similarity': 1 - (diff_chars / max(len(line), 1))
                    })

                # Report progress on the processed count so a blank line at a
                # multiple of 100 cannot suppress the message.
                if results['lines_processed'] % 100 == 0:
                    print(f"Processed {results['lines_processed']} lines...")

    except Exception as e:
        # Deliberate best effort: report the problem and fall through to
        # return whatever statistics were collected before the failure.
        print(f"Error processing file: {e}")

    return results

def print_summary(results: Dict[str, Any], output_path: Path) -> None:
    """Print the test summary and save it as a TXT file named after this script.

    Args:
        results: Statistics dict. Required keys: ``lines_processed``,
            ``perfect_matches``, ``total_tokens``, ``total_chars`` and
            ``total_diff_chars``. Optional keys: ``timestamp``,
            ``tokenizer_path``, ``chunk_file`` and ``lines`` (sample records).
        output_path: Directory the report is written to; created if missing.
            The file is named ``<script>_result_<timestamp>.txt``.

    Returns:
        None. When ``lines_processed`` is 0 a notice is printed and no file
        is written.
    """
    lines_processed = results['lines_processed']
    if not lines_processed:
        print("No lines were processed.")
        return

    # Calculate statistics
    total_tokens = results['total_tokens']
    total_chars = results['total_chars']
    total_diff_chars = results['total_diff_chars']
    perfect_matches = results['perfect_matches']
    avg_tokens_per_line = total_tokens / lines_processed
    accuracy = 1 - (total_diff_chars / total_chars) if total_chars > 0 else 0
    diff_percentage = (total_diff_chars / total_chars * 100) if total_chars > 0 else 0
    # Guard the ratio: a tokenizer that emits no tokens must not crash the report.
    # More characters per token means better compression, hence "higher is better".
    if total_tokens > 0:
        chars_per_token = f"{total_chars / total_tokens:.2f} (higher is better)"
    else:
        chars_per_token = "N/A"

    # Get script name without extension
    script_name = Path(__file__).stem

    # Prepare summary text
    summary = [
        "="*80,
        "TOKENIZER TEST SUMMARY",
        "="*80,
        f"Test Script:       {script_name}.py",
        f"Timestamp:          {results.get('timestamp', 'N/A')}",
        f"Tokenizer:          {results.get('tokenizer_path', 'N/A')}",
        f"Chunk file:         {results.get('chunk_file', 'N/A')}",
        "-"*80,
        f"Lines processed:     {lines_processed}",
        f"Perfect matches:     {perfect_matches} ({perfect_matches/lines_processed*100:.1f}%)",
        f"Average tokens/line:  {avg_tokens_per_line:.2f}",
        f"Total characters:    {total_chars:,}",
        f"Total tokens:        {total_tokens:,}",
        f"Character accuracy:   {accuracy*100:.2f}%",
        f"Character diff:      {total_diff_chars:,} chars ({diff_percentage:.4f}%)",
        f"Chars per token:     {chars_per_token}",
        "\nSAMPLE LINES:",
        "-"*40
    ]

    # Add up to three sample lines, showing at most eight tokens each
    for i, sample in enumerate(results.get('lines', [])[:3]):
        tokens = sample.get('tokens', [])
        more_tokens = '...' if len(tokens) > 8 else ''
        summary.extend([
            f"\nSAMPLE {i+1}:",
            f"Original: {sample.get('original', '')}",
            f"Decoded:  {sample.get('decoded', '')}",
            f"Tokens:   {', '.join(tokens[:8])}{more_tokens}",
            f"Match:    {'✓ Perfect' if sample.get('is_perfect') else '✗ Different'}",
            "-"*40
        ])

    report = "\n".join(summary)

    # Print to console
    print(report)

    # Save as TXT with script name in filename
    timestamp = results.get('timestamp', '')
    output_path.mkdir(parents=True, exist_ok=True)
    output_file = output_path / f'{script_name}_result_{timestamp}.txt'

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(report)

    print(f"\nResults saved to: {output_file}")

def main():
    """Run the tokenizer round-trip test end to end.

    Resolves the required paths, picks the first chunk file, loads the
    tokenizer, runs the test on the first 1000 lines and prints/saves the
    summary. Returns early if no chunk file is available.
    """
    import time

    # Resolve the tokenizer file, dataset directory and result directory
    tokenizer_file, dataset_dir, results_dir = setup_paths()

    # Pick the chunk to test; bail out when the dataset has none
    first_chunk = get_first_chunk_file(dataset_dir)
    if first_chunk is None:
        print(f"No files found in {dataset_dir}. Please ensure the Dataset directory contains text files.")
        return

    print(f"Found data directory: {dataset_dir}")
    print(f"Output directory: {results_dir}")

    print(f"Testing tokenizer on first 1000 lines of: {first_chunk.name}")

    # Load the tokenizer and report its vocabulary size
    print(f"Loading tokenizer from: {tokenizer_file}")
    tokenizer = Tokenizer.from_file(str(tokenizer_file))
    vocab_size = len(tokenizer.get_vocab())
    print(f"Vocabulary size: {vocab_size:,} tokens")

    # Run the round-trip test
    print("\nTesting tokenizer on chunk...")
    stats = test_tokenizer_on_chunk(tokenizer, first_chunk, max_lines=1000)

    # Attach run metadata used by the summary and the output filename
    stats.update({
        'timestamp': time.strftime("%Y%m%d_%H%M%S"),
        'tokenizer_path': str(tokenizer_file),
        'chunk_file': str(first_chunk.name),
    })

    # Print and save summary
    print_summary(stats, results_dir)
    print("\nTest complete!")

# Run the test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()