# EZ-Tokenizer/Test_tokenizer/test_tokenizer_simple.py
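"""Quick round-trip sanity check for a trained tokenizer.

Loads output/tokenizer.json from the project root, encodes and decodes the
first 1000 lines of the first .txt file found in Dataset/ (empty lines are
skipped), and writes an accuracy summary to test_result/.
"""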
import sys
import time
from pathlib import Path
from typing import Optional, Tuple, List, Dict, Any

from tokenizers import Tokenizer
def get_project_root() -> Path:
    """Return the directory the script is run from (assumed to be Test_tokenizer/)."""
    # setup_paths() resolves the actual project root one level up from here.
    return Path.cwd()
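
# Assumed directory layout (inferred from the paths used below; adjust if your tree differs):
#
#   EZ-Tokenizer/
#   +-- output/tokenizer.json   <- trained tokenizer under test
#   +-- Dataset/*.txt           <- text chunks to round-trip
#   +-- test_result/            <- summaries are written here
#   +-- Test_tokenizer/         <- run this script from inside this directory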
def setup_paths() -> Tuple[Path, Path, Path]:
"""Set up and validate required paths.
Returns:
Tuple containing (tokenizer_path, data_dir, output_dir)
"""
root = get_project_root()
    # Define paths - all inputs and outputs live one level up from Test_tokenizer/, in the project root
    tokenizer_path = root.parent / 'output' / 'tokenizer.json'
    data_dir = root.parent / 'Dataset'
    output_dir = root.parent / 'test_result'
# Create output directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)
# Validate paths
if not tokenizer_path.exists():
print(f"Error: Tokenizer not found at {tokenizer_path}")
sys.exit(1)
if not data_dir.exists():
print(f"Error: Data directory not found at {data_dir}")
sys.exit(1)
return tokenizer_path, data_dir, output_dir
def get_first_chunk_file(data_dir: Path) -> Optional[Path]:
"""Get the first chunk file from the data directory."""
# Look for .txt files in the data directory
chunk_files = sorted(list(data_dir.glob('*.txt')))
if not chunk_files:
print(f"Error: No .txt files found in {data_dir}")
return None
return chunk_files[0] # Return the first chunk file
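
# Note: sorted() orders paths lexicographically, so 'chunk_10.txt' comes before
# 'chunk_2.txt'. If the chunks carry numeric suffixes and that ordering matters,
# a natural-sort key could be passed to sorted() instead. A minimal sketch
# (hypothetical helper, not called anywhere in this script):
def _natural_sort_key(path: Path) -> List[Any]:
    """Split a filename into text and integer runs so numbers compare numerically."""
    import re
    return [int(part) if part.isdigit() else part.lower()
            for part in re.split(r'(\d+)', path.name)]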
def test_tokenizer_on_chunk(tokenizer: Tokenizer, chunk_path: Path, max_lines: int = 1000) -> Dict[str, Any]:
"""Test the tokenizer on the first max_lines of a chunk file."""
results = {
'total_lines': 0,
'lines_processed': 0,
'total_tokens': 0,
'perfect_matches': 0,
'total_chars': 0,
'total_diff_chars': 0,
'lines': []
}
try:
with open(chunk_path, 'r', encoding='utf-8') as f:
for i, line in enumerate(f):
if i >= max_lines:
break
line = line.strip()
if not line: # Skip empty lines
continue
# Tokenize and decode
encoding = tokenizer.encode(line)
decoded = tokenizer.decode(encoding.ids)
# Calculate differences
diff_chars = sum(1 for a, b in zip(line, decoded) if a != b)
diff_chars += abs(len(line) - len(decoded))
is_perfect = diff_chars == 0
# Update results
results['total_lines'] += 1
results['lines_processed'] += 1
results['total_tokens'] += len(encoding.tokens)
results['total_chars'] += len(line)
results['total_diff_chars'] += diff_chars
results['perfect_matches'] += 1 if is_perfect else 0
# Store detailed results for the first few lines
if i < 5: # First 5 lines
results['lines'].append({
'original': line[:200] + ('...' if len(line) > 200 else ''),
'decoded': decoded[:200] + ('...' if len(decoded) > 200 else ''),
'tokens': encoding.tokens[:10], # First 10 tokens
'is_perfect': is_perfect,
'diff_chars': diff_chars,
'similarity': 1 - (diff_chars / max(len(line), 1))
})
# Print progress
if (i + 1) % 100 == 0:
print(f"Processed {i+1} lines...")
except Exception as e:
print(f"Error processing file: {e}")
return results
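
# Note: diff_chars above is a strict positional comparison (via zip) plus the
# length difference, so one inserted or dropped character early in a line makes
# every following character count as different. If a gentler, order-tolerant
# metric is preferred, the standard library's difflib can provide one. A minimal
# sketch (not called anywhere in this script):
def sequence_similarity(original: str, decoded: str) -> float:
    """Return a 0.0-1.0 ratio based on the longest matching blocks of two strings."""
    from difflib import SequenceMatcher
    return SequenceMatcher(None, original, decoded).ratio()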
def print_summary(results: Dict[str, Any], output_path: Path) -> None:
"""Print and save test summary in TXT format with script name in the filename."""
if not results['lines_processed']:
print("No lines were processed.")
return
# Calculate statistics
avg_tokens_per_line = results['total_tokens'] / results['lines_processed']
total_chars = results['total_chars']
total_diff_chars = results['total_diff_chars']
accuracy = 1 - (total_diff_chars / total_chars) if total_chars > 0 else 0
diff_percentage = (total_diff_chars / total_chars * 100) if total_chars > 0 else 0
# Get script name without extension
script_name = Path(__file__).stem
# Prepare summary text
summary = [
"="*80,
"TOKENIZER TEST SUMMARY",
"="*80,
f"Test Script: {script_name}.py",
f"Timestamp: {results.get('timestamp', 'N/A')}",
f"Tokenizer: {results.get('tokenizer_path', 'N/A')}",
f"Chunk file: {results.get('chunk_file', 'N/A')}",
"-"*80,
f"Lines processed: {results['lines_processed']}",
f"Perfect matches: {results['perfect_matches']} ({results['perfect_matches']/results['lines_processed']*100:.1f}%)",
f"Average tokens/line: {avg_tokens_per_line:.2f}",
f"Total characters: {total_chars:,}",
f"Total tokens: {results['total_tokens']:,}",
f"Character accuracy: {accuracy*100:.2f}%",
f"Character diff: {total_diff_chars:,} chars ({diff_percentage:.4f}%)",
f"Chars per token: {total_chars/results['total_tokens']:.2f} (lower is better)",
"\nSAMPLE LINES:",
"-"*40
]
# Add sample lines
for i, line in enumerate(results.get('lines', [])[:3]):
summary.extend([
f"\nSAMPLE {i+1}:",
f"Original: {line.get('original', '')}",
f"Decoded: {line.get('decoded', '')}",
f"Tokens: {', '.join(line.get('tokens', [])[:8])}{'...' if len(line.get('tokens', [])) > 8 else ''}",
f"Match: {'✓ Perfect' if line.get('is_perfect') else '✗ Different'}",
"-"*40
])
# Print to console
print("\n".join(summary))
# Save as TXT with script name in filename
timestamp = results.get('timestamp', '')
output_file = output_path / f'{script_name}_result_{timestamp}.txt'
with open(output_file, 'w', encoding='utf-8') as f:
f.write("\n".join(summary))
print(f"\nResults saved to: {output_file}")
def main():
# Set up paths
tokenizer_path, data_dir, output_dir = setup_paths()
# Get the first chunk file
chunk_path = get_first_chunk_file(data_dir)
if not chunk_path:
print(f"No files found in {data_dir}. Please ensure the Dataset directory contains text files.")
return
print(f"Found data directory: {data_dir}")
print(f"Output directory: {output_dir}")
print(f"Testing tokenizer on first 1000 lines of: {chunk_path.name}")
# Load the tokenizer
print(f"Loading tokenizer from: {tokenizer_path}")
tokenizer = Tokenizer.from_file(str(tokenizer_path))
# Get vocabulary info
vocab = tokenizer.get_vocab()
print(f"Vocabulary size: {len(vocab):,} tokens")
# Test tokenizer on the chunk
print("\nTesting tokenizer on chunk...")
results = test_tokenizer_on_chunk(tokenizer, chunk_path, max_lines=1000)
# Add timestamp and tokenizer info to results
results['timestamp'] = time.strftime("%Y%m%d_%H%M%S")
results['tokenizer_path'] = str(tokenizer_path)
results['chunk_file'] = str(chunk_path.name)
# Print and save summary
print_summary(results, output_dir)
print("\nTest complete!")
if __name__ == "__main__":
main()
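
# Example invocation (assuming the layout sketched near the top of this file):
#   cd EZ-Tokenizer/Test_tokenizer
#   python test_tokenizer_simple.py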