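"""Smoke-test a trained `tokenizers` tokenizer on the first dataset chunk.

Loads `tokenizer.json`, encodes and decodes the first 1000 lines of the first
`*.txt` file found in the Dataset directory, and writes a plain-text summary
(character accuracy, tokens per line, a few sample round-trips) to the
test_result directory.

Expected layout (assumed from the paths used below), relative to the parent of
the directory the script is run from:

    output/tokenizer.json   trained tokenizer file
    Dataset/*.txt           chunked text files
    test_result/            created if missing; the summary .txt is written here
"""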
import sys
import time
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

from tokenizers import Tokenizer


def get_project_root() -> Path:
    """Return the project root directory (the current working directory)."""
    return Path.cwd()


def setup_paths() -> Tuple[Path, Path, Path]:
    """Set up and validate required paths.

    Returns:
        Tuple containing (tokenizer_path, data_dir, output_dir)
    """
    root = get_project_root()

    # All resources live in the parent of the project root.
    tokenizer_path = root.parent / 'output' / 'tokenizer.json'
    data_dir = root.parent / 'Dataset'
    output_dir = root.parent / 'test_result'

    output_dir.mkdir(parents=True, exist_ok=True)

    if not tokenizer_path.exists():
        print(f"Error: Tokenizer not found at {tokenizer_path}")
        sys.exit(1)

    if not data_dir.exists():
        print(f"Error: Data directory not found at {data_dir}")
        sys.exit(1)

    return tokenizer_path, data_dir, output_dir


def get_first_chunk_file(data_dir: Path) -> Optional[Path]:
    """Get the first chunk file from the data directory."""
    chunk_files = sorted(data_dir.glob('*.txt'))
    if not chunk_files:
        print(f"Error: No .txt files found in {data_dir}")
        return None
    return chunk_files[0]


def test_tokenizer_on_chunk(tokenizer: Tokenizer, chunk_path: Path, max_lines: int = 1000) -> Dict[str, Any]:
    """Test the tokenizer on the first max_lines of a chunk file."""
    results = {
        'total_lines': 0,
        'lines_processed': 0,
        'total_tokens': 0,
        'perfect_matches': 0,
        'total_chars': 0,
        'total_diff_chars': 0,
        'lines': []
    }

    try:
        with open(chunk_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= max_lines:
                    break

                line = line.strip()
                if not line:
                    continue

                # Round-trip the line through the tokenizer.
                encoding = tokenizer.encode(line)
                decoded = tokenizer.decode(encoding.ids)

                # Crude character diff: positional mismatches plus any
                # difference in length between original and decoded text.
                diff_chars = sum(1 for a, b in zip(line, decoded) if a != b)
                diff_chars += abs(len(line) - len(decoded))
                is_perfect = diff_chars == 0

                results['total_lines'] += 1
                results['lines_processed'] += 1
                results['total_tokens'] += len(encoding.tokens)
                results['total_chars'] += len(line)
                results['total_diff_chars'] += diff_chars
                results['perfect_matches'] += 1 if is_perfect else 0

                # Keep a few truncated samples for the report.
                if i < 5:
                    results['lines'].append({
                        'original': line[:200] + ('...' if len(line) > 200 else ''),
                        'decoded': decoded[:200] + ('...' if len(decoded) > 200 else ''),
                        'tokens': encoding.tokens[:10],
                        'is_perfect': is_perfect,
                        'diff_chars': diff_chars,
                        'similarity': 1 - (diff_chars / max(len(line), 1))
                    })

                if (i + 1) % 100 == 0:
                    print(f"Processed {i + 1} lines...")

    except Exception as e:
        print(f"Error processing file: {e}")

    return results


def print_summary(results: Dict[str, Any], output_path: Path) -> None:
    """Print the test summary and save it as a .txt file named after this script."""
    if not results['lines_processed']:
        print("No lines were processed.")
        return

    avg_tokens_per_line = results['total_tokens'] / results['lines_processed']
    total_chars = results['total_chars']
    total_diff_chars = results['total_diff_chars']
    accuracy = 1 - (total_diff_chars / total_chars) if total_chars > 0 else 0
    diff_percentage = (total_diff_chars / total_chars * 100) if total_chars > 0 else 0
    chars_per_token = total_chars / results['total_tokens'] if results['total_tokens'] else 0

    script_name = Path(__file__).stem

    summary = [
        "=" * 80,
        "TOKENIZER TEST SUMMARY",
        "=" * 80,
        f"Test Script: {script_name}.py",
        f"Timestamp: {results.get('timestamp', 'N/A')}",
        f"Tokenizer: {results.get('tokenizer_path', 'N/A')}",
        f"Chunk file: {results.get('chunk_file', 'N/A')}",
        "-" * 80,
        f"Lines processed: {results['lines_processed']}",
        f"Perfect matches: {results['perfect_matches']} ({results['perfect_matches'] / results['lines_processed'] * 100:.1f}%)",
        f"Average tokens/line: {avg_tokens_per_line:.2f}",
        f"Total characters: {total_chars:,}",
        f"Total tokens: {results['total_tokens']:,}",
        f"Character accuracy: {accuracy * 100:.2f}%",
        f"Character diff: {total_diff_chars:,} chars ({diff_percentage:.4f}%)",
        f"Chars per token: {chars_per_token:.2f} (higher means better compression)",
        "\nSAMPLE LINES:",
        "-" * 40
    ]

    for i, line in enumerate(results.get('lines', [])[:3]):
        summary.extend([
            f"\nSAMPLE {i + 1}:",
            f"Original: {line.get('original', '')}",
            f"Decoded: {line.get('decoded', '')}",
            f"Tokens: {', '.join(line.get('tokens', [])[:8])}{'...' if len(line.get('tokens', [])) > 8 else ''}",
            f"Match: {'✓ Perfect' if line.get('is_perfect') else '✗ Different'}",
            "-" * 40
        ])

    print("\n".join(summary))

    # Save the same summary alongside the other test results.
    timestamp = results.get('timestamp', '')
    output_file = output_path / f'{script_name}_result_{timestamp}.txt'

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("\n".join(summary))

    print(f"\nResults saved to: {output_file}")


def main():
    """Run the tokenizer round-trip test end to end."""
    tokenizer_path, data_dir, output_dir = setup_paths()

    chunk_path = get_first_chunk_file(data_dir)
    if not chunk_path:
        print(f"No files found in {data_dir}. Please ensure the Dataset directory contains text files.")
        return

    print(f"Found data directory: {data_dir}")
    print(f"Output directory: {output_dir}")
    print(f"Testing tokenizer on first 1000 lines of: {chunk_path.name}")

    print(f"Loading tokenizer from: {tokenizer_path}")
    tokenizer = Tokenizer.from_file(str(tokenizer_path))

    vocab = tokenizer.get_vocab()
    print(f"Vocabulary size: {len(vocab):,} tokens")

    print("\nTesting tokenizer on chunk...")
    results = test_tokenizer_on_chunk(tokenizer, chunk_path, max_lines=1000)

    # Attach metadata used by the summary report.
    results['timestamp'] = time.strftime("%Y%m%d_%H%M%S")
    results['tokenizer_path'] = str(tokenizer_path)
    results['chunk_file'] = str(chunk_path.name)

    print_summary(results, output_dir)
    print("\nTest complete!")


if __name__ == "__main__":
    main()