"""Sanity-check a trained tokenizer by round-tripping the first lines of a chunk file.

Loads ``output/tokenizer.json``, encodes and decodes the first 1000 lines of the
first ``*.txt`` file found in ``Dataset/``, and writes a plain-text summary to
``test_result/`` (all resolved one level above the current working directory).
"""

import sys
import time
from pathlib import Path
from typing import Optional, Tuple, Dict, Any

from tokenizers import Tokenizer


def get_project_root() -> Path:
    """Get the base directory for path resolution."""
    # Use the current working directory; project-level folders live one level up.
    return Path.cwd()


def setup_paths() -> Tuple[Path, Path, Path]:
    """Set up and validate required paths.

    Returns:
        Tuple containing (tokenizer_path, data_dir, output_dir)
    """
    root = get_project_root()

    # Resolve paths one level above the working directory
    # (the project root when the script is run from Test_tokenizer/).
    tokenizer_path = root.parent / 'output' / 'tokenizer.json'
    data_dir = root.parent / 'Dataset'        # Input chunk files
    output_dir = root.parent / 'test_result'  # Where summaries are written

    # Create the output directory if it doesn't exist
    output_dir.mkdir(parents=True, exist_ok=True)

    # Validate paths
    if not tokenizer_path.exists():
        print(f"Error: Tokenizer not found at {tokenizer_path}")
        sys.exit(1)
    if not data_dir.exists():
        print(f"Error: Data directory not found at {data_dir}")
        sys.exit(1)

    return tokenizer_path, data_dir, output_dir


def get_first_chunk_file(data_dir: Path) -> Optional[Path]:
    """Get the first chunk file from the data directory."""
    # Look for .txt files in the data directory
    chunk_files = sorted(data_dir.glob('*.txt'))
    if not chunk_files:
        print(f"Error: No .txt files found in {data_dir}")
        return None
    return chunk_files[0]  # Return the first chunk file


def test_tokenizer_on_chunk(tokenizer: Tokenizer, chunk_path: Path,
                            max_lines: int = 1000) -> Dict[str, Any]:
    """Test the tokenizer on the first max_lines of a chunk file."""
    results = {
        'total_lines': 0,
        'lines_processed': 0,
        'total_tokens': 0,
        'perfect_matches': 0,
        'total_chars': 0,
        'total_diff_chars': 0,
        'lines': []
    }

    try:
        with open(chunk_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= max_lines:
                    break
                line = line.strip()
                if not line:  # Skip empty lines
                    continue

                # Tokenize and decode
                encoding = tokenizer.encode(line)
                decoded = tokenizer.decode(encoding.ids)

                # Count positions where the round-trip differs, plus any length mismatch
                diff_chars = sum(1 for a, b in zip(line, decoded) if a != b)
                diff_chars += abs(len(line) - len(decoded))
                is_perfect = diff_chars == 0

                # Update results
                results['total_lines'] += 1
                results['lines_processed'] += 1
                results['total_tokens'] += len(encoding.tokens)
                results['total_chars'] += len(line)
                results['total_diff_chars'] += diff_chars
                results['perfect_matches'] += 1 if is_perfect else 0

                # Store detailed results for the first few lines
                if i < 5:  # First 5 lines
                    results['lines'].append({
                        'original': line[:200] + ('...' if len(line) > 200 else ''),
                        'decoded': decoded[:200] + ('...' if len(decoded) > 200 else ''),
                        'tokens': encoding.tokens[:10],  # First 10 tokens
                        'is_perfect': is_perfect,
                        'diff_chars': diff_chars,
                        'similarity': 1 - (diff_chars / max(len(line), 1))
                    })

                # Print progress
                if (i + 1) % 100 == 0:
                    print(f"Processed {i + 1} lines...")
    except Exception as e:
        print(f"Error processing file: {e}")

    return results


def print_summary(results: Dict[str, Any], output_path: Path) -> None:
    """Print and save the test summary as TXT, with the script name in the filename."""
    if not results['lines_processed']:
        print("No lines were processed.")
        return

    # Calculate statistics
    avg_tokens_per_line = results['total_tokens'] / results['lines_processed']
    total_chars = results['total_chars']
    total_diff_chars = results['total_diff_chars']
    accuracy = 1 - (total_diff_chars / total_chars) if total_chars > 0 else 0
    diff_percentage = (total_diff_chars / total_chars * 100) if total_chars > 0 else 0

    # Get script name without extension
    script_name = Path(__file__).stem

    # Prepare summary text
    summary = [
        "=" * 80,
        "TOKENIZER TEST SUMMARY",
        "=" * 80,
        f"Test Script: {script_name}.py",
        f"Timestamp: {results.get('timestamp', 'N/A')}",
        f"Tokenizer: {results.get('tokenizer_path', 'N/A')}",
        f"Chunk file: {results.get('chunk_file', 'N/A')}",
        "-" * 80,
        f"Lines processed: {results['lines_processed']}",
        f"Perfect matches: {results['perfect_matches']} "
        f"({results['perfect_matches'] / results['lines_processed'] * 100:.1f}%)",
        f"Average tokens/line: {avg_tokens_per_line:.2f}",
        f"Total characters: {total_chars:,}",
        f"Total tokens: {results['total_tokens']:,}",
        f"Character accuracy: {accuracy * 100:.2f}%",
        f"Character diff: {total_diff_chars:,} chars ({diff_percentage:.4f}%)",
        f"Chars per token: {total_chars / results['total_tokens']:.2f} (higher means better compression)",
        "\nSAMPLE LINES:",
        "-" * 40
    ]

    # Add sample lines
    for i, line in enumerate(results.get('lines', [])[:3]):
        summary.extend([
            f"\nSAMPLE {i + 1}:",
            f"Original: {line.get('original', '')}",
            f"Decoded: {line.get('decoded', '')}",
            f"Tokens: {', '.join(line.get('tokens', [])[:8])}"
            f"{'...' if len(line.get('tokens', [])) > 8 else ''}",
            f"Match: {'✓ Perfect' if line.get('is_perfect') else '✗ Different'}",
            "-" * 40
        ])

    # Print to console
    print("\n".join(summary))

    # Save as TXT with the script name in the filename
    timestamp = results.get('timestamp', '')
    output_file = output_path / f'{script_name}_result_{timestamp}.txt'
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("\n".join(summary))
    print(f"\nResults saved to: {output_file}")


def main():
    # Set up paths
    tokenizer_path, data_dir, output_dir = setup_paths()

    # Get the first chunk file
    chunk_path = get_first_chunk_file(data_dir)
    if not chunk_path:
        print(f"No files found in {data_dir}. Please ensure the Dataset directory contains text files.")
        return

    print(f"Found data directory: {data_dir}")
    print(f"Output directory: {output_dir}")
    print(f"Testing tokenizer on first 1000 lines of: {chunk_path.name}")

    # Load the tokenizer
    print(f"Loading tokenizer from: {tokenizer_path}")
    tokenizer = Tokenizer.from_file(str(tokenizer_path))

    # Get vocabulary info
    vocab = tokenizer.get_vocab()
    print(f"Vocabulary size: {len(vocab):,} tokens")

    # Test the tokenizer on the chunk
    print("\nTesting tokenizer on chunk...")
    results = test_tokenizer_on_chunk(tokenizer, chunk_path, max_lines=1000)

    # Add timestamp and tokenizer info to the results
    results['timestamp'] = time.strftime("%Y%m%d_%H%M%S")
    results['tokenizer_path'] = str(tokenizer_path)
    results['chunk_file'] = str(chunk_path.name)

    # Print and save summary
    print_summary(results, output_dir)
    print("\nTest complete!")


if __name__ == "__main__":
    main()
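
# Expected on-disk layout, inferred from setup_paths() above (an assumption based on
# how the paths are resolved relative to the parent of the working directory; run the
# script from inside Test_tokenizer/ so that parent is the project root):
#
#   <project root>/
#       Test_tokenizer/          # working directory when this script is run
#       output/tokenizer.json    # trained tokenizer under test
#       Dataset/*.txt            # chunk files; the first file (sorted) is sampled
#       test_result/             # summary .txt reports are written here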