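"""Smoke-test a trained `tokenizers` tokenizer on the first dataset chunk.

Loads `tokenizer.json`, encodes and decodes the first 1000 lines of the first
`*.txt` file found in the Dataset directory, and writes a plain-text summary
(character accuracy, tokens per line, a few sample round-trips) to the
test_result directory.

Expected layout (assumed from the paths used below), relative to the parent of
the directory the script is run from:

    output/tokenizer.json   trained tokenizer file
    Dataset/*.txt           chunked text files
    test_result/            created if missing; the summary .txt is written here
"""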
import sys
import time
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

from tokenizers import Tokenizer


def get_project_root() -> Path:
    """Return the project root directory (the current working directory)."""
    return Path.cwd()


def setup_paths() -> Tuple[Path, Path, Path]:
    """Set up and validate required paths.

    Returns:
        Tuple containing (tokenizer_path, data_dir, output_dir)
    """
    root = get_project_root()

    # All resources live in the parent of the project root.
    tokenizer_path = root.parent / 'output' / 'tokenizer.json'
    data_dir = root.parent / 'Dataset'
    output_dir = root.parent / 'test_result'

    output_dir.mkdir(parents=True, exist_ok=True)

    if not tokenizer_path.exists():
        print(f"Error: Tokenizer not found at {tokenizer_path}")
        sys.exit(1)

    if not data_dir.exists():
        print(f"Error: Data directory not found at {data_dir}")
        sys.exit(1)

    return tokenizer_path, data_dir, output_dir


def get_first_chunk_file(data_dir: Path) -> Optional[Path]:
    """Get the first chunk file from the data directory."""
    chunk_files = sorted(data_dir.glob('*.txt'))
    if not chunk_files:
        print(f"Error: No .txt files found in {data_dir}")
        return None
    return chunk_files[0]


def test_tokenizer_on_chunk(tokenizer: Tokenizer, chunk_path: Path, max_lines: int = 1000) -> Dict[str, Any]:
    """Test the tokenizer on the first max_lines of a chunk file."""
    results = {
        'total_lines': 0,
        'lines_processed': 0,
        'total_tokens': 0,
        'perfect_matches': 0,
        'total_chars': 0,
        'total_diff_chars': 0,
        'lines': []
    }

    try:
        with open(chunk_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= max_lines:
                    break

                line = line.strip()
                if not line:
                    continue

                # Round-trip the line through the tokenizer.
                encoding = tokenizer.encode(line)
                decoded = tokenizer.decode(encoding.ids)

                # Crude character diff: positional mismatches plus any
                # difference in length between original and decoded text.
                diff_chars = sum(1 for a, b in zip(line, decoded) if a != b)
                diff_chars += abs(len(line) - len(decoded))
                is_perfect = diff_chars == 0

                results['total_lines'] += 1
                results['lines_processed'] += 1
                results['total_tokens'] += len(encoding.tokens)
                results['total_chars'] += len(line)
                results['total_diff_chars'] += diff_chars
                results['perfect_matches'] += 1 if is_perfect else 0

                # Keep a few truncated samples for the report.
                if i < 5:
                    results['lines'].append({
                        'original': line[:200] + ('...' if len(line) > 200 else ''),
                        'decoded': decoded[:200] + ('...' if len(decoded) > 200 else ''),
                        'tokens': encoding.tokens[:10],
                        'is_perfect': is_perfect,
                        'diff_chars': diff_chars,
                        'similarity': 1 - (diff_chars / max(len(line), 1))
                    })

                if (i + 1) % 100 == 0:
                    print(f"Processed {i + 1} lines...")

    except Exception as e:
        print(f"Error processing file: {e}")

    return results


def print_summary(results: Dict[str, Any], output_path: Path) -> None:
    """Print the test summary and save it as a .txt file named after this script."""
    if not results['lines_processed']:
        print("No lines were processed.")
        return

    avg_tokens_per_line = results['total_tokens'] / results['lines_processed']
    total_chars = results['total_chars']
    total_diff_chars = results['total_diff_chars']
    accuracy = 1 - (total_diff_chars / total_chars) if total_chars > 0 else 0
    diff_percentage = (total_diff_chars / total_chars * 100) if total_chars > 0 else 0
    chars_per_token = total_chars / results['total_tokens'] if results['total_tokens'] else 0

    script_name = Path(__file__).stem

    summary = [
        "=" * 80,
        "TOKENIZER TEST SUMMARY",
        "=" * 80,
        f"Test Script: {script_name}.py",
        f"Timestamp: {results.get('timestamp', 'N/A')}",
        f"Tokenizer: {results.get('tokenizer_path', 'N/A')}",
        f"Chunk file: {results.get('chunk_file', 'N/A')}",
        "-" * 80,
        f"Lines processed: {results['lines_processed']}",
        f"Perfect matches: {results['perfect_matches']} ({results['perfect_matches'] / results['lines_processed'] * 100:.1f}%)",
        f"Average tokens/line: {avg_tokens_per_line:.2f}",
        f"Total characters: {total_chars:,}",
        f"Total tokens: {results['total_tokens']:,}",
        f"Character accuracy: {accuracy * 100:.2f}%",
        f"Character diff: {total_diff_chars:,} chars ({diff_percentage:.4f}%)",
        f"Chars per token: {chars_per_token:.2f} (higher means better compression)",
        "\nSAMPLE LINES:",
        "-" * 40
    ]

    for i, line in enumerate(results.get('lines', [])[:3]):
        summary.extend([
            f"\nSAMPLE {i + 1}:",
            f"Original: {line.get('original', '')}",
            f"Decoded: {line.get('decoded', '')}",
            f"Tokens: {', '.join(line.get('tokens', [])[:8])}{'...' if len(line.get('tokens', [])) > 8 else ''}",
            f"Match: {'✓ Perfect' if line.get('is_perfect') else '✗ Different'}",
            "-" * 40
        ])

    print("\n".join(summary))

    # Save the same summary alongside the other test results.
    timestamp = results.get('timestamp', '')
    output_file = output_path / f'{script_name}_result_{timestamp}.txt'

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("\n".join(summary))

    print(f"\nResults saved to: {output_file}")


def main():
    """Run the tokenizer round-trip test end to end."""
    tokenizer_path, data_dir, output_dir = setup_paths()

    chunk_path = get_first_chunk_file(data_dir)
    if not chunk_path:
        print(f"No files found in {data_dir}. Please ensure the Dataset directory contains text files.")
        return

    print(f"Found data directory: {data_dir}")
    print(f"Output directory: {output_dir}")
    print(f"Testing tokenizer on first 1000 lines of: {chunk_path.name}")

    print(f"Loading tokenizer from: {tokenizer_path}")
    tokenizer = Tokenizer.from_file(str(tokenizer_path))

    vocab = tokenizer.get_vocab()
    print(f"Vocabulary size: {len(vocab):,} tokens")

    print("\nTesting tokenizer on chunk...")
    results = test_tokenizer_on_chunk(tokenizer, chunk_path, max_lines=1000)

    # Attach metadata used by the summary report.
    results['timestamp'] = time.strftime("%Y%m%d_%H%M%S")
    results['tokenizer_path'] = str(tokenizer_path)
    results['chunk_file'] = str(chunk_path.name)

    print_summary(results, output_dir)
    print("\nTest complete!")


if __name__ == "__main__":
    main()