""" Validate the quality of generated training examples """ import json import re from typing import List, Dict, Tuple def analyze_training_examples(filepath: str) -> Dict: """Analyze the quality and characteristics of training examples""" with open(filepath, 'r', encoding='utf-8') as f: examples = json.load(f) analysis = { 'total_examples': len(examples), 'provocative_titles': 0, 'cynical_phrases': 0, 'technical_content': 0, 'negative_analogies': 0, 'avg_article_length': 0, 'style_consistency': 0, 'sample_titles': [] } # Style indicators provocative_words = [ 'disaster', 'catastrophe', 'crash', 'burn', 'fail', 'collapse', 'meltdown', 'nightmare', 'fiasco', 'debacle', 'train wreck', 'explosion', 'implosion' ] cynical_phrases = [ 'of course', 'naturally', 'predictably', 'unsurprisingly', 'evidently', 'clearly', 'obviously', 'needless to say' ] negative_analogies = [ 'train wreck', 'collision', 'explosion', 'disaster', 'catastrophe', 'meltdown', 'implosion', 'crash', 'carnival barker', 'unicorn' ] technical_terms = [ '5G', 'RAN', 'AI', 'edge computing', 'automation', 'cloud', 'network', 'operator', 'vendor', 'infrastructure', 'deployment', 'integration' ] total_length = 0 style_score = 0 for example in examples: if 'messages' in example and len(example['messages']) >= 3: content = example['messages'][2]['content'] title_line = content.split('\n\n')[0] title = title_line[2:] if title_line.startswith('# ') else title_line # Collect sample titles if len(analysis['sample_titles']) < 10: analysis['sample_titles'].append(title) content_lower = content.lower() # Check for provocative titles if any(word in title.lower() for word in provocative_words): analysis['provocative_titles'] += 1 # Check for cynical phrases if any(phrase in content_lower for phrase in cynical_phrases): analysis['cynical_phrases'] += 1 # Check for technical content if any(term.lower() in content_lower for term in technical_terms): analysis['technical_content'] += 1 # Check for negative analogies if any(analogy in content_lower for analogy in negative_analogies): analysis['negative_analogies'] += 1 # Calculate article length article_length = len(content) total_length += article_length # Style consistency score (0-4 based on presence of key elements) style_elements = 0 if any(word in title.lower() for word in provocative_words): style_elements += 1 if any(phrase in content_lower for phrase in cynical_phrases): style_elements += 1 if any(analogy in content_lower for analogy in negative_analogies): style_elements += 1 if any(term.lower() in content_lower for term in technical_terms): style_elements += 1 style_score += style_elements # Calculate averages and percentages if examples: analysis['avg_article_length'] = total_length // len(examples) analysis['style_consistency'] = (style_score / (len(examples) * 4)) * 100 # Convert counts to percentages analysis['provocative_titles'] = (analysis['provocative_titles'] / len(examples)) * 100 analysis['cynical_phrases'] = (analysis['cynical_phrases'] / len(examples)) * 100 analysis['technical_content'] = (analysis['technical_content'] / len(examples)) * 100 analysis['negative_analogies'] = (analysis['negative_analogies'] / len(examples)) * 100 return analysis def print_analysis_report(analysis: Dict): """Print a detailed analysis report""" print("=" * 60) print("TRAINING EXAMPLES QUALITY ANALYSIS") print("=" * 60) print(f"Total Examples: {analysis['total_examples']}") print(f"Average Article Length: {analysis['avg_article_length']:,} characters") print() print("STYLE ANALYSIS:") print(f" Provocative Titles: {analysis['provocative_titles']:.1f}%") print(f" Cynical Phrases: {analysis['cynical_phrases']:.1f}%") print(f" Technical Content: {analysis['technical_content']:.1f}%") print(f" Negative Analogies: {analysis['negative_analogies']:.1f}%") print(f" Overall Style Consistency: {analysis['style_consistency']:.1f}%") print() print("SAMPLE TITLES:") for i, title in enumerate(analysis['sample_titles'], 1): print(f" {i:2d}. {title}") print() # Quality assessment quality_score = ( analysis['provocative_titles'] + analysis['cynical_phrases'] + analysis['technical_content'] + analysis['negative_analogies'] ) / 4 print("QUALITY ASSESSMENT:") if quality_score >= 80: print(" ✅ EXCELLENT - High-quality examples with strong style consistency") elif quality_score >= 60: print(" ✅ GOOD - Solid examples with good style elements") elif quality_score >= 40: print(" ⚠️ FAIR - Acceptable but could use improvement") else: print(" ❌ POOR - Needs significant improvement") print(f" Overall Quality Score: {quality_score:.1f}%") print() def compare_datasets(original_file: str, new_file: str): """Compare original and new datasets""" print("DATASET COMPARISON:") print("-" * 40) original_analysis = analyze_training_examples(original_file) new_analysis = analyze_training_examples(new_file) print(f"Original Dataset: {original_analysis['total_examples']} examples") print(f"Expanded Dataset: {new_analysis['total_examples']} examples") print(f"New Examples Added: {new_analysis['total_examples'] - original_analysis['total_examples']}") print() print("STYLE CONSISTENCY COMPARISON:") print(f" Original: {original_analysis['style_consistency']:.1f}%") print(f" Expanded: {new_analysis['style_consistency']:.1f}%") if new_analysis['style_consistency'] >= original_analysis['style_consistency']: print(" ✅ Style consistency maintained or improved") else: print(" ⚠️ Style consistency decreased") print() def main(): """Main validation function""" print("Validating training examples quality...\n") # Analyze the new examples print("ANALYZING NEW EXAMPLES:") new_analysis = analyze_training_examples('data/additional_training_examples.json') print_analysis_report(new_analysis) # Analyze the expanded dataset print("ANALYZING EXPANDED DATASET:") expanded_analysis = analyze_training_examples('data/expanded_train_dataset.json') print_analysis_report(expanded_analysis) # Compare with original try: compare_datasets('data/train_dataset.json', 'data/expanded_train_dataset.json') except FileNotFoundError: print("Original dataset not found for comparison.") print("=" * 60) print("VALIDATION COMPLETE") print("=" * 60) if __name__ == "__main__": main()