# NOTE: hosting-page header residue ("Spaces: Running / Running") was captured here; it is not part of the test module.
"""
Comprehensive test suite for the Enhanced Response Processor.

Tests all extraction strategies, validation, and edge cases.
"""
import pytest | |
import logging | |
from typing import Dict, Any | |
from utils.response_processor import ( | |
EnhancedResponseProcessor, | |
ExtractionStrategy, | |
ConfidenceLevel, | |
QuestionType, | |
ExtractionResult, | |
ValidationResult, | |
create_enhanced_processor, | |
process_response_enhanced | |
) | |
# Configure the root logger at INFO level for the test run.
logging.basicConfig(level=logging.INFO)
class TestResponseProcessorInitialization:
    """Construction and configuration checks for the response processor."""

    def test_default_initialization(self):
        """A processor built without arguments has threshold 0.5 and no processed items."""
        default_proc = EnhancedResponseProcessor()

        assert default_proc.confidence_threshold == 0.5
        assert default_proc.extraction_stats["total_processed"] == 0

    def test_custom_threshold_initialization(self):
        """The ``confidence_threshold`` keyword is stored on the instance."""
        strict_proc = EnhancedResponseProcessor(confidence_threshold=0.8)

        assert strict_proc.confidence_threshold == 0.8

    def test_create_enhanced_processor_function(self):
        """The factory returns an ``EnhancedResponseProcessor`` with the given threshold."""
        built = create_enhanced_processor(0.7)

        assert isinstance(built, EnhancedResponseProcessor)
        assert built.confidence_threshold == 0.7
class TestQuestionClassification:
    """Test question type classification."""

    @staticmethod
    def _assert_all_classified(questions, expected_type):
        """Classify every question and fail listing all mismatches.

        Args:
            questions: Question strings to feed to ``_classify_question``.
            expected_type: The ``QuestionType`` each question must map to.

        Unlike a bare ``assert`` inside a loop, this reports every
        misclassified question together with the type it actually received,
        instead of stopping at the first failure without naming the input.
        """
        processor = EnhancedResponseProcessor()
        mismatches = {}
        for question in questions:
            actual_type = processor._classify_question(question)
            if actual_type != expected_type:
                mismatches[question] = actual_type
        assert not mismatches, (
            f"expected {expected_type!r} for every question, "
            f"misclassified: {mismatches!r}"
        )

    def test_mathematical_questions(self):
        """Test classification of mathematical questions."""
        self._assert_all_classified(
            [
                "What is 25 * 17?",
                "Calculate the sum of 100 + 200",
                "Compute 15 / 3",
                "What is 2 + 2 = ?",
            ],
            QuestionType.MATHEMATICAL,
        )

    def test_count_questions(self):
        """Test classification of count questions."""
        self._assert_all_classified(
            [
                "How many objects are in the image?",
                "Count the number of items",
                "What is the total number of elements?",
            ],
            QuestionType.COUNT,
        )

    def test_location_questions(self):
        """Test classification of location questions."""
        self._assert_all_classified(
            [
                "Where is Paris located?",
                "What city is mentioned in the text?",
                "Which country is this?",
            ],
            QuestionType.LOCATION,
        )

    def test_person_questions(self):
        """Test classification of person questions."""
        self._assert_all_classified(
            [
                "Who is the author of this book?",
                "What is the name of the person?",
                "Who wrote this article?",
            ],
            QuestionType.PERSON,
        )

    def test_yesno_questions(self):
        """Test classification of yes/no questions."""
        self._assert_all_classified(
            [
                "Is this correct?",
                "Are there any errors?",
                "Was this written in 2020?",
                "Can you see the image?",
            ],
            QuestionType.YES_NO,
        )
class TestFinalAnswerFormatExtraction:
    """Extraction of answers announced with a ``FINAL ANSWER:`` marker."""

    def test_basic_final_answer_extraction(self):
        """A plain ``FINAL ANSWER:`` line yields its value with confidence >= 0.9."""
        text = (
            "\n"
            "Let me analyze this step by step.\n"
            "First, I need to calculate 25 * 17.\n"
            "25 * 17 = 425\n"
            "FINAL ANSWER: 425\n"
        )

        outcome = EnhancedResponseProcessor().process_response(text, "What is 25 * 17?")

        assert outcome.answer == "425"
        assert outcome.strategy == ExtractionStrategy.FINAL_ANSWER_FORMAT
        assert outcome.confidence >= 0.9

    def test_final_answer_with_quotes(self):
        """Quotes around the marked answer are not part of the extracted answer."""
        text = (
            "\n"
            "The capital of France is well known.\n"
            'FINAL ANSWER: "Paris"\n'
        )

        outcome = EnhancedResponseProcessor().process_response(
            text, "What is the capital of France?"
        )

        assert outcome.answer == "Paris"
        assert outcome.confidence >= 0.9

    def test_final_answer_case_insensitive(self):
        """A lower-case ``final answer:`` marker is recognised (no question given)."""
        text = (
            "\n"
            "After careful analysis...\n"
            "final answer: London\n"
        )

        outcome = EnhancedResponseProcessor().process_response(text)

        assert outcome.answer == "London"
        assert outcome.strategy == ExtractionStrategy.FINAL_ANSWER_FORMAT

    def test_multiple_final_answers(self):
        """With two markers in one response, the later one is expected to win."""
        text = (
            "\n"
            "First attempt:\n"
            "FINAL ANSWER: wrong\n"
            "Let me recalculate...\n"
            "FINAL ANSWER: correct\n"
        )

        outcome = EnhancedResponseProcessor().process_response(text)

        # The last marker in the text is the one that should be reported.
        assert outcome.answer == "correct"
class TestConclusionSentenceExtraction:
    """Extraction of answers stated in a concluding sentence."""

    def test_therefore_pattern(self):
        """A 'Therefore, the answer is ...' sentence is extracted with confidence >= 0.7."""
        text = (
            "\n"
            "Looking at the calculation step by step:\n"
            "25 * 17 = 25 * (10 + 7) = 250 + 175 = 425\n"
            "Therefore, the answer is 425.\n"
        )

        outcome = EnhancedResponseProcessor().process_response(text, "What is 25 * 17?")

        assert outcome.answer == "425"
        assert outcome.strategy == ExtractionStrategy.CONCLUSION_SENTENCES
        assert outcome.confidence >= 0.7

    def test_answer_is_pattern(self):
        """A mid-sentence 'the answer is ...' phrase is extracted."""
        text = (
            "\n"
            "After analyzing the image, I can see several objects.\n"
            "Counting them carefully, the answer is 12.\n"
        )

        outcome = EnhancedResponseProcessor().process_response(
            text, "How many objects are in the image?"
        )

        assert outcome.answer == "12"
        assert outcome.strategy == ExtractionStrategy.CONCLUSION_SENTENCES

    def test_we_get_pattern(self):
        """A 'We get ...' sentence is extracted."""
        text = (
            "\n"
            "Performing the division: 100 ÷ 4\n"
            "We get 25.\n"
        )

        outcome = EnhancedResponseProcessor().process_response(
            text, "What is 100 divided by 4?"
        )

        assert outcome.answer == "25"
        assert outcome.strategy == ExtractionStrategy.CONCLUSION_SENTENCES
class TestSemanticPatternExtraction:
    """Extraction driven by semantic patterns for each kind of question."""

    def test_mathematical_semantic_extraction(self):
        """The value in 'x = 42' is extracted for a 'Solve for x' question."""
        text = (
            "\n"
            "Let me solve this equation.\n"
            "The calculation shows that x = 42.\n"
            "This is the solution to the problem.\n"
        )

        outcome = EnhancedResponseProcessor().process_response(text, "Solve for x")

        assert outcome.answer == "42"
        assert outcome.strategy == ExtractionStrategy.SEMANTIC_PATTERNS

    def test_count_semantic_extraction(self):
        """The number in 'The total count is 15 items.' is extracted."""
        text = (
            "\n"
            "Looking at the image, I can identify various objects.\n"
            "The total count is 15 items.\n"
            "Each item is clearly visible.\n"
        )

        outcome = EnhancedResponseProcessor().process_response(
            text, "How many items are there?"
        )

        assert outcome.answer == "15"
        assert outcome.strategy == ExtractionStrategy.SEMANTIC_PATTERNS

    def test_location_semantic_extraction(self):
        """The place named after 'is in' is extracted for a 'Where' question."""
        text = (
            "\n"
            "The document mentions several places.\n"
            "The main location is in New York.\n"
            "This is where the events took place.\n"
        )

        outcome = EnhancedResponseProcessor().process_response(
            text, "Where did this happen?"
        )

        assert outcome.answer == "New York"
        assert outcome.strategy == ExtractionStrategy.SEMANTIC_PATTERNS

    def test_person_semantic_extraction(self):
        """The name after 'written by' is extracted for a 'Who' question."""
        text = (
            "\n"
            "The book was written by John Smith.\n"
            "He is a well-known author in this field.\n"
        )

        outcome = EnhancedResponseProcessor().process_response(
            text, "Who wrote this book?"
        )

        assert outcome.answer == "John Smith"
        assert outcome.strategy == ExtractionStrategy.SEMANTIC_PATTERNS
class TestComplexResponseHandling:
    """Extraction from long, verbose responses."""

    def test_multi_paragraph_response(self):
        """A long explanation ending in ``FINAL ANSWER: 120`` yields "120"."""
        text = (
            "\n"
            "This is a complex mathematical problem that requires several steps to solve.\n"
            "First, let me break down the problem into smaller parts. We need to calculate\n"
            "the total area of the rectangle, which involves multiplying length by width.\n"
            "The length is given as 15 meters, and the width is 8 meters. When we multiply\n"
            "these values together, we get 15 × 8 = 120.\n"
            "Therefore, the total area is 120 square meters.\n"
            "FINAL ANSWER: 120\n"
        )

        outcome = EnhancedResponseProcessor().process_response(
            text, "What is the area of the rectangle?"
        )

        assert outcome.answer == "120"
        assert outcome.confidence >= 0.9

    def test_response_with_multiple_numbers(self):
        """Other numbers in the response do not displace the marked final answer."""
        text = (
            "\n"
            "Looking at the data, I see several values: 10, 25, 30, and 45.\n"
            "The calculation involves adding these: 10 + 25 + 30 + 45.\n"
            "The sum equals 110.\n"
            "FINAL ANSWER: 110\n"
        )

        outcome = EnhancedResponseProcessor().process_response(text, "What is the sum?")

        assert outcome.answer == "110"

    def test_response_with_embedded_answer(self):
        """A name embedded in running prose is extracted with confidence >= 0.7."""
        text = (
            "\n"
            "The author of this work is clearly identified in the introduction.\n"
            "Based on the biographical information provided, we can determine\n"
            "that the person who wrote this is Jane Doe, as mentioned in the\n"
            "acknowledgments section.\n"
        )

        outcome = EnhancedResponseProcessor().process_response(text, "Who is the author?")

        assert outcome.answer == "Jane Doe"
        assert outcome.confidence >= 0.7
class TestErrorResponseHandling:
    """Behaviour on empty, error-like, and ambiguous responses."""

    def test_empty_response(self):
        """An empty response yields "unknown" with confidence below 0.5."""
        outcome = EnhancedResponseProcessor().process_response("", "What is 2 + 2?")

        assert outcome.answer == "unknown"
        assert outcome.confidence < 0.5

    def test_error_message_response(self):
        """An apology/error message yields confidence below 0.5."""
        text = (
            "\n"
            "I'm sorry, but I cannot process this request due to an error.\n"
            "The system is unable to calculate the result.\n"
            "Please try again later.\n"
        )

        outcome = EnhancedResponseProcessor().process_response(text, "What is 2 + 2?")

        # Whatever is extracted, it must be reported with low confidence.
        assert outcome.confidence < 0.5

    def test_ambiguous_response(self):
        """A hedged, uncertain response yields confidence below 0.8."""
        text = (
            "\n"
            "This could be either A or B, depending on the context.\n"
            "It's difficult to determine without more information.\n"
            "The answer might be around 50, but I'm not certain.\n"
        )

        outcome = EnhancedResponseProcessor().process_response(text, "What is the value?")

        # Something may be extracted, but not with high confidence.
        assert outcome.confidence < 0.8
class TestAnswerValidation:
    """Checks of ``_validate_answer`` for several question types."""

    def test_mathematical_answer_validation(self):
        """A numeric answer validates without penalty; a non-numeric one is rejected."""
        proc = EnhancedResponseProcessor()

        # Numeric answer to an arithmetic question.
        numeric_check = proc._validate_answer(
            "42", "What is 6 * 7?", QuestionType.MATHEMATICAL
        )
        assert numeric_check.is_valid
        assert numeric_check.confidence_penalty == 0.0

        # Answer containing no digits at all.
        wordy_check = proc._validate_answer(
            "hello", "What is 6 * 7?", QuestionType.MATHEMATICAL
        )
        assert not wordy_check.is_valid
        assert wordy_check.confidence_penalty > 0.0

    def test_count_answer_validation(self):
        """A numeric count validates; a non-numeric count incurs a penalty."""
        proc = EnhancedResponseProcessor()

        numeric_check = proc._validate_answer("15", "How many items?", QuestionType.COUNT)
        assert numeric_check.is_valid

        vague_check = proc._validate_answer(
            "many items", "How many items?", QuestionType.COUNT
        )
        assert vague_check.confidence_penalty > 0.0

    def test_yesno_answer_validation(self):
        """yes/no/true/false validate; "maybe" incurs a penalty."""
        proc = EnhancedResponseProcessor()

        for candidate in ["yes", "no", "true", "false"]:
            accepted = proc._validate_answer(
                candidate, "Is this correct?", QuestionType.YES_NO
            )
            assert accepted.is_valid

        hedged = proc._validate_answer("maybe", "Is this correct?", QuestionType.YES_NO)
        assert hedged.confidence_penalty > 0.0
class TestAnswerCleaning:
    """Checks of ``_clean_answer`` normalisation."""

    def test_number_comma_removal(self):
        """Thousands separators are dropped from numeric answers."""
        proc = EnhancedResponseProcessor()

        assert proc._clean_answer("1,234", QuestionType.MATHEMATICAL) == "1234"
        assert proc._clean_answer("10,000", QuestionType.COUNT) == "10000"

    def test_quote_removal(self):
        """Surrounding double or single quotes are stripped."""
        proc = EnhancedResponseProcessor()

        assert proc._clean_answer('"Paris"', QuestionType.LOCATION) == "Paris"
        assert proc._clean_answer("'London'", QuestionType.LOCATION) == "London"

    def test_prefix_removal(self):
        """Lead-ins such as 'The answer is' and 'Result:' are removed."""
        proc = EnhancedResponseProcessor()

        assert proc._clean_answer("The answer is 42", QuestionType.MATHEMATICAL) == "42"
        assert proc._clean_answer("Result: 100", QuestionType.MATHEMATICAL) == "100"
class TestConfidenceScoring:
    """Confidence bands expected for different kinds of response."""

    def test_high_confidence_scenarios(self):
        """An explicit ``FINAL ANSWER:`` response scores at least 0.9."""
        outcome = EnhancedResponseProcessor().process_response(
            "FINAL ANSWER: 42", "What is the answer?"
        )

        assert outcome.confidence >= 0.9

    def test_medium_confidence_scenarios(self):
        """A 'Therefore, ...' conclusion scores in the range [0.7, 0.9)."""
        outcome = EnhancedResponseProcessor().process_response(
            "Therefore, the answer is 42.", "What is the answer?"
        )

        assert 0.7 <= outcome.confidence < 0.9

    def test_low_confidence_scenarios(self):
        """A hesitant response with no clear statement scores below 0.7."""
        outcome = EnhancedResponseProcessor().process_response(
            "This is a complex problem. Maybe 42? I'm not sure.", "What is the answer?"
        )

        assert outcome.confidence < 0.7
class TestStatisticsTracking:
    """Checks of the processor's statistics counters."""

    def test_statistics_initialization(self):
        """A fresh processor reports zero for every counter."""
        snapshot = EnhancedResponseProcessor().get_statistics()

        assert snapshot["total_processed"] == 0
        assert all(n == 0 for n in snapshot["strategy_usage"].values())
        assert all(n == 0 for n in snapshot["confidence_distribution"].values())

    def test_statistics_tracking(self):
        """Processing two responses is reflected in the counters."""
        proc = EnhancedResponseProcessor()
        proc.process_response("FINAL ANSWER: 42", "What is the answer?")
        proc.process_response("Therefore, the result is 100.", "What is the result?")

        snapshot = proc.get_statistics()

        assert snapshot["total_processed"] == 2
        assert snapshot["strategy_usage"]["final_answer_format"] >= 1

    def test_statistics_reset(self):
        """``reset_statistics`` returns the counters to zero after processing."""
        proc = EnhancedResponseProcessor()
        proc.process_response("FINAL ANSWER: 42", "What is the answer?")

        proc.reset_statistics()
        snapshot = proc.get_statistics()

        assert snapshot["total_processed"] == 0
        assert all(n == 0 for n in snapshot["strategy_usage"].values())
class TestBackwardCompatibility:
    """Test backward compatibility functions."""

    def test_process_response_enhanced_function(self):
        """The module-level helper returns the extracted answer directly."""
        response = "FINAL ANSWER: 42"
        question = "What is the answer?"

        answer = process_response_enhanced(response, question)

        assert answer == "42"

    def test_process_response_enhanced_with_threshold(self):
        """The helper accepts ``confidence_threshold`` and still returns a string."""
        response = "Maybe the answer is 42?"
        question = "What is the answer?"

        answer = process_response_enhanced(response, question, confidence_threshold=0.9)

        # The exact answer depends on the extraction result, so only the
        # contract is pinned: the call completes and yields a string.
        # Previously this test asserted nothing at all.
        assert isinstance(answer, str)
class TestRealWorldScenarios:
    """End-to-end checks on realistic, verbose responses."""

    def test_verbose_mathematical_response(self):
        """A worked area calculation ending in ``FINAL ANSWER: 96``."""
        text = (
            "\n"
            "To solve this problem, I need to carefully analyze the given information.\n"
            "The problem asks me to calculate the area of a rectangle with length 12 meters\n"
            "and width 8 meters. The formula for the area of a rectangle is length × width.\n"
            "Substituting the values:\n"
            "Area = 12 × 8 = 96\n"
            "Therefore, the area of the rectangle is 96 square meters.\n"
            "FINAL ANSWER: 96\n"
        )

        outcome = EnhancedResponseProcessor().process_response(
            text, "What is the area of the rectangle?"
        )

        assert outcome.answer == "96"
        assert outcome.question_type == QuestionType.MATHEMATICAL
        assert outcome.confidence >= 0.9

    def test_image_analysis_response(self):
        """An itemised object count ending in ``FINAL ANSWER: 10``."""
        text = (
            "\n"
            "Looking at the provided image, I can analyze the contents systematically.\n"
            "I can see several objects distributed across the image:\n"
            "- 3 red circles in the upper left\n"
            "- 2 blue squares in the center\n"
            "- 4 green triangles on the right side\n"
            "- 1 yellow star at the bottom\n"
            "Counting all visible objects, I find a total of 10 objects in the image.\n"
            "FINAL ANSWER: 10\n"
        )

        outcome = EnhancedResponseProcessor().process_response(
            text, "How many objects are in the image?"
        )

        assert outcome.answer == "10"
        assert outcome.question_type == QuestionType.COUNT
        assert outcome.confidence >= 0.9

    def test_author_identification_response(self):
        """An author attribution ending in ``FINAL ANSWER: Emily Johnson``."""
        text = (
            "\n"
            "After examining the document carefully, I can identify the author information.\n"
            "The title page clearly states the author's name, and this is confirmed by\n"
            "the copyright information on the reverse side. The biographical note at\n"
            "the end also provides additional context about the author's background.\n"
            "Based on all this evidence, the author of this work is Emily Johnson.\n"
            "FINAL ANSWER: Emily Johnson\n"
        )

        outcome = EnhancedResponseProcessor().process_response(
            text, "Who is the author of this document?"
        )

        assert outcome.answer == "Emily Johnson"
        assert outcome.question_type == QuestionType.PERSON
        assert outcome.confidence >= 0.9
if __name__ == "__main__":
    # Run this module's tests verbosely and propagate pytest's exit code, so
    # that executing the file as a script exits non-zero when tests fail.
    raise SystemExit(pytest.main([__file__, "-v"]))