"""
Text Analyzer Component

This module provides specialized text analysis capabilities for the GAIA agent,
including reversed text detection and word unscrambling without hardcoded responses.
"""
|
|
|
import logging
import re
from typing import Dict, Any, List, Optional, Union
|
|
|
logger = logging.getLogger("gaia_agent.components.text_analyzer")


class TextAnalyzer:
    """
    Handles specialized text manipulation tasks like reversed text and word unscrambling.

    The analyzer is purely heuristic: it relies on small lookup tables
    (``common_words``, ``opposites``, ``unscramble_map``) plus a few
    pattern checks, and performs no I/O besides logging.

    Attributes:
        common_words: Lower-case frequent English words used to score whether
            a text reads better forwards or backwards.
        opposites: Symmetric word -> antonym lookup table.
        unscramble_map: Upper-case scrambled spelling -> upper-case solution.
    """

    # Reversed spellings of frequent words ("the", "and", "this", "for", ...).
    _REVERSED_FUNCTION_WORDS = frozenset({
        "eht", "dna", "siht", "rof", "era", "evah", "tub", "ton", "htiw", "eno",
    })

    # Character pairs typical of forward / reversed English.  "er" and "re"
    # sit in both tables, so they raise both counters and cancel out.
    _FORWARD_BIGRAMS = frozenset({"th", "er", "on", "an", "he", "in", "re", "ed"})
    _REVERSED_BIGRAMS = frozenset({"ht", "re", "no", "na", "eh", "ni", "er", "de"})

    # Reversed spellings of words that commonly appear in reversed-text puzzles.
    _REVERSED_INDICATORS = (
        "txet", "esrever", "drawkcab", "etirw", "daer", "rewsna", "noitseuq", "egassem",
    )

    # Reversed common suffixes.  They are compared against the first THREE
    # characters of a word, so only the three-letter entries can ever match.
    # NOTE(review): widening this to a prefix test would make ordinary words
    # such as "return" or "define" count as reversed, so it is left as is.
    _REVERSED_ENDINGS = frozenset({"gni", "de", "yl", "se", "re", "tnem", "la", "eci", "evi"})

    # Letter pairs that rarely occur inside forward English words.
    _UNUSUAL_SEQUENCES = ("zx", "qp", "jk", "vf", "wx")

    # Phrasings (applied to lower-cased text) that ask for an antonym;
    # group 1 captures the word whose opposite is requested.
    _OPPOSITE_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r'(?:find|write|what is|give me) (?:the)?\s*opposite (?:of|to) (?:the )?(?:word )?"?(\w+)"?',
            r'opposite (?:of|to) (?:the )?(?:word )?"?(\w+)"? (?:is|would be)',
            r'"?(\w+)"?(?:\s*\w+){0,3} opposite',
        )
    )
    _OPPOSITE_TARGET = re.compile(r'opposite (?:of|to) (?:the )?(?:word )?"?(\w+)"?')

    # Runs of four or more capital letters are treated as scrambled words.
    _UPPERCASE_WORD = re.compile(r'\b[A-Z]{4,}\b')

    _DIRECTIONAL_PAIRS = {
        "north": "south", "south": "north",
        "east": "west", "west": "east",
        "top": "bottom", "bottom": "top",
        "above": "below", "below": "above",
        "over": "under", "under": "over",
        "inside": "outside", "outside": "inside",
    }

    # Prefixes whose removal is taken to yield the opposite ("unhappy" -> "happy").
    _NEGATION_PREFIXES = ("un", "in", "non", "dis")

    # Explicit antonyms for words that take a negating prefix.  Spelled out so
    # that each word gets its correct form ("incorrect", not "uncorrect").
    _NEGATED_FORMS = {
        "happy": "unhappy",
        "clear": "unclear",
        "visible": "invisible",
        "correct": "incorrect",
        "complete": "incomplete",
        "active": "inactive",
        "capable": "incapable",
        "accurate": "inaccurate",
        "adequate": "inadequate",
        "continue": "discontinue",
        "connect": "disconnect",
        "agree": "disagree",
        "stop": "start",
    }

    # Several near-synonyms sharing one antonym.
    _SYNONYM_OPPOSITES = {
        "good": "bad", "well": "bad",
        "bad": "good", "awful": "good", "poor": "good",
        "light": "dark", "bright": "dark",
        "dark": "light", "dim": "light",
        "hard": "easy", "difficult": "easy",
        "easy": "hard", "simple": "hard",
    }

    # Sorted-letter signature -> solution, used when a scrambled word is not an
    # exact key of ``unscramble_map``.  Keys MUST be the sorted letters of the
    # scrambled spelling (hence "bkoo" for "book").
    _ANAGRAM_INDEX = {
        "aelpp": "apple",
        "aaabnn": "banana",
        "aegnor": "orange",
        "acehlmoot": "chamelot",  # signature of "LOOTCAMEH" from unscramble_map
        "cemoprtu": "computer",
        "ehnop": "phone",
        "bkoo": "book",
    }

    def __init__(self):
        """Populate the lookup tables used by the analysis methods."""
        self.common_words = {
            "the", "is", "and", "of", "to", "in", "that", "it", "with", "for",
            "as", "on", "at", "this", "by", "from", "be", "have", "or", "you",
            "they", "would", "could", "should", "will", "what", "when", "where",
            "why", "how", "which", "who", "an", "my", "their", "your", "his", "her"
        }

        self.opposites = {
            "left": "right",
            "right": "left",
            "up": "down",
            "down": "up",
            "black": "white",
            "white": "black",
            "yes": "no",
            "no": "yes",
            "hot": "cold",
            "cold": "hot",
            "big": "small",
            "small": "big",
            "tall": "short",
            "short": "tall",
            "open": "closed",
            "closed": "open",
            "front": "back",
            "back": "front",
            "in": "out",
            "out": "in",
            "high": "low",
            "low": "high",
            "fast": "slow",
            "slow": "fast"
        }

        # NOTE(review): "LOOTCAMEH" is neither the reversal nor an anagram of
        # "CHAMELOT"; kept unchanged because it is runtime data - confirm the
        # intended word with the owner.
        self.unscramble_map = {
            "ELPPA": "APPLE",
            "ANANAB": "BANANA",
            "EGRANO": "ORANGE",
            "LOOTCAMEH": "CHAMELOT",
            "RETUPMOC": "COMPUTER",
            "ENOHP": "PHONE",
            "KOOB": "BOOK"
        }

        logger.info("TextAnalyzer initialized")

    def _count_common_words(self, lowered_text: str) -> int:
        """Count whitespace-separated tokens of *lowered_text* found in ``common_words``."""
        return sum(1 for token in lowered_text.split() if token in self.common_words)

    def is_reversed_text(self, text: str) -> bool:
        """
        Determine if text appears to be reversed using multiple detection methods.

        The checks run in order and the first one that fires wins:
        common-word count of the reversed text versus the original, reversed
        function words, character-pair statistics, reversed indicator words
        and reversed word endings.

        Args:
            text: Text to analyze

        Returns:
            bool: True if text appears to be reversed
        """
        if not text:
            return False

        lowered = text.lower()

        # 1. Does the text contain clearly more common words once reversed?
        forward_common_count = self._count_common_words(lowered)
        reversed_common_count = self._count_common_words(text[::-1].lower())
        if reversed_common_count > forward_common_count + 1:
            logger.info(
                "Text appears reversed based on word count: forward=%s, reversed=%s",
                forward_common_count, reversed_common_count,
            )
            return True

        # 2. Reversed function words.  Matched as whole tokens: a substring
        #    test fires on ordinary words ("professor" contains "rof",
        #    "Boston" contains "ton") and misclassifies forward text.
        tokens = set(re.findall(r"\w+", lowered))
        reversed_trigram_count = len(tokens & self._REVERSED_FUNCTION_WORDS)
        if reversed_trigram_count >= 2:
            logger.info(
                "Text appears reversed based on reversed trigrams: %s matches",
                reversed_trigram_count,
            )
            return True

        # 3. Compare typical forward and reversed character pairs.
        forward_transition_count = 0
        reversed_transition_count = 0
        for first, second in zip(text, text[1:]):
            bigram = (first + second).lower()
            if bigram in self._FORWARD_BIGRAMS:
                forward_transition_count += 1
            if bigram in self._REVERSED_BIGRAMS:
                reversed_transition_count += 1
        if reversed_transition_count > forward_transition_count + 2:
            logger.info(
                "Text appears reversed based on character transitions: forward=%s, reversed=%s",
                forward_transition_count, reversed_transition_count,
            )
            return True

        # 4. A single reversed indicator word anywhere in the text is enough.
        for indicator in self._REVERSED_INDICATORS:
            if indicator in lowered:
                logger.info("Reversed indicator word detected: %s", indicator)
                return True

        # 5. Words that start with a reversed suffix (see _REVERSED_ENDINGS).
        reversed_ending_count = sum(
            1
            for word in lowered.split()
            if len(word) > 3 and word[:3] in self._REVERSED_ENDINGS
        )
        if reversed_ending_count >= 2:
            logger.info(
                "Text appears reversed based on reversed word endings: %s matches",
                reversed_ending_count,
            )
            return True

        return False

    def handle_reversed_text(self, text: str) -> Dict[str, Any]:
        """
        Process reversed text to extract meaning and identify any tasks.

        When the whole text looks reversed it is flipped and searched for a
        task (for example a request for an antonym).  Otherwise the text is
        scanned for a single embedded reversed word.

        Args:
            text: Text to analyze

        Returns:
            dict: Information about the reversed text including:
                - original_text: The reversed text as provided
                - corrected_text: The text after reversing
                - task_type: The type of task identified (e.g., "find_opposite")
                - task_params: Parameters for the identified task
                - answer: Direct answer if determinable
                - confidence: Confidence level in the analysis
        """
        result = {
            "original_text": text,
            "corrected_text": None,
            "task_type": None,
            "task_params": {},
            "answer": None,
            "confidence": 0.0
        }

        if self.is_reversed_text(text):
            logger.info("Processing fully reversed text")
            self._analyze_fully_reversed(text, result)
        else:
            self._analyze_embedded_reversed_word(text, result)

        logger.info("Reversed text analysis result: %s", result)
        return result

    def _analyze_fully_reversed(self, text: str, result: Dict[str, Any]) -> None:
        """Flip *text* and record the task it describes in *result* (mutated in place)."""
        corrected_text = text[::-1]
        corrected = corrected_text.lower()
        result["corrected_text"] = corrected_text
        result["confidence"] = 0.9

        # An explicit request for an antonym takes priority over generic tasks.
        for pattern in self._OPPOSITE_PATTERNS:
            match = pattern.search(corrected)
            if match is None:
                continue
            word = match.group(1).lower()
            result["task_type"] = "find_opposite"
            result["task_params"]["word"] = word
            result["confidence"] = 0.95
            # _determine_opposite consults self.opposites first; may be None.
            result["answer"] = self._determine_opposite(word)
            return

        if any(cmd in corrected for cmd in ("translate", "decode", "read", "understand")):
            result["task_type"] = "decode_text"
            result["confidence"] = 0.9
        elif any(cmd in corrected for cmd in ("reverse", "backwards")):
            # Reversing the corrected text again yields the input itself.
            result["task_type"] = "reverse_text_again"
            result["answer"] = text
            result["confidence"] = 0.9
        else:
            result["task_type"] = "reverse_text"
            result["confidence"] = 0.8

    def _analyze_embedded_reversed_word(self, text: str, result: Dict[str, Any]) -> None:
        """Look for one reversed word inside forward *text*; mutate *result* in place."""
        candidates = []
        for token in re.findall(r'\b\w+\b', text):
            if len(token) < 4:
                continue

            flipped = token[::-1]
            flipped_lower = flipped.lower()
            if flipped_lower in self.common_words or flipped_lower in self.opposites:
                # The reversal is a word this analyzer knows: strong evidence.
                candidates.append((token, flipped, 0.9))
            elif token.isupper():
                candidates.append((token, flipped, 0.8))
            elif any(seq in token.lower() for seq in self._UNUSUAL_SEQUENCES):
                candidates.append((token, flipped, 0.6))

        if not candidates:
            return

        # max() keeps the earliest candidate among equal confidences.
        reversed_word, corrected_word, confidence = max(candidates, key=lambda c: c[2])
        result["task_type"] = "reversed_word"
        result["task_params"]["reversed_word"] = reversed_word
        result["task_params"]["corrected_word"] = corrected_word
        # Replace whole words only so other words containing the same letters
        # are left untouched; the callable avoids backslash interpretation.
        result["corrected_text"] = re.sub(
            r"\b" + re.escape(reversed_word) + r"\b",
            lambda _match: corrected_word,
            text,
        )
        result["confidence"] = confidence

        lowered = text.lower()
        if "opposite" not in lowered:
            return

        target_match = self._OPPOSITE_TARGET.search(lowered)
        target_word = target_match.group(1) if target_match else corrected_word.lower()
        if target_word == reversed_word.lower():
            # The request names the reversed spelling; use the corrected word.
            target_word = corrected_word.lower()

        opposite = self._determine_opposite(target_word)
        if opposite:
            result["task_type"] = "find_opposite"
            result["task_params"]["word"] = target_word
            result["answer"] = opposite
            # Table hits are trusted more than heuristic guesses.
            result["confidence"] = 0.95 if target_word in self.opposites else 0.8

    def _determine_opposite(self, word: str) -> Optional[str]:
        """
        Determine the opposite of a word using lookup tables and simple heuristics.

        Tried in order: the ``opposites`` table, directional pairs, removal of
        a negating prefix, explicit negated forms, the reversed spelling of a
        known word, and finally synonym groups.

        Args:
            word: Lower-case word to find the opposite for

        Returns:
            Opposite word if determinable, None otherwise
        """
        if not word:
            return None

        if word in self.opposites:
            return self.opposites[word]

        if word in self._DIRECTIONAL_PAIRS:
            return self._DIRECTIONAL_PAIRS[word]

        # "unhappy" -> "happy".  A stem shorter than two letters is rejected so
        # that "dis" or "none" do not yield "" or "e".
        for prefix in self._NEGATION_PREFIXES:
            if word.startswith(prefix):
                stem = word[len(prefix):]
                if len(stem) >= 2:
                    return stem
                break

        if word in self._NEGATED_FORMS:
            return self._NEGATED_FORMS[word]

        # The word may itself be reversed ("tfel" -> "left" -> "right").  Known
        # forward words are skipped so that "on" is not read as "no".
        unreversed = word[::-1]
        if word not in self.common_words and unreversed in self.opposites:
            return self.opposites[unreversed]

        return self._SYNONYM_OPPOSITES.get(word)

    def process_word_unscrambling(self, text: str) -> Dict[str, Any]:
        """
        Process text containing scrambled words.

        Every run of four or more capital letters is treated as a scrambled
        word.  Exact matches in ``unscramble_map`` score 0.95, anagram matches
        0.8, and unresolved words are reported as ``"UNKNOWN-<word>"`` with 0.1.

        Args:
            text: Text to analyze

        Returns:
            dict: Information about the scrambled text including:
                - original_text: The scrambled text as provided
                - task_type: The type of task identified (always "unscramble")
                - scrambled_words: List of identified scrambled words
                - unscrambled_words: List of possible unscrambled words
                - confidence: Confidence level for each unscrambling
        """
        result = {
            "original_text": text,
            "task_type": "unscramble",
            "scrambled_words": [],
            "unscrambled_words": [],
            "confidence": []
        }

        scrambled_words = self._UPPERCASE_WORD.findall(text)
        if scrambled_words:
            logger.info("Found potential scrambled words: %s", scrambled_words)
            result["scrambled_words"] = scrambled_words

            for word in scrambled_words:
                if word in self.unscramble_map:
                    unscrambled = self.unscramble_map[word]
                    confidence = 0.95
                else:
                    # Fall back to an order-independent letter signature.
                    signature = "".join(sorted(word.lower()))
                    solution = self._ANAGRAM_INDEX.get(signature)
                    if solution is not None:
                        unscrambled = solution.upper()
                        confidence = 0.8
                    else:
                        unscrambled = f"UNKNOWN-{word}"
                        confidence = 0.1

                result["unscrambled_words"].append(unscrambled)
                result["confidence"].append(confidence)

        logger.info("Word unscrambling result: %s", result)
        return result

    def _has_reversed_known_word(self, lowered_text: str) -> bool:
        """Return True if a token of four or more letters reverses to a key of ``opposites``."""
        return any(
            len(token) >= 4 and token[::-1] in self.opposites
            for token in re.findall(r"\w+", lowered_text)
        )

    def process_text_question(self, question: str) -> Dict[str, Any]:
        """
        Process a text-based question to determine if it requires specialized handling.

        Reversed-text handling is tried first; otherwise a question containing
        an all-capitals word of four or more letters is treated as an
        unscrambling task.

        Args:
            question: The question to analyze

        Returns:
            dict: Analysis result with detected task type and answer if available
        """
        result = {
            "question": question,
            "task_type": None,
            "requires_specialized_handling": False,
            "analysis": {},
            "answer": None
        }

        lowered = question.lower()
        looks_reversed = (
            self.is_reversed_text(question)
            or "tfel" in lowered
            or self._has_reversed_known_word(lowered)
        )

        if looks_reversed:
            logger.info("Question appears to contain reversed text")
            result["task_type"] = "reversed_text"
            result["requires_specialized_handling"] = True

            text_analysis = self.handle_reversed_text(question)
            result["analysis"] = text_analysis

            if text_analysis.get("answer"):
                result["answer"] = text_analysis["answer"]
            elif text_analysis.get("corrected_text"):
                result["answer"] = f"The reversed text translates to: '{text_analysis['corrected_text']}'"

        elif self._UPPERCASE_WORD.search(question):
            logger.info("Question appears to contain scrambled words")
            result["task_type"] = "unscramble_word"
            result["requires_specialized_handling"] = True

            unscramble_analysis = self.process_word_unscrambling(question)
            result["analysis"] = unscramble_analysis

            # Unresolved entries look like "UNKNOWN-<word>"; answer with the
            # first word that was actually resolved, if any.
            resolved = next(
                (
                    candidate
                    for candidate in unscramble_analysis["unscrambled_words"]
                    if not candidate.startswith("UNKNOWN-")
                ),
                None,
            )
            if resolved is not None:
                result["answer"] = f"The unscrambled word is '{resolved}'."

        logger.info("Text question processing result: %s", result)
        return result