Spaces:
Sleeping
Sleeping
| """ | |
| Linguistic Analysis Tool for GAIA Agent - Phase 6 | |
| Advanced text pattern recognition, semantic understanding, and linguistic analysis | |
| """ | |
| import re | |
| import logging | |
| from typing import Dict, Any, List, Optional, Tuple, Set | |
| from collections import Counter | |
| import string | |
| # Natural language processing | |
| try: | |
| from textblob import TextBlob | |
| TEXTBLOB_AVAILABLE = True | |
| except ImportError: | |
| TEXTBLOB_AVAILABLE = False | |
| # Advanced regex patterns | |
| try: | |
| import regex | |
| REGEX_AVAILABLE = True | |
| except ImportError: | |
| import re as regex | |
| REGEX_AVAILABLE = False | |
| logger = logging.getLogger(__name__) | |
class LinguisticAnalyzer:
    """
    Rule-based linguistic analysis tool for text pattern recognition.

    Everything in this class is implemented with regular expressions and
    small hand-written word lists; no external NLP model is consulted.

    Features:
    - Regex pattern extraction (emails, URLs, phone numbers, dates, ...)
    - Structural metrics (character/word/sentence counts and averages)
    - Function-word counts for English and character-reversed English
    - Word categorisation against fixed semantic word lists
    - Detection of character-reversed or word-order-reversed text
    - Answering "opposite of <word>" questions from a fixed antonym table
    """

    # Words treated as evidence of English by _calculate_english_score.
    _COMMON_ENGLISH_WORDS = frozenset({
        'the', 'and', 'or', 'if', 'you', 'understand', 'this', 'sentence',
        'write', 'opposite', 'of', 'word', 'as', 'answer', 'is', 'are',
        'was', 'were', 'have', 'has', 'had', 'do', 'does', 'did',
    })

    def __init__(self):
        """Initialize the pattern tables and word lists used by the analyzer."""
        self.name = "linguistic_analyzer"
        self.description = "Advanced linguistic analysis for pattern recognition and semantic understanding"

        # The analyzer has no optional runtime dependencies, so it is always usable.
        self.available = True

        # Common text patterns (applied case-insensitively by extract_patterns).
        self.patterns = {
            # The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted '|'.
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'url': r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            'phone': r'(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})',
            'date': r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
            'time': r'\b\d{1,2}:\d{2}(?::\d{2})?(?:\s?[AaPp][Mm])?\b',
            'number': r'-?\d+(?:\.\d+)?',
            'currency': r'\$\d+(?:\.\d{2})?|\d+(?:\.\d{2})?\s?(?:USD|EUR|GBP|JPY)',
            'percentage': r'\d+(?:\.\d+)?%',
            'hashtag': r'#\w+',
            'mention': r'@\w+',
            'word': r'\b\w+\b',
            # NOTE: matches runs of sentence terminators, not the sentences themselves.
            'sentence': r'[.!?]+',
            'question': r'\?',
            'exclamation': r'!',
        }

        # Language-specific patterns (applied to lower-cased text).
        self.language_patterns = {
            'english': {
                'articles': r'\b(the|a|an)\b',
                'pronouns': r'\b(i|you|he|she|it|we|they|me|him|her|us|them)\b',
                'prepositions': r'\b(in|on|at|by|for|with|to|from|of|about)\b',
                'conjunctions': r'\b(and|or|but|so|yet|for|nor)\b',
                'common_words': r'\b(is|are|was|were|have|has|had|do|does|did|will|would|could|should)\b'
            },
            'reversed_english': {
                'reversed_articles': r'\b(eht|a|na)\b',
                'reversed_common': r'\b(si|era|saw|erew|evah|sah|dah|od|seod|did|lliw|dluow|dluoc|dluohs)\b'
            }
        }

        # Semantic categories
        self.semantic_categories = {
            'direction': ['left', 'right', 'up', 'down', 'north', 'south', 'east', 'west'],
            'color': ['red', 'blue', 'green', 'yellow', 'black', 'white', 'purple', 'orange'],
            'size': ['big', 'small', 'large', 'tiny', 'huge', 'massive', 'little', 'giant'],
            'emotion': ['happy', 'sad', 'angry', 'excited', 'calm', 'nervous', 'joyful', 'depressed'],
            'time': ['morning', 'afternoon', 'evening', 'night', 'today', 'tomorrow', 'yesterday'],
            'number': ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']
        }

        # Opposite word pairs ('huge' added so the tiny/huge pair works both ways).
        self.opposites = {
            'left': 'right', 'right': 'left',
            'up': 'down', 'down': 'up',
            'big': 'small', 'small': 'big',
            'large': 'small', 'tiny': 'huge', 'huge': 'tiny',
            'hot': 'cold', 'cold': 'hot',
            'fast': 'slow', 'slow': 'fast',
            'good': 'bad', 'bad': 'good',
            'yes': 'no', 'no': 'yes',
            'true': 'false', 'false': 'true',
            'on': 'off', 'off': 'on',
            'in': 'out', 'out': 'in',
            'open': 'closed', 'closed': 'open',
            'start': 'end', 'end': 'start',
            'first': 'last', 'last': 'first'
        }

        logger.info("✅ Linguistic Analyzer initialized")

    def extract_patterns(self, text: str, pattern_types: Optional[List[str]] = None) -> Dict[str, List[str]]:
        """
        Extract various patterns from text.

        Args:
            text: Input text to analyze
            pattern_types: Keys of ``self.patterns`` to extract (default: all).
                Unknown keys are ignored.

        Returns:
            Dictionary mapping each requested pattern type to the list of
            full matched substrings, in order of appearance. Empty dict for
            empty input.
        """
        if not text:
            return {}

        if pattern_types is None:
            pattern_types = list(self.patterns)

        results = {}
        for pattern_type in pattern_types:
            pattern = self.patterns.get(pattern_type)
            if pattern is None:
                continue
            # finditer + group(0) always yields the whole match; re.findall
            # would return tuples of groups for patterns such as 'phone'.
            results[pattern_type] = [
                match.group(0)
                for match in re.finditer(pattern, text, re.IGNORECASE)
            ]
        return results

    def analyze_text_structure(self, text: str) -> Dict[str, Any]:
        """
        Analyze the structural properties of text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with counts (characters, words, sentence terminators,
            paragraphs, lines, punctuation, upper/lower-case letters, digits)
            and average word / sentence lengths. Empty dict for empty input.
        """
        if not text:
            return {}

        words = text.split()
        sentences = [part.strip() for part in re.split(r'[.!?]+', text) if part.strip()]

        analysis = {
            'character_count': len(text),
            'word_count': len(words),
            # Counts runs of '.', '!' or '?', so unterminated trailing text
            # is not counted and "3.14" contributes one.
            'sentence_count': len(re.findall(r'[.!?]+', text)),
            'paragraph_count': len([p for p in text.split('\n\n') if p.strip()]),
            'line_count': len(text.split('\n')),
            'average_word_length': 0,
            'average_sentence_length': 0,
            'punctuation_count': 0,
            'uppercase_count': 0,
            'lowercase_count': 0,
            'digit_count': 0
        }

        if words:
            analysis['average_word_length'] = sum(len(word) for word in words) / len(words)
        if sentences:
            # Average number of whitespace-separated words per sentence.
            analysis['average_sentence_length'] = sum(len(s.split()) for s in sentences) / len(sentences)

        punctuation = set(string.punctuation)
        for char in text:
            if char in punctuation:
                analysis['punctuation_count'] += 1
            elif char.isupper():
                analysis['uppercase_count'] += 1
            elif char.islower():
                analysis['lowercase_count'] += 1
            elif char.isdigit():
                analysis['digit_count'] += 1

        return analysis

    def detect_language_features(self, text: str) -> Dict[str, Any]:
        """
        Detect language-specific features in text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary keyed by language name, then by feature type, holding
            the match ``count`` and the first ten ``matches``. Empty dict for
            empty input.
        """
        if not text:
            return {}

        text_lower = text.lower()
        features = {}
        for language, patterns in self.language_patterns.items():
            lang_features = {}
            for feature_type, pattern in patterns.items():
                matches = re.findall(pattern, text_lower)
                lang_features[feature_type] = {
                    'count': len(matches),
                    'matches': matches[:10]  # Limit to first 10 matches
                }
            features[language] = lang_features
        return features

    def analyze_semantic_content(self, text: str) -> Dict[str, Any]:
        """
        Analyze semantic content and categorize words.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with total/unique word counts, the 20 most frequent
            words, words found per semantic category, and every word whose
            listed opposite also occurs in the text. Empty dict for empty
            input.
        """
        if not text:
            return {}

        words = re.findall(r'\b\w+\b', text.lower())
        word_set = set(words)
        # Unique words in first-occurrence order, so results are deterministic.
        ordered_unique = list(dict.fromkeys(words))

        semantic_analysis = {
            'total_words': len(words),
            'unique_words': len(word_set),
            'word_frequency': dict(Counter(words).most_common(20)),
            'semantic_categories': {},
            'detected_opposites': []
        }

        # Categorize words by semantic meaning
        for category, category_words in self.semantic_categories.items():
            found_words = [word for word in words if word in category_words]
            if found_words:
                semantic_analysis['semantic_categories'][category] = {
                    'count': len(found_words),
                    'words': list(dict.fromkeys(found_words))
                }

        # Find opposite word pairs; a pair present in both directions in the
        # table is reported once per direction.
        for word in ordered_unique:
            opposite = self.opposites.get(word)
            if opposite is not None and opposite in word_set:
                semantic_analysis['detected_opposites'].append({
                    'word': word,
                    'opposite': opposite,
                    'both_present': True
                })

        return semantic_analysis

    def find_text_transformations(self, text: str) -> Dict[str, Any]:
        """
        Identify possible text transformations (character or word reversal).

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with the original text, its character-reversed,
            word-order-reversed and case-swapped forms, and a
            ``transformations_detected`` list. Each detected entry has a
            ``type``, the ``transformed_text`` and a ``confidence`` that is
            the ratio of the candidate's English score to the original's
            (plus one) and is therefore not bounded by 1. Empty dict for
            empty input.
        """
        if not text:
            return {}

        reversed_text = text[::-1]
        word_reversed = ' '.join(reversed(text.split()))
        detected = []
        transformations = {
            'original': text,
            'reversed': reversed_text,
            'word_reversed': word_reversed,
            'case_swapped': text.swapcase(),
            'transformations_detected': detected
        }

        original_score = self._calculate_english_score(text)
        # The score can be -1.0; clamping keeps the threshold meaningful and
        # avoids dividing by (original_score + 1) == 0.
        baseline = max(original_score, 0.0)

        # (type, candidate text, required improvement factor over the original)
        candidates = (
            ('character_reversal', reversed_text, 1.5),
            ('word_order_reversal', word_reversed, 1.2),
        )
        for kind, candidate, factor in candidates:
            candidate_score = self._calculate_english_score(candidate)
            # Requiring a strictly positive, strictly better score also
            # guarantees that re-analysing the transformed text terminates.
            if candidate_score > 0 and candidate_score > baseline * factor:
                detected.append({
                    'type': kind,
                    'confidence': candidate_score / (baseline + 1),
                    'transformed_text': candidate
                })

        return transformations

    def _calculate_english_score(self, text: str) -> float:
        """
        Calculate how English-like a text appears.

        One point per distinct common English word present as a whole word,
        two points each for an article-noun and a noun-copula-word pattern,
        minus one point if any character outside a basic punctuation set
        occurs.
        """
        if not text:
            return 0.0

        text_lower = text.lower()

        # Whole-word matching: a substring test would count "or" in "word".
        tokens = set(re.findall(r'\b\w+\b', text_lower))
        score = float(len(tokens & self._COMMON_ENGLISH_WORDS))

        # Check for English-like patterns
        if re.search(r'\b(the|a|an)\s+\w+', text_lower):
            score += 2.0
        if re.search(r'\w+\s+(is|are|was|were)\s+\w+', text_lower):
            score += 2.0

        # Penalize non-English character patterns
        if re.search(r'[^\w\s\.,!?;:\'"()-]', text):
            score -= 1.0

        return score

    def extract_answer_from_question(self, question: str) -> Dict[str, Any]:
        """
        Extract answer from a question using linguistic analysis.

        The question is first checked for an "opposite of <word>" request.
        If a reversal transformation is detected with confidence above 0.7,
        the transformed text is analysed recursively and, when it yields an
        answer, that answer replaces the direct one.

        Args:
            question: Question text to analyze

        Returns:
            Dictionary with ``question``, ``answer`` (empty string if none),
            ``confidence``, ``method`` and the intermediate ``analysis``.
        """
        result = {
            'question': question,
            'answer': '',
            'confidence': 0.0,
            'method': 'linguistic_analysis',
            'analysis': {}
        }
        if not question:
            return result

        # Analyze transformations
        transformations = self.find_text_transformations(question)
        result['analysis']['transformations'] = transformations

        # Look for opposite-word questions
        if 'opposite' in question.lower():
            opposite_analysis = self._analyze_opposite_question(question)
            result['analysis']['opposite_analysis'] = opposite_analysis
            if opposite_analysis['answer']:
                result['answer'] = opposite_analysis['answer']
                result['confidence'] = opposite_analysis['confidence']
                result['method'] = 'opposite_detection'

        # Check for reversed text patterns
        detected = transformations['transformations_detected']
        if detected:
            best_transformation = max(detected, key=lambda item: item['confidence'])
            if best_transformation['confidence'] > 0.7:
                # Re-analyze the transformed text
                transformed_result = self.extract_answer_from_question(
                    best_transformation['transformed_text']
                )
                if transformed_result['answer']:
                    result['answer'] = transformed_result['answer']
                    result['confidence'] = best_transformation['confidence']
                    result['method'] = f"transformation_{best_transformation['type']}"

        return result

    def _analyze_opposite_question(self, question: str) -> Dict[str, Any]:
        """
        Analyze questions asking for opposite words.

        Candidate target words are tried in this order: the word following
        "opposite of [the] [word]", any quoted single words, then every word
        of the question in order. The first candidate present in
        ``self.opposites`` wins.

        Returns:
            Dictionary with ``answer``, ``confidence``, ``target_word`` and
            ``opposite_found``.
        """
        result = {
            'answer': '',
            'confidence': 0.0,
            'target_word': '',
            'opposite_found': False
        }

        question_lower = question.lower()
        candidates = []

        # Most specific cue first, so function words that happen to have an
        # opposite ("in", "on", "no") do not hijack the answer.
        explicit = re.search(
            r'\bopposite\s+of\s+(?:the\s+)?(?:word\s+)?["\'\u201c\u2018]?(\w+)',
            question_lower
        )
        if explicit:
            candidates.append(explicit.group(1))
        candidates.extend(
            re.findall(r'["\'\u201c\u2018](\w+)["\'\u201d\u2019]', question_lower)
        )
        # Fallback: any word of the question, in order of appearance.
        candidates.extend(re.findall(r'\b\w+\b', question_lower))

        for word in candidates:
            if word in self.opposites:
                result['target_word'] = word
                result['answer'] = self.opposites[word]
                result['opposite_found'] = True
                result['confidence'] = 0.9
                break

        return result

    def process_complex_text_query(self, query: str, context: str = '') -> Dict[str, Any]:
        """
        Process complex text queries with comprehensive analysis.

        Args:
            query: Text query to process
            context: Additional context; echoed back in the result but not
                otherwise used by the analysis

        Returns:
            Dictionary with the structural, semantic, pattern and
            transformation analyses, the answer-extraction result, and the
            ``final_answer`` / ``confidence`` derived from it. If any step
            raises, the exception is logged and its message is stored under
            ``error``; analyses completed before the failure are kept.
        """
        result = {
            'query': query,
            'context': context,
            'structural_analysis': {},
            'semantic_analysis': {},
            'pattern_analysis': {},
            'transformation_analysis': {},
            'answer_extraction': {},
            'final_answer': '',
            'confidence': 0.0
        }
        if not query:
            return result

        try:
            # Perform comprehensive analysis
            result['structural_analysis'] = self.analyze_text_structure(query)
            result['semantic_analysis'] = self.analyze_semantic_content(query)
            result['pattern_analysis'] = self.extract_patterns(query)
            result['transformation_analysis'] = self.find_text_transformations(query)
            result['answer_extraction'] = self.extract_answer_from_question(query)

            # Determine final answer
            extraction = result['answer_extraction']
            if extraction['answer']:
                result['final_answer'] = extraction['answer']
                result['confidence'] = extraction['confidence']
        except Exception as e:
            # Best-effort API: report the failure instead of propagating it.
            logger.error(f"Complex text query processing failed: {e}")
            result['error'] = str(e)

        return result
def get_linguistic_analysis_tools() -> List[LinguisticAnalyzer]:
    """
    Build the list of linguistic analysis tools.

    Returns:
        A one-element list holding a fresh LinguisticAnalyzer when it reports
        itself as available; otherwise an empty list. Any exception raised
        while creating or inspecting the analyzer is logged and also results
        in an empty list.
    """
    try:
        tool = LinguisticAnalyzer()
        # Guard clause: bail out early when the analyzer is unusable.
        if not tool.available:
            logger.warning("⚠️ Linguistic analyzer not available")
            return []
        return [tool]
    except Exception as exc:
        logger.error(f"❌ Failed to create linguistic analyzer: {exc}")
        return []