# File size: 14,387 Bytes
# 9a6a4dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
"""
Advanced Text Processor for GAIA Agent - Phase 6
Handles RTL text, multi-language analysis, and complex text transformations
"""

import re
import logging
from typing import Dict, Any, List, Optional, Tuple
from pathlib import Path

# Core text processing
import unicodedata
import string

# Language detection and translation
# Each third-party dependency below is optional. The import is attempted once
# at module load; on ImportError the matching *_AVAILABLE flag is set to False
# so callers can degrade gracefully instead of failing at import time.
try:
    from langdetect import detect, detect_langs
    from langdetect.lang_detect_exception import LangDetectException
    LANGDETECT_AVAILABLE = True
except ImportError:
    # Without langdetect the names detect / detect_langs / LangDetectException
    # are undefined, so code must check LANGDETECT_AVAILABLE before using them.
    LANGDETECT_AVAILABLE = False

try:
    from googletrans import Translator
    GOOGLETRANS_AVAILABLE = True
except ImportError:
    # Translator is undefined in this case; check GOOGLETRANS_AVAILABLE first.
    GOOGLETRANS_AVAILABLE = False

try:
    from textblob import TextBlob
    TEXTBLOB_AVAILABLE = True
except ImportError:
    # NOTE(review): TextBlob / TEXTBLOB_AVAILABLE are not referenced in the
    # visible part of this file - confirm whether another module relies on them.
    TEXTBLOB_AVAILABLE = False

# Module-level logger shared by everything defined in this file.
logger = logging.getLogger(__name__)


class AdvancedTextProcessor:
    """
    Advanced text processor for complex text analysis and transformation.
    
    Features:
    - RTL (Right-to-Left) text detection and processing
    - Multi-language text analysis and translation
    - Text orientation detection and correction
    - Advanced pattern recognition in text
    - Linguistic analysis and understanding
    - Text reversal and transformation capabilities
    """
    
    def __init__(self):
        """Set up tool metadata, the RTL lookup tables, and the optional translator."""
        self.name = "advanced_text_processor"
        self.description = "Advanced text processing for RTL text, multi-language analysis, and complex transformations"

        # Language codes treated as right-to-left when a detector reports them.
        self.rtl_languages = set(
            ('ar', 'he', 'fa', 'ur', 'yi', 'ji', 'iw', 'ku', 'ps', 'sd')
        )

        # Inclusive (first, last) code-point bounds of the Unicode blocks whose
        # letters are counted as right-to-left by detect_text_direction().
        self.rtl_unicode_ranges = [
            (0x0590, 0x05FF),  # Hebrew
            (0x0600, 0x06FF),  # Arabic
            (0x0700, 0x074F),  # Syriac
            (0x0750, 0x077F),  # Arabic Supplement
            (0x0780, 0x07BF),  # Thaana
            (0x07C0, 0x07FF),  # NKo
            (0x0800, 0x083F),  # Samaritan
            (0x0840, 0x085F),  # Mandaic
            (0x08A0, 0x08FF),  # Arabic Extended-A
            (0xFB1D, 0xFB4F),  # Hebrew Presentation Forms
            (0xFB50, 0xFDFF),  # Arabic Presentation Forms-A
            (0xFE70, 0xFEFF),  # Arabic Presentation Forms-B
        ]

        # The translator is optional: it stays None when googletrans is not
        # installed or when constructing its client fails.
        self.translator = None
        if GOOGLETRANS_AVAILABLE:
            try:
                self.translator = Translator()
                logger.info("✅ Google Translator initialized")
            except Exception as e:
                logger.warning(f"⚠️ Failed to initialize Google Translator: {e}")

        self.available = True
        logger.info("✅ Advanced Text Processor initialized")
    
    def detect_text_direction(self, text: str) -> str:
        """
        Detect if text is RTL (Right-to-Left) or LTR (Left-to-Right).
        
        Args:
            text: Input text to analyze
            
        Returns:
            'rtl' for right-to-left text, 'ltr' for left-to-right text
        """
        if not text:
            return 'ltr'
        
        rtl_chars = 0
        total_chars = 0
        
        for char in text:
            if char.isalpha():
                total_chars += 1
                char_code = ord(char)
                
                # Check if character is in RTL Unicode ranges
                for start, end in self.rtl_unicode_ranges:
                    if start <= char_code <= end:
                        rtl_chars += 1
                        break
        
        if total_chars == 0:
            return 'ltr'
        
        rtl_ratio = rtl_chars / total_chars
        return 'rtl' if rtl_ratio > 0.3 else 'ltr'
    
    def reverse_text(self, text: str) -> str:
        """
        Return the characters of ``text`` in reverse order.

        The reversal works on individual code points, so whitespace and
        punctuation are mirrored along with the letters.

        Args:
            text: Input text to reverse

        Returns:
            Reversed text
        """
        return ''.join(reversed(text))
    
    def reverse_words(self, text: str) -> str:
        """
        Return the whitespace-separated words of ``text`` in reverse order.

        Words are obtained with ``str.split()``, so any run of whitespace is a
        separator and the output always joins words with single spaces.

        Args:
            text: Input text to reverse word order

        Returns:
            Text with reversed word order
        """
        tokens = text.split()
        tokens.reverse()
        return ' '.join(tokens)
    
    def detect_language(self, text: str) -> Dict[str, Any]:
        """
        Detect the language of the input text.

        All fields are derived from a single ``detect_langs`` call. langdetect
        is non-deterministic unless seeded, so running ``detect`` and
        ``detect_langs`` separately could report a primary language that
        disagrees with the top-ranked alternative.

        Args:
            text: Input text for language detection

        Returns:
            Dictionary with:
            - 'language': code of the most probable language, or 'unknown'
            - 'confidence': probability of that language (0.0 if unknown)
            - 'is_rtl': True when the language is in ``self.rtl_languages``
            - 'alternatives': up to three ``{'language', 'confidence'}``
              entries, most probable first
            The defaults are returned unchanged when ``text`` is empty,
            langdetect is not installed, or detection fails.
        """
        result = {
            'language': 'unknown',
            'confidence': 0.0,
            'is_rtl': False,
            'alternatives': []
        }

        if not text or not LANGDETECT_AVAILABLE:
            return result

        try:
            lang_probs = detect_langs(text)
        except LangDetectException as e:
            # Raised e.g. when the text contains no usable features.
            logger.warning(f"Language detection failed: {e}")
            return result

        if not lang_probs:
            return result

        best = lang_probs[0]
        result['language'] = best.lang
        result['is_rtl'] = best.lang in self.rtl_languages
        result['confidence'] = best.prob
        result['alternatives'] = [
            {'language': lp.lang, 'confidence': lp.prob}
            for lp in lang_probs[:3]
        ]

        return result
    
    def translate_text(self, text: str, target_lang: str = 'en', source_lang: str = 'auto') -> Dict[str, Any]:
        """
        Translate text to target language.
        
        Args:
            text: Text to translate
            target_lang: Target language code (default: 'en')
            source_lang: Source language code (default: 'auto')
            
        Returns:
            Dictionary with translation results
        """
        result = {
            'translated_text': text,
            'source_language': 'unknown',
            'target_language': target_lang,
            'success': False
        }
        
        if not self.translator or not text:
            return result
        
        try:
            translation = self.translator.translate(text, dest=target_lang, src=source_lang)
            result['translated_text'] = translation.text
            result['source_language'] = translation.src
            result['success'] = True
            
        except Exception as e:
            logger.warning(f"Translation failed: {e}")
        
        return result
    
    def analyze_text_patterns(self, text: str) -> Dict[str, Any]:
        """
        Analyze text for various patterns and characteristics.
        
        Args:
            text: Input text to analyze
            
        Returns:
            Dictionary with pattern analysis results
        """
        if not text:
            return {}
        
        analysis = {
            'length': len(text),
            'word_count': len(text.split()),
            'sentence_count': len(re.findall(r'[.!?]+', text)),
            'direction': self.detect_text_direction(text),
            'has_numbers': bool(re.search(r'\d', text)),
            'has_punctuation': bool(re.search(r'[^\w\s]', text)),
            'has_uppercase': bool(re.search(r'[A-Z]', text)),
            'has_lowercase': bool(re.search(r'[a-z]', text)),
            'character_types': self._analyze_character_types(text),
            'encoding_info': self._analyze_encoding(text)
        }
        
        # Add language detection
        lang_info = self.detect_language(text)
        analysis['language_info'] = lang_info
        
        return analysis
    
    def _analyze_character_types(self, text: str) -> Dict[str, int]:
        """Analyze character types in text."""
        types = {
            'alphabetic': 0,
            'numeric': 0,
            'punctuation': 0,
            'whitespace': 0,
            'other': 0
        }
        
        for char in text:
            if char.isalpha():
                types['alphabetic'] += 1
            elif char.isdigit():
                types['numeric'] += 1
            elif char in string.punctuation:
                types['punctuation'] += 1
            elif char.isspace():
                types['whitespace'] += 1
            else:
                types['other'] += 1
        
        return types
    
    def _analyze_encoding(self, text: str) -> Dict[str, Any]:
        """Analyze text encoding characteristics."""
        try:
            # Check for different Unicode categories
            categories = {}
            for char in text:
                category = unicodedata.category(char)
                categories[category] = categories.get(category, 0) + 1
            
            return {
                'unicode_categories': categories,
                'normalized_nfc': unicodedata.normalize('NFC', text) == text,
                'normalized_nfd': unicodedata.normalize('NFD', text) == text,
            }
        except Exception as e:
            logger.warning(f"Encoding analysis failed: {e}")
            return {}
    
    def process_rtl_question(self, text: str) -> Dict[str, Any]:
        """
        Process RTL text questions, specifically handling reversed English text.
        
        Args:
            text: Input text that may be reversed
            
        Returns:
            Dictionary with processing results
        """
        result = {
            'original_text': text,
            'is_reversed': False,
            'reversed_text': '',
            'analysis': {},
            'answer': ''
        }
        
        if not text:
            return result
        
        # Check if text appears to be reversed English
        reversed_text = self.reverse_text(text)
        
        # Analyze both original and reversed versions
        original_analysis = self.analyze_text_patterns(text)
        reversed_analysis = self.analyze_text_patterns(reversed_text)
        
        # Determine if the reversed version makes more sense
        # Look for common English patterns in the reversed text
        english_indicators = [
            'the', 'and', 'or', 'if', 'you', 'understand', 'this', 'sentence',
            'write', 'opposite', 'of', 'word', 'as', 'answer'
        ]
        
        reversed_lower = reversed_text.lower()
        english_score = sum(1 for indicator in english_indicators if indicator in reversed_lower)
        
        if english_score > 3:  # Threshold for detecting English
            result['is_reversed'] = True
            result['reversed_text'] = reversed_text
            result['analysis'] = reversed_analysis
            
            # Special handling for the specific GAIA question
            if 'opposite' in reversed_lower and 'left' in reversed_lower:
                result['answer'] = 'right'
        else:
            result['analysis'] = original_analysis
        
        return result
    
    def extract_answer_from_text(self, text: str, question: str = '') -> str:
        """
        Pull a short answer out of processed text.

        Args:
            text: Processed text, or a result dict that carries an 'answer' key
            question: Original question for context (not used by the
                extraction itself)

        Returns:
            '' for empty input; the 'answer' value for a result dict;
            otherwise the first whitespace-separated token left after
            trimming and removing known answer prefixes
        """
        if not text:
            return ''

        # A result dict from the RTL path already holds its answer.
        if isinstance(text, dict) and 'answer' in text:
            return text['answer']

        cleaned = text.strip()

        # Prefixes are tried once each, in this order, so a later prefix can
        # still be removed after an earlier one was stripped.
        for label in ('answer:', 'the answer is:', 'result:', 'output:'):
            if cleaned.lower().startswith(label):
                cleaned = cleaned[len(label):].strip()

        tokens = cleaned.split()
        return tokens[0] if tokens else cleaned
    
    def process_text_query(self, query: str, context: str = '') -> Dict[str, Any]:
        """
        Process a text query with advanced analysis.
        
        Args:
            query: Text query to process
            context: Additional context
            
        Returns:
            Dictionary with processing results
        """
        result = {
            'query': query,
            'context': context,
            'processing_type': 'standard',
            'analysis': {},
            'answer': '',
            'confidence': 0.0
        }
        
        if not query:
            return result
        
        # Detect if this might be an RTL question
        direction = self.detect_text_direction(query)
        
        if direction == 'rtl' or self._looks_like_reversed_english(query):
            result['processing_type'] = 'rtl'
            rtl_result = self.process_rtl_question(query)
            result.update(rtl_result)
            result['confidence'] = 0.9 if rtl_result['is_reversed'] else 0.3
        else:
            result['processing_type'] = 'standard'
            result['analysis'] = self.analyze_text_patterns(query)
            result['answer'] = self.extract_answer_from_text(query)
            result['confidence'] = 0.7
        
        return result
    
    def _looks_like_reversed_english(self, text: str) -> bool:
        """Check if text looks like reversed English."""
        if not text:
            return False
        
        # Check for reversed English patterns
        reversed_text = self.reverse_text(text)
        english_words = ['the', 'and', 'if', 'you', 'this', 'write', 'word', 'answer']
        
        found_words = sum(1 for word in english_words if word in reversed_text.lower())
        return found_words >= 2


def get_advanced_text_processing_tools() -> List[AdvancedTextProcessor]:
    """
    Build the list of advanced text processing tools.

    Returns a one-element list holding a fresh AdvancedTextProcessor, or an
    empty list when the processor reports itself unavailable or cannot be
    constructed. Failures are logged rather than raised.
    """
    try:
        processor = AdvancedTextProcessor()
        if not processor.available:
            logger.warning("⚠️ Advanced text processor not available")
            return []
        return [processor]
    except Exception as e:
        logger.error(f"❌ Failed to create advanced text processor: {e}")
        return []