import hashlib
import re
from typing import Any, Dict, List
from urllib.parse import urlparse


def clean_text(text: str) -> str:
    """Clean and normalize text content."""
    # Collapse runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters but keep word characters and common punctuation
    text = re.sub(r'[^\w\s\.,!?;:\-\(\)\[\]{}"\']', ' ', text)
    # Remove spaces before punctuation
    text = re.sub(r'\s+([,.!?;:])', r'\1', text)
    # Ensure a single space after punctuation; skip digits so decimals
    # like "3.14" are not split into "3. 14"
    text = re.sub(r'([,.!?;:])(?!\d)\s*', r'\1 ', text)
    return text.strip()


def extract_domain_keywords(text: str, domain: str = "insurance") -> List[str]:
    """Extract domain-specific keywords from text."""
    domain_patterns = {
        "insurance": [
            r'\b(?:policy|coverage|premium|claim|benefit|deductible|copay)\b',
            r'\b(?:waiting period|grace period|renewal|exclusion)\b',
            r'\b(?:insured|insurer|policyholder|beneficiary)\b',
        ],
        "legal": [
            r'\b(?:contract|agreement|clause|provision|liability)\b',
            r'\b(?:terms|conditions|obligations|rights|duties)\b',
            r'\b(?:breach|compliance|violation|penalty)\b',
        ],
        "hr": [
            r'\b(?:employee|employer|employment|salary|benefits)\b',
            r'\b(?:leave|vacation|sick|medical|dental)\b',
            r'\b(?:performance|evaluation|promotion|termination)\b',
        ],
    }

    keywords = []
    # Fall back to the insurance patterns for unrecognized domains
    patterns = domain_patterns.get(domain, domain_patterns["insurance"])
    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        keywords.extend(match.lower() for match in matches)

    # Deduplicate; note that set() does not preserve order
    return list(set(keywords))


def create_document_hash(content: str) -> str:
    """Create a hash of document content for caching.

    MD5 is used for speed; this is a cache key, not a security feature.
    """
    return hashlib.md5(content.encode()).hexdigest()


def is_valid_url(url: str) -> bool:
    """Check whether a URL has both a scheme and a network location."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False


def split_text_smartly(text: str, max_chunk_size: int, overlap: int = 100) -> List[str]:
    """Split text into chunks while preserving sentence boundaries.

    Note: a single sentence longer than max_chunk_size is kept whole,
    so a chunk can exceed the limit in that case.
    """
    if len(text) <= max_chunk_size:
        return [text]

    chunks = []
    sentences = re.split(r'(?<=[.!?])\s+', text)
    current_chunk = ""

    for sentence in sentences:
        # Would adding this sentence exceed the chunk size?
        if len(current_chunk) + len(sentence) > max_chunk_size and current_chunk:
            chunks.append(current_chunk.strip())
            # Seed the new chunk with trailing words from the previous one;
            # overlap is given in characters and approximated as ~10
            # characters per word
            words = current_chunk.split()
            overlap_words = overlap // 10
            if len(words) > overlap_words:
                overlap_text = ' '.join(words[-overlap_words:])
                current_chunk = overlap_text + " " + sentence
            else:
                current_chunk = sentence
        else:
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence

    # Add the final chunk
    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks


def calculate_similarity_score(query_embedding: Any, doc_embedding: Any) -> float:
    """Calculate cosine similarity between two embedding vectors."""
    try:
        import numpy as np  # lazy import so numpy stays an optional dependency

        query_norm = np.linalg.norm(query_embedding)
        doc_norm = np.linalg.norm(doc_embedding)
        # Guard against zero vectors to avoid division by zero
        if query_norm == 0 or doc_norm == 0:
            return 0.0
        similarity = np.dot(query_embedding, doc_embedding) / (query_norm * doc_norm)
        return float(similarity)
    except Exception:
        return 0.0


def format_processing_time(seconds: float) -> str:
    """Format a processing time in a human-readable form."""
    if seconds < 1:
        return f"{seconds * 1000:.0f}ms"
    elif seconds < 60:
        return f"{seconds:.1f}s"
    else:
        minutes = int(seconds // 60)
        remaining_seconds = seconds % 60
        return f"{minutes}m {remaining_seconds:.1f}s"


def extract_numbers_and_dates(text: str) -> Dict[str, List[str]]:
    """Extract numbers and dates from text."""
    numbers = re.findall(r'\b\d+(?:\.\d+)?\b', text)
    dates = re.findall(
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b'
        r'|\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{2,4}\b',
        text,
        re.IGNORECASE,
    )
    return {"numbers": numbers, "dates": dates}


def truncate_text(text: str, max_length: int, suffix: str = "...") -> str:
    """Truncate text to a maximum length, appending a suffix when cut."""
    if len(text) <= max_length:
        return text
    return text[:max_length - len(suffix)] + suffix
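

# --- Usage sketch (hypothetical; not part of the module's API) ---
# A minimal smoke test showing how these helpers compose in a document
# pipeline. The sample text, chunk sizes, and vectors below are made up
# for illustration only.
if __name__ == "__main__":
    sample = "The policy has a  waiting period of 30 days. Claims filed on 12/05/2024 are covered!"

    # Extract dates before cleaning: clean_text() strips "/" characters,
    # so slashed dates must be pulled from the raw text
    print(extract_numbers_and_dates(sample))

    cleaned = clean_text(sample)
    print(cleaned)
    print(extract_domain_keywords(cleaned, domain="insurance"))
    print(create_document_hash(cleaned))
    print(is_valid_url("https://example.com/policy"))

    for chunk in split_text_smartly(cleaned * 5, max_chunk_size=120, overlap=40):
        print(len(chunk), repr(truncate_text(chunk, 60)))

    # Returns 0.0 if numpy is not installed (ImportError is swallowed)
    print(calculate_similarity_score([1.0, 0.0], [1.0, 1.0]))
    print(format_processing_time(0.45), format_processing_time(75.3))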