Spaces:
Running
Running
import re | |
import hashlib | |
from typing import List, Dict, Any | |
from urllib.parse import urlparse | |
def clean_text(text: str) -> str:
    """Clean and normalize text content.

    The steps are, in order:

    1. Replace every character that is not a word character, whitespace,
       or one of ``. , ! ? ; : - ( ) [ ] { } " '`` with a space.
    2. Collapse each run of whitespace into a single space.
    3. Remove whitespace that precedes ``, . ! ? ; :``.
    4. Ensure exactly one space follows ``, . ! ? ; :``, except when the
       mark sits directly between two digits (``3.5``, ``1,000``,
       ``10:30``), so numeric tokens stay intact.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text with leading/trailing whitespace stripped.
    """
    # Replace disallowed characters *before* collapsing whitespace; doing it
    # afterwards would leave runs of spaces behind (e.g. "a @ b" -> "a   b").
    text = re.sub(r'[^\w\s\.,!?;:\-\(\)\[\]{}"\']', ' ', text)
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    # Fix spacing around punctuation
    text = re.sub(r'\s+([,.!?;:])', r'\1', text)
    # The negative lookahead skips a mark that is preceded by a digit and
    # immediately followed by a digit, so decimals and thousands separators
    # are not split apart.
    text = re.sub(r'([,.!?;:])(?!(?<=\d[.,:])\d)\s*', r'\1 ', text)
    return text.strip()
def extract_domain_keywords(text: str, domain: str = "insurance") -> List[str]:
    """Extract domain-specific keywords from text.

    Matching is case-insensitive and whole-word. Two-word terms such as
    "waiting period" are matched across any run of whitespace (including
    line breaks) and reported with a single space.

    Args:
        text: Text to scan.
        domain: One of ``"insurance"``, ``"legal"`` or ``"hr"``. Any other
            value falls back to the insurance patterns.

    Returns:
        Unique lower-cased keywords in a deterministic order: pattern group
        by pattern group, and within a group in order of first appearance.
    """
    domain_patterns = {
        "insurance": [
            r'\b(?:policy|coverage|premium|claim|benefit|deductible|copay)\b',
            r'\b(?:waiting\s+period|grace\s+period|renewal|exclusion)\b',
            r'\b(?:insured|insurer|policyholder|beneficiary)\b',
        ],
        "legal": [
            r'\b(?:contract|agreement|clause|provision|liability)\b',
            r'\b(?:terms|conditions|obligations|rights|duties)\b',
            r'\b(?:breach|compliance|violation|penalty)\b',
        ],
        "hr": [
            r'\b(?:employee|employer|employment|salary|benefits)\b',
            r'\b(?:leave|vacation|sick|medical|dental)\b',
            r'\b(?:performance|evaluation|promotion|termination)\b',
        ],
    }
    patterns = domain_patterns.get(domain, domain_patterns["insurance"])

    keywords = []
    for pattern in patterns:
        for match in re.findall(pattern, text, re.IGNORECASE):
            # Normalize internal whitespace so "grace\nperiod" and
            # "grace period" count as the same keyword.
            keywords.append(' '.join(match.lower().split()))

    # dict.fromkeys removes duplicates while keeping first-seen order;
    # a set would return the keywords in an arbitrary order.
    return list(dict.fromkeys(keywords))
def create_document_hash(content: str) -> str:
    """Return the hexadecimal MD5 digest of *content*, used as a cache key.

    Args:
        content: Document text; it is encoded with ``str.encode()``'s
            default encoding before hashing.

    Returns:
        The 32-character lowercase hex digest.
    """
    digest = hashlib.md5()
    digest.update(content.encode())
    return digest.hexdigest()
def is_valid_url(url: str) -> bool:
    """Check whether *url* has both a scheme and a network location.

    This is a structural check only: any scheme is accepted and the host is
    not resolved or contacted.

    Args:
        url: Candidate URL string.

    Returns:
        ``True`` when ``urlparse`` yields a non-empty scheme and netloc,
        ``False`` otherwise (including when parsing fails).
    """
    try:
        parsed = urlparse(url)
    except (ValueError, AttributeError, TypeError):
        # ValueError: malformed input such as an unbalanced IPv6 bracket.
        # AttributeError/TypeError: input that is not str/bytes.
        # Catching only these keeps KeyboardInterrupt/SystemExit propagating,
        # which the previous bare ``except:`` swallowed.
        return False
    return bool(parsed.scheme and parsed.netloc)
def split_text_smartly(text: str, max_chunk_size: int, overlap: int = 100) -> List[str]:
    """Split text into chunks while preserving sentence boundaries.

    Sentences are detected by whitespace that follows ``.``, ``!`` or ``?``
    and are packed greedily into chunks of at most ``max_chunk_size``
    characters. Each new chunk is seeded with the last ``overlap // 10``
    words of the previous chunk to give some shared context.

    Sentences are never split, so a single sentence longer than
    ``max_chunk_size`` becomes an oversized chunk. The overlap words are
    added on top of the size budget, so a chunk that starts with overlap
    may also exceed ``max_chunk_size``.

    Args:
        text: Text to split.
        max_chunk_size: Target maximum chunk length in characters.
        overlap: Approximate overlap in characters; converted to a word
            count as ``overlap // 10``. Values below 10 disable overlap.

    Returns:
        The list of stripped chunks. Text that already fits is returned
        unchanged as a single-element list.
    """
    if len(text) <= max_chunk_size:
        return [text]

    # Clamp at zero: ``words[-0:]`` is the whole list, so an unclamped zero
    # (or negative) count would drag the entire previous chunk forward.
    overlap_words = max(overlap // 10, 0)

    chunks = []
    current_chunk = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if not current_chunk:
            current_chunk = sentence
            continue

        # +1 accounts for the space that joins the sentence to the chunk.
        if len(current_chunk) + 1 + len(sentence) <= max_chunk_size:
            current_chunk += " " + sentence
            continue

        chunks.append(current_chunk.strip())

        # Start new chunk with overlap
        words = current_chunk.split()
        if 0 < overlap_words < len(words):
            current_chunk = ' '.join(words[-overlap_words:]) + " " + sentence
        else:
            current_chunk = sentence

    # Add final chunk
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
def calculate_similarity_score(query_embedding, doc_embedding) -> float:
    """Calculate cosine similarity between two embeddings.

    Inputs are converted with ``numpy.asarray`` so plain Python sequences
    work as well as arrays.

    Args:
        query_embedding: Numeric vector for the query.
        doc_embedding: Numeric vector for the document; must be compatible
            with ``query_embedding`` for ``numpy.dot``.

    Returns:
        The cosine similarity as a float. Returns ``0.0`` when either vector
        has zero (or non-finite) norm, when numpy is unavailable, or when the
        inputs cannot be converted or combined.
    """
    try:
        import numpy as np

        query_vec = np.asarray(query_embedding, dtype=float)
        doc_vec = np.asarray(doc_embedding, dtype=float)

        denominator = np.linalg.norm(query_vec) * np.linalg.norm(doc_vec)
        # A zero-length vector has no direction; dividing would yield NaN.
        if denominator == 0.0 or not np.isfinite(denominator):
            return 0.0

        return float(np.dot(query_vec, doc_vec) / denominator)
    except (ImportError, TypeError, ValueError):
        # Best-effort scoring: unusable input scores as "no similarity".
        return 0.0
def format_processing_time(seconds: float) -> str:
    """Format processing time in human-readable format.

    Args:
        seconds: Duration in seconds.

    Returns:
        ``"<n>ms"`` for durations that round to under one second,
        ``"<s.s>s"`` under one minute, otherwise ``"<m>m <s.s>s"``.
        Rounding happens before the unit is chosen, so values just below a
        boundary roll over to the next unit instead of producing
        ``"1000ms"``, ``"60.0s"`` or ``"1m 60.0s"``.
    """
    if seconds < 1:
        millis = f"{seconds * 1000:.0f}"
        if millis != "1000":
            return f"{millis}ms"
        # Rounded up to a full second: fall through to seconds formatting.

    # Work in whole tenths of a second (the display precision) so the
    # minute/second split is done on the already-rounded value.
    total_tenths = round(float(f"{seconds:.1f}") * 10)
    if total_tenths < 600:
        return f"{total_tenths / 10:.1f}s"

    minutes, remaining_tenths = divmod(total_tenths, 600)
    return f"{minutes}m {remaining_tenths / 10:.1f}s"
def extract_numbers_and_dates(text: str) -> Dict[str, List[str]]:
    """Collect numeric tokens and date-like strings found in *text*.

    Numbers are integer or decimal tokens. Dates are either
    ``D/M/Y``-shaped tokens separated by ``-`` or ``/``, or a day, a month
    name starting with a three-letter English abbreviation, and a year
    (matched case-insensitively). The digits that make up a date are also
    reported as individual numbers.

    Args:
        text: Text to scan.

    Returns:
        A dict with keys ``"numbers"`` and ``"dates"``, each holding the
        matched substrings in order of appearance.
    """
    number_re = re.compile(r'\b\d+(?:\.\d+)?\b')
    date_re = re.compile(
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{2,4}\b',
        re.IGNORECASE,
    )

    found = {}
    found["numbers"] = number_re.findall(text)
    found["dates"] = date_re.findall(text)
    return found
def truncate_text(text: str, max_length: int, suffix: str = "...") -> str:
    """Truncate text to a maximum length, marking the cut with *suffix*.

    Args:
        text: Text to shorten.
        max_length: Maximum length of the returned string.
        suffix: Marker appended when text is cut; it counts towards
            ``max_length``.

    Returns:
        ``text`` unchanged when it already fits. Otherwise a string of at
        most ``max_length`` characters ending in ``suffix``. When
        ``max_length`` leaves no room for any text, the suffix itself is cut
        to fit, and the result is empty for ``max_length <= 0``.
    """
    if len(text) <= max_length:
        return text
    if max_length <= len(suffix):
        # Without this guard the slice index below would go negative and the
        # result would come out longer than max_length.
        return suffix[:max(max_length, 0)]
    return text[:max_length - len(suffix)] + suffix