# hackrx/utils/text_processing.py
import re
import hashlib
from typing import Dict, List
from urllib.parse import urlparse

def clean_text(text: str) -> str:
    """Clean and normalize text content."""
    # Collapse runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text)
    # Drop special characters but keep common punctuation and brackets
    text = re.sub(r'[^\w\s\.,!?;:\-\(\)\[\]{}"\']', ' ', text)
    # Collapse any whitespace runs reintroduced by the substitution above
    text = re.sub(r'\s+', ' ', text)
    # Normalize spacing around punctuation: none before, one space after
    text = re.sub(r'\s+([,.!?;:])', r'\1', text)
    text = re.sub(r'([,.!?;:])\s*', r'\1 ', text)
    return text.strip()
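
# Illustrative example (expected output traced by hand, not from the repo's tests):
#   clean_text("Hello ,   world!How are you ?")  ->  'Hello, world! How are you?'
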
def extract_domain_keywords(text: str, domain: str = "insurance") -> List[str]:
"""Extract domain-specific keywords from text"""
domain_patterns = {
"insurance": [
r'\b(?:policy|coverage|premium|claim|benefit|deductible|copay)\b',
r'\b(?:waiting period|grace period|renewal|exclusion)\b',
r'\b(?:insured|insurer|policyholder|beneficiary)\b'
],
"legal": [
r'\b(?:contract|agreement|clause|provision|liability)\b',
r'\b(?:terms|conditions|obligations|rights|duties)\b',
r'\b(?:breach|compliance|violation|penalty)\b'
],
"hr": [
r'\b(?:employee|employer|employment|salary|benefits)\b',
r'\b(?:leave|vacation|sick|medical|dental)\b',
r'\b(?:performance|evaluation|promotion|termination)\b'
]
}
keywords = []
    # Unknown domains fall back to the insurance patterns
    patterns = domain_patterns.get(domain, domain_patterns["insurance"])
for pattern in patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
keywords.extend([match.lower() for match in matches])
    # Deduplicate while preserving first-seen order (a plain set() is unordered)
    return list(dict.fromkeys(keywords))
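
# Illustrative usage (the \b anchors match singular forms only: 'claim' but not 'claims'):
#   extract_domain_keywords("The policy premium covers each claim after the waiting period.")
#   ->  ['policy', 'premium', 'claim', 'waiting period']
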
def create_document_hash(content: str) -> str:
    """Create a hash of document content for cache keys (not for security)."""
    # MD5 is acceptable only because this is a cache key; switch to
    # hashlib.sha256 if collision resistance ever matters.
    return hashlib.md5(content.encode("utf-8")).hexdigest()
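
# Illustrative usage: identical content always maps to the same key.
#   create_document_hash("hello")  ->  '5d41402abc4b2a76b9719d911017c592'
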
def is_valid_url(url: str) -> bool:
    """Check whether a URL has both a scheme and a network location."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        # urlparse raises ValueError on malformed input such as bad IPv6 brackets
        return False
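
# Illustrative checks: a bare domain has no scheme, so it is rejected.
#   is_valid_url("https://example.com/docs")  ->  True
#   is_valid_url("example.com")               ->  False
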
def split_text_smartly(text: str, max_chunk_size: int, overlap: int = 100) -> List[str]:
    """Split text into chunks while preserving sentence boundaries.

    Note: a single sentence longer than max_chunk_size is kept whole, so
    chunk lengths are not strictly bounded.
    """
    if len(text) <= max_chunk_size:
        return [text]
    chunks = []
    sentences = re.split(r'(?<=[.!?])\s+', text)
    current_chunk = ""
    for sentence in sentences:
        # +1 accounts for the joining space added when sentences are appended
        if current_chunk and len(current_chunk) + 1 + len(sentence) > max_chunk_size:
            chunks.append(current_chunk.strip())
            # Carry roughly `overlap` characters into the next chunk,
            # approximated as overlap // 10 trailing words (~10 chars per word)
            words = current_chunk.split()
            if len(words) > overlap // 10:
                overlap_text = ' '.join(words[-(overlap // 10):])
                current_chunk = overlap_text + " " + sentence
            else:
                current_chunk = sentence
        else:
            current_chunk = current_chunk + " " + sentence if current_chunk else sentence
    # Add the final chunk
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
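
# Illustrative usage (hypothetical sizes; the bound can be exceeded by a single
# long sentence, as noted in the docstring):
#   chunks = split_text_smartly(long_report, max_chunk_size=1000, overlap=100)
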
def calculate_similarity_score(query_embedding, doc_embedding) -> float:
    """Calculate cosine similarity between two embedding vectors."""
    try:
        import numpy as np  # imported lazily so numpy stays an optional dependency
        query_norm = np.linalg.norm(query_embedding)
        doc_norm = np.linalg.norm(doc_embedding)
        # Guard against zero vectors, which would otherwise divide by zero
        if query_norm == 0 or doc_norm == 0:
            return 0.0
        similarity = np.dot(query_embedding, doc_embedding) / (query_norm * doc_norm)
        return float(similarity)
    except Exception:
        return 0.0
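
# Illustrative checks: orthogonal vectors score 0.0, parallel vectors 1.0.
#   calculate_similarity_score([1.0, 0.0], [0.0, 1.0])  ->  0.0
#   calculate_similarity_score([2.0, 0.0], [1.0, 0.0])  ->  1.0
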
def format_processing_time(seconds: float) -> str:
"""Format processing time in human-readable format"""
if seconds < 1:
return f"{seconds*1000:.0f}ms"
elif seconds < 60:
return f"{seconds:.1f}s"
else:
minutes = int(seconds // 60)
remaining_seconds = seconds % 60
return f"{minutes}m {remaining_seconds:.1f}s"
def extract_numbers_and_dates(text: str) -> Dict[str, List[str]]:
    """Extract numbers and dates from text.

    Note: the numeric parts of a date ('12', '31', '2024' in '12/31/2024')
    also appear in the "numbers" list.
    """
    numbers = re.findall(r'\b\d+(?:\.\d+)?\b', text)
    dates = re.findall(
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b'
        r'|\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{2,4}\b',
        text, re.IGNORECASE)
    return {
        "numbers": numbers,
        "dates": dates
    }
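
# Illustrative usage (date components repeat in "numbers", per the docstring note):
#   extract_numbers_and_dates("Pay 250.50 by 12/31/2024 or 5 Jan 2025")
#   ->  {'numbers': ['250.50', '12', '31', '2024', '5', '2025'],
#        'dates': ['12/31/2024', '5 Jan 2025']}
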
def truncate_text(text: str, max_length: int, suffix: str = "...") -> str:
    """Truncate text to a maximum length, appending a suffix when cut."""
    if len(text) <= max_length:
        return text
    # If the limit cannot even fit the suffix, hard-cut without it
    if max_length <= len(suffix):
        return text[:max_length]
    return text[:max_length - len(suffix)] + suffix
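
# Illustrative usage: the suffix counts toward the limit.
#   truncate_text("Hello world", 8)  ->  'Hello...'


if __name__ == "__main__":
    # Minimal smoke test (not part of the original file) exercising the helpers
    sample = "The policy premium covers each claim. Renewal follows a grace period."
    print(clean_text("  Spaced   text , with  odd   punctuation ! "))
    print(extract_domain_keywords(sample))
    print(split_text_smartly(sample, max_chunk_size=40, overlap=20))
    print(format_processing_time(75.0))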