# legal-doc-backend/backend/app/utils/enhanced_legal_processor.py
# Harsh Upadhyay
# Adding backend to Spaces with initial commit.
# 8397f09
import re
from typing import Dict, List, Any
class EnhancedLegalProcessor:
    """Regex-based extractor for structural elements of legal document text.

    Every extractor is a plain pattern match over the raw string; no parsing
    or validation of the matched content is performed.
    """

    def __init__(self):
        # Patterns for different document elements.
        # Two or more consecutive "| ... |" rows separated by single newlines.
        self.table_pattern = re.compile(r'(\|\s*[^\n]+\s*\|(?:\n\|\s*[^\n]+\s*\|)+)')
        # A run of consecutive lines that start with "1.", "*" or "-".
        self.list_pattern = re.compile(r'(?:^|\n)(?:\d+\.|\*|\-)\s+[^\n]+(?:\n(?:\d+\.|\*|\-)\s+[^\n]+)*')
        # Any text enclosed in a pair of dollar signs.
        # NOTE(review): two currency amounts ("$5 and $10") also match.
        self.formula_pattern = re.compile(r'\$[^$]+\$')
        # One or more whitespace-separated words of 2+ uppercase ASCII letters.
        self.abbreviation_pattern = re.compile(r'\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\b')
        # "X means Y" or "X shall mean Y". The term group is lazy so that it
        # ends at the FIRST "means"/"shall mean"; a greedy group would absorb
        # any later "means" inside the definition into the term.
        self.definition_pattern = re.compile(
            r'([A-Z][A-Za-z\s]+?)(?:\s+means|\s+shall\s+mean)\s+([^\.]+)'
        )

    def process_document(self, text: str) -> Dict[str, Any]:
        """Process a legal document and extract various elements.

        Args:
            text: Raw document text.

        Returns:
            A dict with the keys ``tables``, ``lists``, ``formulas``,
            ``abbreviations`` (lists of matched strings), ``definitions``
            (term -> definition) and ``cleaned_text`` (normalized text).
        """
        return {
            "tables": self._extract_tables(text),
            "lists": self._extract_lists(text),
            "formulas": self._extract_formulas(text),
            "abbreviations": self._extract_abbreviations(text),
            "definitions": self._extract_definitions(text),
            "cleaned_text": self._clean_text(text),
        }

    def _extract_tables(self, text: str) -> List[str]:
        """Return every pipe-delimited table (two or more rows) found in text."""
        return self.table_pattern.findall(text)

    def _extract_lists(self, text: str) -> List[str]:
        """Return every numbered or bulleted list found in text.

        Each entry is one run of consecutive list lines joined by newlines.
        """
        # The pattern anchors on the newline that precedes the first item and
        # therefore includes it in the match; drop it so each entry starts at
        # the list marker itself.
        return [
            match.group(0).lstrip('\n')
            for match in self.list_pattern.finditer(text)
        ]

    def _extract_formulas(self, text: str) -> List[str]:
        """Return every ``$...$`` span found in text, delimiters included."""
        return self.formula_pattern.findall(text)

    def _extract_abbreviations(self, text: str) -> List[str]:
        """Return every uppercase abbreviation found in text.

        Matches are returned in order of appearance; repeats are not removed.
        """
        return self.abbreviation_pattern.findall(text)

    def _extract_definitions(self, text: str) -> Dict[str, str]:
        """Return a mapping of defined term to its definition.

        Recognizes "X means Y" and "X shall mean Y"; the definition runs up to
        the next period. If a term is defined more than once, the last
        definition wins.
        """
        return {
            match.group(1).strip(): match.group(2).strip()
            for match in self.definition_pattern.finditer(text)
        }

    def _clean_text(self, text: str) -> str:
        """Normalize whitespace while keeping single line breaks.

        Runs of spaces/tabs (any whitespace other than a newline) become one
        space, runs of newlines (with any surrounding spaces) become one
        newline, and leading/trailing whitespace is removed.
        """
        # Collapse horizontal whitespace only. Using plain ``\s+`` here would
        # also swallow newlines and make the newline step below a no-op.
        text = re.sub(r'[^\S\n]+', ' ', text)
        # Collapse blank lines and trim the spaces hugging each line break.
        text = re.sub(r' ?\n[ \n]*', '\n', text)
        # Remove leading/trailing whitespace
        return text.strip()
# Create a singleton instance: built once when this module is imported, so
# its regex patterns are compiled a single time. Intended as the shared
# processor for importers of this module.
enhanced_legal_processor = EnhancedLegalProcessor()