# Spaces: Runtime error
import re | |
from typing import Dict, List, Any | |
class EnhancedLegalProcessor:
    """Regex-based extractor for structural elements of a text document.

    The compiled patterns are stored as instance attributes in ``__init__``
    and reused by the ``_extract_*`` helpers.
    """

    def __init__(self):
        """Compile the regular expressions used by the extraction helpers."""
        # Two or more consecutive pipe-delimited rows.  Each row is matched
        # with ``[^\n]+`` so it can never run past its own line; rows are
        # joined by an optional run of trailing blanks plus one newline.
        # (Using ``\s*`` before the closing pipe let a row swallow the next
        # line's leading pipe and truncated tables of three or more rows.)
        self.table_pattern = re.compile(
            r'(\|[^\n]+\|(?:[ \t]*\n\|[^\n]+\|)+)'
        )
        # One or more consecutive lines starting with "1.", "*" or "-".
        # MULTILINE ``^`` anchors at a line start without consuming the
        # preceding newline, so matches do not begin with "\n".
        self.list_pattern = re.compile(
            r'^(?:\d+\.|\*|-)[ \t]+[^\n]+(?:\n(?:\d+\.|\*|-)[ \t]+[^\n]+)*',
            re.MULTILINE,
        )
        # Inline formula delimited by single dollar signs.
        self.formula_pattern = re.compile(r'\$[^$]+\$')
        # Runs of two or more capital letters, optionally several such
        # words separated by whitespace.
        self.abbreviation_pattern = re.compile(r'\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\b')
        # "X means Y" / "X shall mean Y"; Y runs up to the next period.
        # Compiled once here rather than on every extraction call.
        self.definition_pattern = re.compile(
            r'([A-Z][A-Za-z\s]+)(?:\s+means|\s+shall\s+mean)\s+([^\.]+)'
        )

    def process_document(self, text: str) -> Dict[str, Any]:
        """Run every extractor over *text* and collect the results.

        Args:
            text: The raw document text.

        Returns:
            A dict with the keys ``"tables"``, ``"lists"``, ``"formulas"``,
            ``"abbreviations"``, ``"definitions"`` and ``"cleaned_text"``.
        """
        return {
            "tables": self._extract_tables(text),
            "lists": self._extract_lists(text),
            "formulas": self._extract_formulas(text),
            "abbreviations": self._extract_abbreviations(text),
            "definitions": self._extract_definitions(text),
            "cleaned_text": self._clean_text(text),
        }

    def _extract_tables(self, text: str) -> List[str]:
        """Return every block of two or more consecutive pipe-delimited rows."""
        return self.table_pattern.findall(text)

    def _extract_lists(self, text: str) -> List[str]:
        """Return every run of consecutive numbered or bulleted lines.

        Each returned string starts at the first list marker; it does not
        include the newline that precedes the list.
        """
        return self.list_pattern.findall(text)

    def _extract_formulas(self, text: str) -> List[str]:
        """Return every ``$...$`` span, delimiters included."""
        return self.formula_pattern.findall(text)

    def _extract_abbreviations(self, text: str) -> List[str]:
        """Return every all-caps token sequence, in order, duplicates kept."""
        return self.abbreviation_pattern.findall(text)

    def _extract_definitions(self, text: str) -> Dict[str, str]:
        """Return a mapping of defined term to its definition text.

        A definition is recognised as ``<Term> means <definition>`` or
        ``<Term> shall mean <definition>``, where the definition extends up
        to (not including) the next period.  Both parts are stripped of
        surrounding whitespace.  If a term is defined more than once, the
        last occurrence wins.
        """
        return {
            match.group(1).strip(): match.group(2).strip()
            for match in self.definition_pattern.finditer(text)
        }

    def _clean_text(self, text: str) -> str:
        """Normalise whitespace in *text* while keeping line breaks.

        Runs of horizontal whitespace become a single space, runs of
        newlines (with any surrounding blanks) become a single newline, and
        leading/trailing whitespace is removed.
        """
        # Collapse horizontal whitespace only.  ``\s+`` would also consume
        # the newlines and leave nothing for the next step to collapse.
        text = re.sub(r'[^\S\n]+', ' ', text)
        # Collapse blank lines and drop blanks adjacent to a line break.
        text = re.sub(r'\s*\n\s*', '\n', text)
        return text.strip()
# Shared module-level instance, constructed once when this module is imported.
enhanced_legal_processor: EnhancedLegalProcessor = EnhancedLegalProcessor()