Spaces:
Runtime error
Runtime error
import re | |
from typing import Dict, List, Set, Any | |
class LegalDomainFeatures: | |
def __init__(self): | |
# Initialize sets for different legal entities | |
self.parties = set() | |
self.dates = set() | |
self.amounts = set() | |
self.citations = set() | |
self.jurisdictions = set() | |
self.courts = set() | |
self.statutes = set() | |
self.regulations = set() | |
self.cases = set() | |
# Compile regex patterns | |
self.patterns = { | |
'parties': re.compile(r'\b(?:Party|Parties|Lessor|Lessee|Buyer|Seller|Plaintiff|Defendant)\s+(?:of|to|in|the)\s+(?:the\s+)?(?:first|second|third|fourth|fifth)\s+(?:part|party)\b'), | |
'dates': re.compile(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?,\s+\d{4}\b'), | |
'amounts': re.compile(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?'), | |
'citations': re.compile(r'\b\d+\s+U\.S\.C\.\s+\d+|\b\d+\s+F\.R\.\s+\d+|\b\d+\s+CFR\s+\d+'), | |
'jurisdictions': re.compile(r'\b(?:State|Commonwealth|District|Territory)\s+of\s+[A-Za-z\s]+'), | |
'courts': re.compile(r'\b(?:Supreme|Appellate|District|Circuit|County|Municipal)\s+Court\b'), | |
'statutes': re.compile(r'\b(?:Act|Statute|Law|Code)\s+of\s+[A-Za-z\s]+\b'), | |
'regulations': re.compile(r'\b(?:Regulation|Rule|Order)\s+\d+\b'), | |
'cases': re.compile(r'\b[A-Za-z]+\s+v\.\s+[A-Za-z]+\b') | |
} | |
def process_legal_document(self, text: str) -> Dict[str, Any]: | |
"""Process a legal document and extract domain-specific features.""" | |
# Clear previous extractions | |
self._clear_extractions() | |
# Extract legal entities | |
self._extract_legal_entities(text) | |
# Extract relationships | |
relationships = self._extract_legal_relationships(text) | |
# Extract legal terms | |
terms = self._extract_legal_terms(text) | |
# Categorize document | |
category = self._categorize_document(text) | |
return { | |
"entities": { | |
"parties": list(self.parties), | |
"dates": list(self.dates), | |
"amounts": list(self.amounts), | |
"citations": list(self.citations), | |
"jurisdictions": list(self.jurisdictions), | |
"courts": list(self.courts), | |
"statutes": list(self.statutes), | |
"regulations": list(self.regulations), | |
"cases": list(self.cases) | |
}, | |
"relationships": relationships, | |
"terms": terms, | |
"category": category | |
} | |
def _clear_extractions(self): | |
"""Clear all extracted entities.""" | |
self.parties.clear() | |
self.dates.clear() | |
self.amounts.clear() | |
self.citations.clear() | |
self.jurisdictions.clear() | |
self.courts.clear() | |
self.statutes.clear() | |
self.regulations.clear() | |
self.cases.clear() | |
def _extract_legal_entities(self, text: str): | |
"""Extract legal entities from the text.""" | |
for entity_type, pattern in self.patterns.items(): | |
matches = pattern.finditer(text) | |
for match in matches: | |
getattr(self, entity_type).add(match.group()) | |
def _extract_legal_relationships(self, text: str) -> List[Dict[str, str]]: | |
"""Extract legal relationships from the text.""" | |
relationships = [] | |
# Pattern for relationships like "X shall Y" or "X must Y" | |
relationship_pattern = re.compile(r'([A-Z][A-Za-z\s]+)(?:\s+shall|\s+must|\s+will)\s+([^\.]+)') | |
for match in relationship_pattern.finditer(text): | |
subject = match.group(1).strip() | |
obligation = match.group(2).strip() | |
relationships.append({ | |
"subject": subject, | |
"obligation": obligation | |
}) | |
return relationships | |
def _extract_legal_terms(self, text: str) -> Dict[str, str]: | |
"""Extract legal terms and their definitions.""" | |
terms = {} | |
# Pattern for terms like "X means Y" or "X shall mean Y" | |
term_pattern = re.compile(r'([A-Z][A-Za-z\s]+)(?:\s+means|\s+shall\s+mean)\s+([^\.]+)') | |
for match in term_pattern.finditer(text): | |
term = match.group(1).strip() | |
definition = match.group(2).strip() | |
terms[term] = definition | |
return terms | |
def _categorize_document(self, text: str) -> str: | |
"""Categorize the document based on its content.""" | |
# Simple categorization based on keywords | |
if any(word in text.lower() for word in ['contract', 'agreement', 'lease']): | |
return "Contract" | |
elif any(word in text.lower() for word in ['complaint', 'petition', 'motion']): | |
return "Pleading" | |
elif any(word in text.lower() for word in ['statute', 'act', 'law']): | |
return "Statute" | |
elif any(word in text.lower() for word in ['regulation', 'rule', 'order']): | |
return "Regulation" | |
else: | |
return "Other" | |
# Create a singleton instance | |
legal_domain_features = LegalDomainFeatures() |