# legal-doc-backend/backend/app/utils/legal_domain_features.py
# Harsh Upadhyay
# adding backend to spaces with initial commit.
# 8397f09
import re
from typing import Dict, List, Set, Any
class LegalDomainFeatures:
    """Regex-based extractor of legal entities, obligations, definitions and a coarse category.

    The extracted entities of the most recent ``process_legal_document`` call
    are kept on the instance as sets (``parties``, ``dates``, ``amounts``,
    ``citations``, ``jurisdictions``, ``courts``, ``statutes``,
    ``regulations``, ``cases``); every call clears them before extracting.
    """

    # Entity kinds in output order. Each name is both a key of ``patterns``
    # and the name of the instance attribute (a set) collecting its matches.
    _ENTITY_TYPES = (
        'parties',
        'dates',
        'amounts',
        'citations',
        'jurisdictions',
        'courts',
        'statutes',
        'regulations',
        'cases',
    )

    # "X shall/must/will Y": X is a capitalised run of letters/whitespace,
    # Y runs up to the next period (or the end of the text).
    _RELATIONSHIP_PATTERN = re.compile(
        r'([A-Z][A-Za-z\s]+)(?:\s+shall|\s+must|\s+will)\s+([^\.]+)'
    )

    # "X means Y" / "X shall mean Y", with the same shape as above.
    _TERM_PATTERN = re.compile(
        r'([A-Z][A-Za-z\s]+)(?:\s+means|\s+shall\s+mean)\s+([^\.]+)'
    )

    # (category, keyword pattern) pairs in priority order; the first pattern
    # found in the lower-cased text wins. Keywords are matched as whole words
    # (optionally plural) so that e.g. "please" does not count as "lease" and
    # "facts" does not count as "act".
    _CATEGORY_PATTERNS = (
        ("Contract", re.compile(r'\b(?:contract|agreement|lease)s?\b')),
        ("Pleading", re.compile(r'\b(?:complaint|petition|motion)s?\b')),
        ("Statute", re.compile(r'\b(?:statute|act|law)s?\b')),
        ("Regulation", re.compile(r'\b(?:regulation|rule|order)s?\b')),
    )

    def __init__(self):
        """Create empty entity sets and compile the entity patterns."""
        self.parties: Set[str] = set()
        self.dates: Set[str] = set()
        self.amounts: Set[str] = set()
        self.citations: Set[str] = set()
        self.jurisdictions: Set[str] = set()
        self.courts: Set[str] = set()
        self.statutes: Set[str] = set()
        self.regulations: Set[str] = set()
        self.cases: Set[str] = set()

        self.patterns = {
            # e.g. "Party of the first part"
            'parties': re.compile(r'\b(?:Party|Parties|Lessor|Lessee|Buyer|Seller|Plaintiff|Defendant)\s+(?:of|to|in|the)\s+(?:the\s+)?(?:first|second|third|fourth|fifth)\s+(?:part|party)\b'),
            # e.g. "January 5, 2024" / "March 1st, 2020"
            'dates': re.compile(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?,\s+\d{4}\b'),
            # Either comma-grouped thousands ("$1,500.00") or plain digits
            # ("$1500.00"); without the second alternative "$1500" would be
            # truncated to "$150".
            'amounts': re.compile(r'\$(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?'),
            # "42 U.S.C. 1983", "42 U.S.C. § 1983", "85 F.R. 1234",
            # "29 CFR 1910", "29 C.F.R. § 1910". \u00a7 is the section sign.
            'citations': re.compile(r'\b\d+\s+(?:U\.S\.C\.|F\.R\.|C\.?F\.?R\.?)\s+(?:\u00a7+\s*)?\d+'),
            # The name after "of" is limited to capitalised words (with an
            # optional leading "the") so the match stops at the end of the
            # proper noun instead of swallowing the rest of the sentence.
            'jurisdictions': re.compile(r'\b(?:State|Commonwealth|District|Territory)\s+of\s+(?:the\s+)?[A-Z][A-Za-z]*(?:\s+[A-Z][A-Za-z]*)*'),
            'courts': re.compile(r'\b(?:Supreme|Appellate|District|Circuit|County|Municipal)\s+Court\b'),
            # Same capitalised-words restriction as for jurisdictions.
            'statutes': re.compile(r'\b(?:Act|Statute|Law|Code)\s+of\s+(?:the\s+)?[A-Z][A-Za-z]*(?:\s+[A-Z][A-Za-z]*)*'),
            # e.g. "Rule 12", "Regulation 5"
            'regulations': re.compile(r'\b(?:Regulation|Rule|Order)\s+\d+\b'),
            # e.g. "Smith v. Jones" (single-word party names only)
            'cases': re.compile(r'\b[A-Za-z]+\s+v\.\s+[A-Za-z]+\b')
        }

    def process_legal_document(self, text: str) -> Dict[str, Any]:
        """Process a legal document and extract domain-specific features.

        Args:
            text: Full text of the document. ``None`` is treated as empty text.

        Returns:
            A dict with four keys:
            ``"entities"`` maps each entity kind to a sorted list of the
            distinct matched strings; ``"relationships"`` is a list of
            ``{"subject", "obligation"}`` dicts; ``"terms"`` maps defined
            terms to their definitions; ``"category"`` is one of
            ``"Contract"``, ``"Pleading"``, ``"Statute"``, ``"Regulation"``
            or ``"Other"``.
        """
        if text is None:
            text = ""

        # Results of a previous call must not leak into this one.
        self._clear_extractions()
        self._extract_legal_entities(text)

        return {
            # sorted() makes the output order deterministic; iterating a set
            # of strings directly can differ from run to run.
            "entities": {
                name: sorted(getattr(self, name)) for name in self._ENTITY_TYPES
            },
            "relationships": self._extract_legal_relationships(text),
            "terms": self._extract_legal_terms(text),
            "category": self._categorize_document(text)
        }

    def _clear_extractions(self):
        """Empty every entity set in place."""
        for name in self._ENTITY_TYPES:
            getattr(self, name).clear()

    def _extract_legal_entities(self, text: str):
        """Add every match of each pattern in ``self.patterns`` to the set of the same name."""
        for entity_type, pattern in self.patterns.items():
            found = getattr(self, entity_type)
            found.update(match.group() for match in pattern.finditer(text))

    def _extract_legal_relationships(self, text: str) -> List[Dict[str, str]]:
        """Extract "X shall/must/will Y" statements from the text.

        Returns:
            One ``{"subject": X, "obligation": Y}`` dict per match, in order
            of appearance, with surrounding whitespace stripped.
        """
        return [
            {
                "subject": match.group(1).strip(),
                "obligation": match.group(2).strip()
            }
            for match in self._RELATIONSHIP_PATTERN.finditer(text)
        ]

    def _extract_legal_terms(self, text: str) -> Dict[str, str]:
        """Extract "X means Y" / "X shall mean Y" definitions from the text.

        Returns:
            Mapping of term to definition (whitespace stripped). If a term is
            defined more than once, the last definition wins.
        """
        return {
            match.group(1).strip(): match.group(2).strip()
            for match in self._TERM_PATTERN.finditer(text)
        }

    def _categorize_document(self, text: str) -> str:
        """Categorize the document by the first keyword group found in it.

        Matching is case-insensitive and on whole words (singular or plural).

        Returns:
            ``"Contract"``, ``"Pleading"``, ``"Statute"``, ``"Regulation"``,
            or ``"Other"`` when no keyword is present.
        """
        lowered = text.lower()  # lower-case once instead of once per category
        for category, pattern in self._CATEGORY_PATTERNS:
            if pattern.search(lowered):
                return category
        return "Other"
# Module-level shared instance, created once at import time.
# Each process_legal_document() call clears and refills the entity sets
# stored on this instance.
# NOTE(review): callers sharing this instance concurrently could interleave
# those sets — confirm how it is used.
legal_domain_features = LegalDomainFeatures()