Spaces:

TheGod-2003
/

legal-doc-backend

Runtime error

legal-doc-backend / backend /app /utils /legal_domain_features.py

Harsh Upadhyay

adding backend to spaces with initial commit.

8397f09 about 2 months ago

5.3 kB

	import re
	from typing import Dict, List, Set, Any

	class LegalDomainFeatures:
	    """Regex-based extractor of legal-domain features from plain text.

	    One call to :meth:`process_legal_document` extracts entities (parties,
	    dates, amounts, citations, ...), obligation relationships ("X shall Y"),
	    defined terms ("X means Y") and a coarse document category.

	    The most recent extraction is also kept on the instance in the public
	    set attributes (``parties``, ``dates``, ...); these are cleared at the
	    start of every ``process_legal_document`` call.
	    """

	    # Names of the per-document entity sets, in the order they are reported.
	    # Each name is both an instance attribute and a key of ``self.patterns``.
	    _ENTITY_TYPES = (
	        'parties', 'dates', 'amounts', 'citations', 'jurisdictions',
	        'courts', 'statutes', 'regulations', 'cases',
	    )

	    # "X shall Y" / "X must Y" / "X will Y": a capitalised subject, a modal
	    # verb, then everything up to the next period.
	    _RELATIONSHIP_PATTERN = re.compile(
	        r'([A-Z][A-Za-z\s]+)(?:\s+shall|\s+must|\s+will)\s+([^\.]+)'
	    )

	    # "X means Y" / "X shall mean Y": a capitalised term, then its definition
	    # up to the next period.
	    _TERM_PATTERN = re.compile(
	        r'([A-Z][A-Za-z\s]+)(?:\s+means|\s+shall\s+mean)\s+([^\.]+)'
	    )

	    # Ordered (category, keywords) pairs; the first category with any keyword
	    # present in the lower-cased text wins.
	    _CATEGORY_KEYWORDS = (
	        ("Contract", ('contract', 'agreement', 'lease')),
	        ("Pleading", ('complaint', 'petition', 'motion')),
	        ("Statute", ('statute', 'act', 'law')),
	        ("Regulation", ('regulation', 'rule', 'order')),
	    )

	    def __init__(self):
	        """Create empty entity sets and compile the entity regex patterns."""
	        # Per-document result sets, one per entity type.
	        self.parties: Set[str] = set()
	        self.dates: Set[str] = set()
	        self.amounts: Set[str] = set()
	        self.citations: Set[str] = set()
	        self.jurisdictions: Set[str] = set()
	        self.courts: Set[str] = set()
	        self.statutes: Set[str] = set()
	        self.regulations: Set[str] = set()
	        self.cases: Set[str] = set()

	        # Alternations use a bare ``|``; an escaped ``\|`` would match a
	        # literal pipe character and the patterns would never fire.
	        self.patterns = {
	            'parties': re.compile(
	                r'\b(?:Party|Parties|Lessor|Lessee|Buyer|Seller|Plaintiff|Defendant)'
	                r'\s+(?:of|to|in|the)\s+(?:the\s+)?'
	                r'(?:first|second|third|fourth|fifth)\s+(?:part|party)\b'
	            ),
	            'dates': re.compile(
	                r'\b(?:January|February|March|April|May|June|July|August'
	                r'|September|October|November|December)'
	                r'\s+\d{1,2}(?:st|nd|rd|th)?,\s+\d{4}\b'
	            ),
	            # Either comma-grouped thousands ("$1,500") or a plain run of
	            # digits ("$1500"), so un-grouped amounts are not cut off after
	            # three digits; optional two-digit cents.
	            'amounts': re.compile(r'\$(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?'),
	            # U.S.C. / F.R. / CFR citations; an optional section sign
	            # (U+00A7, single or doubled) may precede the section number.
	            'citations': re.compile(
	                r'\b\d+\s+U\.S\.C\.\s+(?:\u00a7{1,2}\s*)?\d+'
	                r'|\b\d+\s+F\.R\.\s+\d+'
	                r'|\b\d+\s+CFR\s+(?:\u00a7{1,2}\s*)?\d+'
	            ),
	            'jurisdictions': re.compile(
	                r'\b(?:State|Commonwealth|District|Territory)\s+of\s+[A-Za-z\s]+'
	            ),
	            'courts': re.compile(
	                r'\b(?:Supreme|Appellate|District|Circuit|County|Municipal)\s+Court\b'
	            ),
	            'statutes': re.compile(r'\b(?:Act|Statute|Law|Code)\s+of\s+[A-Za-z\s]+\b'),
	            'regulations': re.compile(r'\b(?:Regulation|Rule|Order)\s+\d+\b'),
	            'cases': re.compile(r'\b[A-Za-z]+\s+v\.\s+[A-Za-z]+\b'),
	        }

	    def process_legal_document(self, text: str) -> Dict[str, Any]:
	        """Process a legal document and extract domain-specific features.

	        Args:
	            text: Plain text of the document. ``None`` or an empty string is
	                treated as an empty document.

	        Returns:
	            A dict with four keys: ``"entities"`` (entity type -> sorted list
	            of matched strings), ``"relationships"`` (list of
	            ``{"subject", "obligation"}`` dicts), ``"terms"`` (term ->
	            definition) and ``"category"`` (one of ``"Contract"``,
	            ``"Pleading"``, ``"Statute"``, ``"Regulation"``, ``"Other"``).
	        """
	        # Treat a missing document as empty instead of failing inside re.
	        text = text or ""

	        # Drop whatever a previous call left on the instance.
	        self._clear_extractions()
	        self._extract_legal_entities(text)

	        relationships = self._extract_legal_relationships(text)
	        terms = self._extract_legal_terms(text)
	        category = self._categorize_document(text)

	        return {
	            # Sorted so the output does not depend on set iteration order.
	            "entities": {
	                name: sorted(getattr(self, name)) for name in self._ENTITY_TYPES
	            },
	            "relationships": relationships,
	            "terms": terms,
	            "category": category,
	        }

	    def _clear_extractions(self):
	        """Clear all extracted entities."""
	        for name in self._ENTITY_TYPES:
	            getattr(self, name).clear()

	    def _extract_legal_entities(self, text: str):
	        """Add every match of each entity pattern to the set of the same name."""
	        for entity_type, pattern in self.patterns.items():
	            # Pattern keys double as the names of the result-set attributes.
	            found = getattr(self, entity_type)
	            found.update(match.group() for match in pattern.finditer(text))

	    def _extract_legal_relationships(self, text: str) -> List[Dict[str, str]]:
	        """Return "X shall/must/will Y" statements as subject/obligation dicts.

	        Matches are returned in document order; both parts are stripped of
	        surrounding whitespace.
	        """
	        return [
	            {
	                "subject": match.group(1).strip(),
	                "obligation": match.group(2).strip(),
	            }
	            for match in self._RELATIONSHIP_PATTERN.finditer(text)
	        ]

	    def _extract_legal_terms(self, text: str) -> Dict[str, str]:
	        """Return defined terms ("X means Y" / "X shall mean Y") as term -> definition.

	        If a term is defined more than once, the last definition wins.
	        """
	        return {
	            match.group(1).strip(): match.group(2).strip()
	            for match in self._TERM_PATTERN.finditer(text)
	        }

	    def _categorize_document(self, text: str) -> str:
	        """Categorize the document by the first keyword group found in it.

	        Matching is a case-insensitive substring test, so a keyword also
	        matches inside longer words (e.g. 'act' inside 'action').
	        """
	        lowered = text.lower()  # computed once, not once per keyword
	        for category, keywords in self._CATEGORY_KEYWORDS:
	            if any(word in lowered for word in keywords):
	                return category
	        return "Other"

	# Module-level shared instance, created once at import time so callers can
	# use `legal_domain_features` directly instead of building their own object.
	# NOTE(review): each process_legal_document() call clears and refills the
	# entity sets stored on this one instance, so overlapping calls from several
	# threads could mix results — confirm callers are serialized.
	legal_domain_features = LegalDomainFeatures()