File size: 5,296 Bytes
8397f09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import re
from typing import Dict, List, Set, Any

class LegalDomainFeatures:
    """Regex-based feature extractor for legal text.

    ``process_legal_document`` is the entry point. It fills one set per
    entity type on the instance (``parties``, ``dates``, ``amounts``,
    ``citations``, ``jurisdictions``, ``courts``, ``statutes``,
    ``regulations``, ``cases``) and returns those entities together with
    obligation-style relationships, defined terms and a coarse category.

    The entity sets are instance state that is cleared at the start of every
    ``process_legal_document`` call, so an instance only ever holds the
    results of the most recent document.
    """

    # Entity types in the order they appear in the result dictionary. Each
    # name is both an attribute on the instance and a key of ``patterns``.
    _ENTITY_TYPES = (
        'parties', 'dates', 'amounts', 'citations', 'jurisdictions',
        'courts', 'statutes', 'regulations', 'cases',
    )

    # Compiled once for the class instead of once per instance.
    _ENTITY_PATTERNS = {
        'parties': re.compile(r'\b(?:Party|Parties|Lessor|Lessee|Buyer|Seller|Plaintiff|Defendant)\s+(?:of|to|in|the)\s+(?:the\s+)?(?:first|second|third|fourth|fifth)\s+(?:part|party)\b'),
        'dates': re.compile(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?,\s+\d{4}\b'),
        # Either comma-grouped thousands or a plain digit run; without the
        # second alternative "$1500.00" would be truncated to "$150".
        'amounts': re.compile(r'\$(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?'),
        # \u00a7 is the section sign; it is optional so that both
        # "42 U.S.C. 1983" and "42 U.S.C. <section sign> 1983" match.
        # "CFR" is accepted with or without dots.
        'citations': re.compile(
            r'\b\d+\s+U\.S\.C\.\s+(?:\u00a7+\s*)?\d+'
            r'|\b\d+\s+F\.R\.\s+\d+'
            r'|\b\d+\s+C\.?F\.?R\.?\s+(?:\u00a7+\s*)?\d+'
        ),
        # The name is limited to consecutive capitalized words on one line so
        # the match stops at the end of the place name instead of swallowing
        # the rest of the clause.
        'jurisdictions': re.compile(
            r'\b(?:State|Commonwealth|District|Territory)\s+of\s+(?:the\s+)?'
            r'[A-Z][A-Za-z]*(?:[ \t]+[A-Z][A-Za-z]*)*'
        ),
        'courts': re.compile(r'\b(?:Supreme|Appellate|District|Circuit|County|Municipal)\s+Court\b'),
        # Accepts a four-digit year ("Act of 1933") or a capitalized name
        # ("Code of Federal Regulations"); lowercase words end the match.
        'statutes': re.compile(
            r'\b(?:Act|Statute|Law|Code)\s+of\s+'
            r'(?:\d{4}\b|[A-Z][A-Za-z]*(?:[ \t]+[A-Z][A-Za-z]*)*)'
        ),
        'regulations': re.compile(r'\b(?:Regulation|Rule|Order)\s+\d+\b'),
        'cases': re.compile(r'\b[A-Za-z]+\s+v\.\s+[A-Za-z]+\b'),
    }

    # "<Subject> shall|must|will <obligation>", obligation running up to the
    # next period. Hoisted so it is not rebuilt on every call.
    _RELATIONSHIP_PATTERN = re.compile(
        r'([A-Z][A-Za-z\s]+)(?:\s+shall|\s+must|\s+will)\s+([^\.]+)'
    )

    # "<Term> means <definition>" / "<Term> shall mean <definition>".
    _TERM_PATTERN = re.compile(
        r'([A-Z][A-Za-z\s]+)(?:\s+means|\s+shall\s+mean)\s+([^\.]+)'
    )

    # Checked in order; the first category with a keyword hit wins. Keywords
    # are matched as whole words (optionally plural) so that "please" or
    # "release" no longer count as "lease", nor "action" as "act".
    _CATEGORY_PATTERNS = (
        ("Contract", re.compile(r'\b(?:contract|agreement|lease)s?\b', re.IGNORECASE)),
        ("Pleading", re.compile(r'\b(?:complaint|petition|motion)s?\b', re.IGNORECASE)),
        ("Statute", re.compile(r'\b(?:statute|act|law)s?\b', re.IGNORECASE)),
        ("Regulation", re.compile(r'\b(?:regulation|rule|order)s?\b', re.IGNORECASE)),
    )

    def __init__(self):
        # One result set per entity type; refilled by process_legal_document.
        self.parties: Set[str] = set()
        self.dates: Set[str] = set()
        self.amounts: Set[str] = set()
        self.citations: Set[str] = set()
        self.jurisdictions: Set[str] = set()
        self.courts: Set[str] = set()
        self.statutes: Set[str] = set()
        self.regulations: Set[str] = set()
        self.cases: Set[str] = set()

        # Per-instance copy of the mapping so that replacing an entry on one
        # instance does not affect the class-level table.
        self.patterns = dict(self._ENTITY_PATTERNS)

    def process_legal_document(self, text: str) -> Dict[str, Any]:
        """Process a legal document and extract domain-specific features.

        Args:
            text: Full text of the document.

        Returns:
            A dictionary with four keys:

            * ``"entities"``: maps each entity type to a sorted list of the
              distinct matched strings (sorted so the output is the same on
              every run).
            * ``"relationships"``: list of ``{"subject", "obligation"}``
              dictionaries in document order.
            * ``"terms"``: maps each defined term to its definition.
            * ``"category"``: one of ``"Contract"``, ``"Pleading"``,
              ``"Statute"``, ``"Regulation"`` or ``"Other"``.
        """
        # Drop whatever the previous document left on the instance.
        self._clear_extractions()
        self._extract_legal_entities(text)

        return {
            "entities": {
                name: sorted(getattr(self, name)) for name in self._ENTITY_TYPES
            },
            "relationships": self._extract_legal_relationships(text),
            "terms": self._extract_legal_terms(text),
            "category": self._categorize_document(text),
        }

    def _clear_extractions(self):
        """Clear all extracted entities."""
        for name in self._ENTITY_TYPES:
            getattr(self, name).clear()

    def _extract_legal_entities(self, text: str):
        """Add every match of each pattern in ``self.patterns`` to the set of the same name."""
        for entity_type, pattern in self.patterns.items():
            getattr(self, entity_type).update(
                match.group() for match in pattern.finditer(text)
            )

    def _extract_legal_relationships(self, text: str) -> List[Dict[str, str]]:
        """Extract "X shall/must/will Y" statements from the text.

        Returns:
            One ``{"subject": ..., "obligation": ...}`` dictionary per match,
            in document order, with surrounding whitespace stripped. The
            obligation ends at the next period. Because "shall" is one of the
            trigger words, a definition phrased "X shall mean Y" is reported
            here as well.
        """
        return [
            {
                "subject": match.group(1).strip(),
                "obligation": match.group(2).strip(),
            }
            for match in self._RELATIONSHIP_PATTERN.finditer(text)
        ]

    def _extract_legal_terms(self, text: str) -> Dict[str, str]:
        """Extract "X means Y" / "X shall mean Y" definitions.

        Returns:
            Mapping of term to definition, both stripped. When a term is
            defined more than once, the last definition in the text wins.
        """
        return {
            match.group(1).strip(): match.group(2).strip()
            for match in self._TERM_PATTERN.finditer(text)
        }

    def _categorize_document(self, text: str) -> str:
        """Categorize the document from whole-word keyword hits.

        Categories are tested in the order Contract, Pleading, Statute,
        Regulation; the first one with a matching keyword is returned.
        Matching is case-insensitive.

        Returns:
            The category label, or ``"Other"`` when no keyword is present.
        """
        for category, pattern in self._CATEGORY_PATTERNS:
            if pattern.search(text):
                return category
        return "Other"

# Module-level shared instance, created when this module is imported.
# NOTE: process_legal_document() clears and refills the entity sets stored on
# the instance at each call, so this object only ever reflects the most recent
# document; callers sharing it should not rely on state from earlier calls.
legal_domain_features = LegalDomainFeatures()