Spaces:
Sleeping
Sleeping
File size: 40,834 Bytes
1099afe bdedf43 1099afe 522f7a0 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 522f7a0 1099afe bdedf43 522f7a0 1099afe bdedf43 522f7a0 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe 522f7a0 bdedf43 522f7a0 bdedf43 522f7a0 bdedf43 522f7a0 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe 522f7a0 1099afe 522f7a0 1099afe 522f7a0 1099afe bdedf43 522f7a0 bdedf43 522f7a0 bdedf43 522f7a0 1099afe 522f7a0 1099afe 522f7a0 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe bdedf43 1099afe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 |
# DEPENDENCIES
import re
import sys
import torch
import numpy as np
from typing import Any
from typing import List
from typing import Dict
from typing import Tuple
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
from sentence_transformers import util
# Import utilities
sys.path.append(str(Path(__file__).parent.parent))
from utils.logger import log_info
from utils.logger import log_error
from config.risk_rules import ContractType
from config.model_config import ModelConfig
from utils.text_processor import TextProcessor
from utils.logger import ContractAnalyzerLogger
from services.data_models import ContractCategory
class ContractClassifier:
"""
Classify contract text into one of the categories in CATEGORY_HIERARCHY

Each category receives a score blended from three sources:
1. Keyword matching against the category's keyword list
2. Sentence-embedding similarity to a per-category template sentence
3. Legal-BERT [CLS] embedding similarity to a per-category prompt

On top of the blended score the classifier can:
- pick a subcategory by counting SUBCATEGORY_PATTERNS hits
- return every category above a threshold (multi-label)
- report reasoning strings and the keywords that were found
"""
# CATEGORY HIERARCHY WITH KEYWORDS
# Per category:
#   'subcategories' : names looked up in SUBCATEGORY_PATTERNS by _detect_subcategory
#   'keywords'      : lowercase terms searched for by _score_keywords and _extract_detected_keywords
#   'weight'        : multiplier applied to the normalized keyword score in _score_keywords
CATEGORY_HIERARCHY = {'employment' : {'subcategories' : ['full_time', 'part_time', 'contract_worker', 'internship', 'executive'],
'keywords' : ['employee', 'employment', 'employer', 'job', 'position', 'staff', 'salary', 'wages', 'compensation', 'payroll', 'benefits', 'health insurance', 'retirement', 'pension', '401(k)', 'vacation', 'paid time off', 'sick leave', 'holidays', 'probation', 'performance review', 'promotion', 'termination', 'job description', 'duties', 'responsibilities', 'work hours', 'overtime', 'timekeeping', 'attendance', 'confidentiality', 'non-compete', 'non-solicitation', 'intellectual property', 'inventions', 'work product', 'severance', 'notice period', 'resignation', 'dismissal'],
'weight' : 1.1,
},
'consulting' : {'subcategories' : ['independent_contractor', 'advisory', 'professional_services', 'freelance'],
'keywords' : ['consultant', 'consulting', 'independent contractor', 'statement of work', 'deliverables', 'professional services', 'hourly rate', 'project scope', 'milestone', 'acceptance criteria', 'work product', '1099', 'self-employed', 'contractor', 'consulting services', 'expert advice', 'advisory services', 'project basis', 'task order'],
'weight' : 1.0,
},
'nda' : {'subcategories' : ['mutual_nda', 'unilateral_nda', 'confidentiality_agreement'],
'keywords' : ['non-disclosure', 'confidentiality', 'proprietary information', 'nda', 'disclosure agreement', 'trade secret', 'confidential information', 'receiving party', 'disclosing party', 'confidentiality obligation', 'non-use', 'non-circumvention', 'secrecy', 'protected information', 'confidentiality period', 'return of information'],
'weight' : 1.0,
},
'software' : {'subcategories' : ['software_license', 'saas', 'cloud_services', 'development', 'api_access'],
'keywords' : ['software', 'license', 'saas', 'subscription', 'source code', 'object code', 'api', 'cloud', 'hosting', 'maintenance', 'updates', 'support', 'uptime', 'service level', 'software as a service', 'platform', 'application', 'user license', 'perpetual license', 'subscription fee', 'end user license agreement', 'eula'],
'weight' : 1.1,
},
'service' : {'subcategories' : ['master_services', 'maintenance', 'support', 'subscription'],
'keywords' : ['service provider', 'services', 'sla', 'service level agreement', 'uptime', 'response time', 'support', 'maintenance', 'service credits', 'performance metrics', 'implementation', 'professional services', 'service description', 'service fees', 'service term', 'service delivery', 'service scope'],
'weight' : 1.0,
},
'partnership' : {'subcategories' : ['business_partnership', 'joint_venture', 'strategic_alliance'],
'keywords' : ['partnership', 'joint venture', 'equity', 'shares', 'profit sharing', 'loss allocation', 'management', 'governance', 'voting rights', 'dissolution', 'capital contribution', 'distribution', 'membership interest', 'operating agreement', 'board of directors', 'partnership agreement'],
'weight' : 1.0,
},
'lease' : {'subcategories' : ['residential_lease', 'commercial_lease', 'sublease', 'equipment_lease'],
'keywords' : ['landlord', 'tenant', 'lease', 'premises', 'rent', 'property', 'security deposit', 'utilities', 'maintenance', 'repairs', 'eviction', 'lease term', 'renewal', 'square footage', 'rental agreement', 'lessor', 'lessee', 'property management', 'common areas', 'quiet enjoyment'],
'weight' : 1.0,
},
'purchase' : {'subcategories' : ['asset_purchase', 'stock_purchase', 'goods_purchase'],
'keywords' : ['purchase', 'sale', 'buyer', 'seller', 'goods', 'products', 'delivery', 'shipment', 'payment terms', 'invoice', 'purchase price', 'quantity', 'specifications', 'purchase order', 'sales agreement', 'bill of sale', 'title transfer', 'risk of loss', 'closing date'],
'weight' : 1.0,
},
'general' : {'subcategories' : ['standard_agreement', 'basic_contract'],
'keywords' : ['agreement', 'contract', 'party', 'parties', 'terms and conditions', 'governing law', 'jurisdiction', 'dispute resolution', 'force majeure', 'notice', 'amendment', 'assignment', 'severability', 'entire agreement'],
'weight' : 0.8,
},
}
# SUBCATEGORY DETECTION PATTERNS
# Maps a subcategory name to lowercase phrases; _detect_subcategory counts how many of them occur
# as substrings of the lowercased contract text and picks the subcategory with the most hits
SUBCATEGORY_PATTERNS = {'full_time' : ['full-time', 'full time', 'permanent', 'regular employee', '40 hours', 'exempt employee', 'salary basis'],
'part_time' : ['part-time', 'part time', 'hours per week', 'non-exempt', 'hourly employee', 'temporary', 'seasonal'],
'contract_worker' : ['independent contractor', 'contract', 'fixed term', 'temporary', 'contract period', 'contract worker', 'contract employee'],
'internship' : ['intern', 'internship', 'student', 'training program', 'educational', 'college credit', 'unpaid intern'],
'executive' : ['executive', 'ceo', 'cfo', 'cto', 'president', 'vice president', 'director', 'officer', 'executive compensation', 'stock options', 'golden parachute'],
'independent_contractor' : ['independent contractor', '1099', 'contractor', 'self-employed', 'freelance', 'consultant agreement'],
'advisory' : ['advisor', 'advisory', 'counsel', 'consulting services', 'expert advice', 'advisory board', 'strategic advisory'],
'professional_services' : ['professional services', 'consulting services', 'engagement', 'service provider', 'professional firm'],
'freelance' : ['freelance', 'freelancer', 'gig', 'project-based', 'freelance work', 'gig economy'],
'mutual_nda' : ['mutual', 'both parties', 'each party', 'reciprocal', 'mutual confidentiality', 'two-way'],
'unilateral_nda' : ['one-way', 'receiving party', 'disclosing party', 'unilateral', 'single party', 'one party'],
'confidentiality_agreement' : ['confidentiality agreement', 'secrecy agreement', 'proprietary information agreement'],
'software_license' : ['software license', 'license key', 'perpetual license', 'end user license', 'software agreement'],
'saas' : ['software as a service', 'saas', 'subscription', 'cloud-based', 'web-based', 'online service'],
'cloud_services' : ['cloud services', 'cloud computing', 'infrastructure', 'iaas', 'paas', 'cloud hosting'],
'development' : ['software development', 'custom development', 'development services', 'programming', 'coding'],
'api_access' : ['api', 'application programming interface', 'api access', 'api key', 'rest api', 'graphql'],
'master_services' : ['master services agreement', 'msa', 'master agreement', 'framework agreement'],
'maintenance' : ['maintenance agreement', 'maintenance services', 'preventive maintenance', 'repair services'],
'support' : ['support agreement', 'technical support', 'customer support', 'help desk'],
'subscription' : ['subscription agreement', 'subscription service', 'recurring billing', 'subscription fee'],
'business_partnership' : ['partnership', 'general partnership', 'limited partnership', 'partnership agreement'],
'joint_venture' : ['joint venture', 'jv agreement', 'joint venture agreement', 'strategic alliance'],
'strategic_alliance' : ['strategic alliance', 'collaboration agreement', 'cooperation agreement'],
'residential_lease' : ['residential', 'apartment', 'house', 'dwelling', 'residential property', 'tenant', 'landlord', 'rental'],
'commercial_lease' : ['commercial', 'office space', 'retail space', 'commercial property', 'business premises', 'commercial tenant'],
'sublease' : ['sublease', 'sublet', 'subtenant', 'sublessee', 'sublessor'],
'equipment_lease' : ['equipment lease', 'equipment rental', 'lease equipment', 'leased property'],
'asset_purchase' : ['asset purchase', 'business assets', 'asset sale', 'purchase assets'],
'stock_purchase' : ['stock purchase', 'share purchase', 'equity purchase', 'stock sale'],
'goods_purchase' : ['goods purchase', 'product purchase', 'merchandise', 'inventory purchase'],
'standard_agreement' : ['standard agreement', 'template agreement', 'boilerplate contract'],
'basic_contract' : ['basic contract', 'simple agreement', 'standard terms'],
}
# Value classify_contract substitutes when it is called with min_confidence = None
DEFAULT_CONFIDENCE_THRESHOLD = 0.65
# Minimum combined score for a category to be returned by classify_multi_label when no threshold is passed
MULTI_LABEL_THRESHOLD = 0.55
def __init__(self, model_loader):
    """
    Set up the classifier and load its models straight away

    Arguments:
    ----------
        model_loader : Loader object that provides load_embedding_model(), load_legal_bert() and a device attribute

    Raises:
    -------
        Exception : Whatever the model loading step raises is propagated to the caller
    """
    self.model_loader = model_loader

    # Model handles start empty and are filled in by _lazy_load()
    self.embedding_model = self.legal_bert_model = self.legal_bert_tokenizer = self.device = None

    # Cache of category name -> template embedding, filled once during loading
    self.category_embeddings = {}

    # Text preprocessing helper; spaCy is switched off for classification
    self.text_processor = TextProcessor(use_spacy = False)

    # Shared logger
    self.logger = ContractAnalyzerLogger.get_logger()

    # Despite the name, loading is triggered right here in the constructor
    self._lazy_load()
def _lazy_load(self):
"""
Lazy load models on first use
"""
if self.embedding_model is None:
try:
log_info("Loading models for contract classification...")
# Load embedding model
self.embedding_model = self.model_loader.load_embedding_model()
# Load Legal-BERT
self.legal_bert_model, self.legal_bert_tokenizer = self.model_loader.load_legal_bert()
self.device = self.model_loader.device
# Prepare category embeddings
self._prepare_category_embeddings()
log_info("Contract classifier models loaded successfully")
except Exception as e:
log_error(e, context = {"component" : "ContractClassifier", "operation" : "model_loading"})
raise
def _extract_classification_context(self, full_text: str) -> str:
"""
Extract key legal sections for more accurate classification
Focuses on preamble, definitions, and core agreement sections
Arguments:
----------
full_text { str } : Full contract text
Returns:
--------
{ str } : Context-rich excerpt for classification
"""
sections = list()
# First 2000 chars (usually contains parties, effective date, preamble)
sections.append(full_text[:2000])
# WHEREAS clauses (recitals - explains purpose and background)
whereas_section = self._extract_section_between(full_text, "WHEREAS", "NOW THEREFORE")
if whereas_section:
sections.append(whereas_section)
# AGREEMENT section (core contractual terms)
agreement_section = self._extract_section_between(full_text, "AGREEMENT", "TERMS AND CONDITIONS")
if not agreement_section:
agreement_section = self._extract_section_containing(full_text, ["AGREES AS FOLLOWS", "HEREBY AGREES"])
if agreement_section:
sections.append(agreement_section)
# Key definition sections
definitions_section = self._extract_section_containing(full_text, ["DEFINITIONS", "MEANING OF TERMS"])
if definitions_section:
sections.append(definitions_section)
# Combine and clean
context = " ".join([section.strip() for section in sections if section and section.strip()])
# Fallback to original text if context extraction failed
return context if (len(context) > 500) else full_text
def _extract_section_between(self, text: str, start_marker: str, end_marker: str) -> Optional[str]:
"""
Extract text between two markers (case-insensitive)
"""
try:
pattern = re.compile(f"{re.escape(start_marker)}(.*?){re.escape(end_marker)}", re.IGNORECASE | re.DOTALL)
match = pattern.search(text)
return match.group(1).strip() if match else None
except Exception:
return None
def _extract_section_containing(self, text: str, markers: List[str]) -> Optional[str]:
"""
Extract section containing any of the markers
"""
for marker in markers:
if marker.lower() in text.lower():
# Extract 500 chars around the marker
idx = text.lower().find(marker.lower())
start = max(0, idx - 250)
end = min(len(text), idx + len(marker) + 250)
return text[start:end]
return None
def _prepare_category_embeddings(self):
    """
    Encode one descriptive template sentence per category and store the result

    Each template is encoded with self.embedding_model and saved in self.category_embeddings under
    its category name, so later similarity checks do not need to re-encode the templates.
    """
    log_info("Preparing category embeddings...")

    # One descriptive sentence per category
    templates = {
        'employment': "Employment agreement between employer and employee covering salary benefits job duties work hours vacation sick leave performance reviews termination conditions confidentiality and intellectual property rights",
        'consulting': "Consulting services agreement with independent contractor statement of work deliverables hourly rate project scope milestones acceptance criteria work product ownership and payment terms for professional services",
        'nda': "Non-disclosure agreement protecting confidential information trade secrets proprietary data between parties with confidentiality obligations non-use provisions and return of information requirements",
        'software': "Software license agreement or SaaS subscription for technology services including source code access updates maintenance support service level agreements uptime guarantees and API access",
        'service': "Service level agreement for professional services maintenance support with performance metrics service credits response times uptime guarantees and implementation requirements",
        'partnership': "Business partnership joint venture agreement covering equity shares profit distribution management governance voting rights dissolution terms and capital contributions",
        'lease': "Real estate lease agreement for property rental covering premises description rent payments security deposits maintenance responsibilities utilities and eviction terms",
        'purchase': "Sales purchase agreement for goods products with buyer seller terms covering delivery shipment payment terms invoices purchase price quantity specifications and title transfer",
        'general': "General contract agreement with standard terms and conditions governing law jurisdiction dispute resolution force majeure notice provisions and general legal framework"
    }

    for name in templates:
        self.category_embeddings[name] = self.embedding_model.encode(templates[name], convert_to_tensor = True)

    log_info(f"Prepared embeddings for {len(self.category_embeddings)} categories")
# MAIN CLASSIFICATION METHOD
@ContractAnalyzerLogger.log_execution_time("classify_contract")
def classify_contract(self, contract_text: str, min_confidence: float = 0.50) -> ContractCategory:
    """
    Classify a contract into its best-matching category

    Steps:
        1. Extract a context excerpt from the contract
        2. Score keywords on the full text
        3. Score embedding similarity and Legal-BERT similarity on the excerpt
        4. Blend the three scores and take the highest-scoring category
        5. Detect a subcategory, build reasoning, collect detected keywords and the top 3 alternatives

    Arguments:
    ----------
        contract_text  { str }   : Full contract text (at least 100 characters)
        min_confidence { float } : Minimum confidence threshold; None is replaced by DEFAULT_CONFIDENCE_THRESHOLD

    Returns:
    --------
        { ContractCategory } : Classification result for the highest-scoring category

    Raises:
    -------
        ValueError : When contract_text is empty or shorter than 100 characters
    """
    # Refuse missing or very short input before any model work
    has_enough_text = bool(contract_text) and (len(contract_text) >= 100)
    if not has_enough_text:
        raise ValueError("Contract text too short for classification")

    # NOTE(review): min_confidence is resolved here but not consulted anywhere below
    min_confidence = self.DEFAULT_CONFIDENCE_THRESHOLD if min_confidence is None else min_confidence

    # Context excerpt used by the two embedding-based scorers
    excerpt = self._extract_classification_context(full_text = contract_text)

    log_info("Starting contract classification",
             text_length = len(contract_text),
             excerpt_length = len(excerpt),
             )

    # Three independent score sources
    by_keyword = self._score_keywords(text_lower = contract_text.lower())
    by_embedding = self._semantic_similarity(text = excerpt)
    by_legal_bert = self._legal_bert_similarity(text = excerpt)

    # Weighted blend per category
    blended = self._combine_scores(keyword_scores = by_keyword,
                                   semantic_scores = by_embedding,
                                   legal_bert_scores = by_legal_bert,
                                   )

    # Nothing scored at all: fall back to the generic category
    if not blended:
        log_info("No categories detected, defaulting to 'general'")

        return ContractCategory(category = "general",
                                subcategory = None,
                                confidence = 0.5,
                                reasoning = ["Unable to determine specific contract type"],
                                detected_keywords = [],
                                )

    # Winner and its score
    top_category = max(blended, key = blended.get)
    top_score = blended[top_category]

    # Finer-grained label inside the winning category
    top_subcategory = self._detect_subcategory(text = contract_text,
                                               primary_category = top_category,
                                               )

    # Human-readable explanation
    explanation = self._generate_reasoning(contract_text = contract_text,
                                           primary_category = top_category,
                                           subcategory = top_subcategory,
                                           keyword_scores = by_keyword,
                                           semantic_scores = by_embedding,
                                           legal_bert_scores = by_legal_bert,
                                           combined_scores = blended,
                                           )

    # Keywords of the winning category that occur in the text
    found_keywords = self._extract_detected_keywords(contract_text, top_category)

    # Best three of the remaining categories, highest score first
    runners_up = [(name, value) for name, value in blended.items() if name != top_category]
    runners_up.sort(key = lambda pair: pair[1], reverse = True)

    classification = ContractCategory(category = top_category,
                                      subcategory = top_subcategory,
                                      confidence = top_score,
                                      reasoning = explanation,
                                      detected_keywords = found_keywords,
                                      alternative_categories = runners_up[:3],
                                      )

    log_info("Contract classified successfully",
             category = top_category,
             subcategory = top_subcategory,
             confidence = top_score,
             )

    return classification
def _score_keywords(self, text_lower: str) -> Dict[str, float]:
"""
Score each category based on keyword presence
Arguments:
----------
text_lower { str } : Lowercase contract text
Returns:
--------
{ dict } : Dictionary of {category: score}
"""
scores = dict()
for category, config in self.CATEGORY_HIERARCHY.items():
keywords = config['keywords']
weight = config['weight']
# Count keyword matches with partial matching for multi-word terms
keyword_count = 0
for keyword in keywords:
# Check for exact match or partial match for multi-word terms
if ' ' in keyword:
# For multi-word terms, check if all words appear in text
words = keyword.split()
if all(word in text_lower for word in words):
keyword_count += 1
else:
# For single words, exact word boundary match
if re.search(rf'\b{re.escape(keyword)}\b', text_lower):
keyword_count += 1
# Normalize by number of keywords and apply weight
normalized_score = (keyword_count / len(keywords)) * weight
# Cap at 1.0
scores[category] = min(normalized_score, 1.0)
return scores
def _semantic_similarity(self, text: str) -> Dict[str, float]:
"""
Calculate semantic similarity to category templates using embeddings
Arguments:
----------
text { str } : Contract text excerpt
Returns:
--------
{ dict } : Dictionary of {category: similarity_score}
"""
# Encode contract text
text_embedding = self.embedding_model.encode(text, convert_to_tensor = True)
# Calculate similarity to each category
similarities = dict()
for category, cat_embedding in self.category_embeddings.items():
similarity = util.cos_sim(text_embedding, cat_embedding)[0][0].item()
similarities[category] = similarity
return similarities
def _legal_bert_similarity(self, text: str) -> Dict[str, float]:
"""
Use Legal-BERT for semantic similarity calculation
Arguments:
----------
text { str } : Contract text excerpt
Returns:
--------
{ dict } : Dictionary of {category: similarity_score} using Legal-BERT embeddings
"""
# Get Legal-BERT embedding for the text
text_embedding = self._get_legal_bert_embedding(text)
# Calculate similarity to each category's Legal-BERT embedding
similarities = dict()
for category in self.CATEGORY_HIERARCHY.keys():
# Get pre-computed category embedding
cat_embedding = self._get_legal_bert_embedding(f"This is a {category.replace('_', ' ')} contract agreement")
# Calculate cosine similarity
similarity = torch.nn.functional.cosine_similarity(torch.tensor(text_embedding).unsqueeze(0), torch.tensor(cat_embedding).unsqueeze(0)).item()
similarities[category] = similarity
return similarities
def _get_legal_bert_embedding(self, text: str) -> np.ndarray:
"""
Get Legal-BERT embedding for text using [CLS] token
Arguments:
----------
text { str } : Input text
Returns:
--------
{ np.ndarray } : Embedding vector
"""
# Tokenize
inputs = self.legal_bert_tokenizer(text,
return_tensors = "pt",
padding = True,
truncation = True,
max_length = 512,
).to(self.device)
# Get embeddings
with torch.no_grad():
outputs = self.legal_bert_model(**inputs)
# Use [CLS] token embedding (first token)
cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
return cls_embedding
def _combine_scores(self, keyword_scores: Dict[str, float], semantic_scores: Dict[str, float], legal_bert_scores: Dict[str, float] = None) -> Dict[str, float]:
"""
Combine scores from different methods (weighted average)
Arguments:
----------
keyword_scores { dict } : Keyword-based scores
semantic_scores { dict } : Semantic similarity scores
legal_bert_scores { dict } : Legal-BERT similarity scores (optional)
Returns:
--------
{ dict } : Combined scores dictionary
"""
combined = dict()
# Weights for each method
keyword_weight = 0.35
semantic_weight = 0.35
legal_bert_weight = 0.30
for category in self.CATEGORY_HIERARCHY.keys():
score = (keyword_scores.get(category, 0) * keyword_weight +
semantic_scores.get(category, 0) * semantic_weight +
legal_bert_scores.get(category, 0) * legal_bert_weight
)
combined[category] = score
return combined
def _detect_subcategory(self, text: str, primary_category: str) -> Optional[str]:
"""
Detect specific subcategory within primary category
Arguments:
----------
text { str } : Full contract text
primary_category { str } : Detected primary category
Returns:
--------
{ str } : Subcategory name or None
"""
text_lower = text.lower()
# Get subcategories for this category
subcategories = self.CATEGORY_HIERARCHY[primary_category]['subcategories']
# Score each subcategory
subcat_scores = dict()
for subcat in subcategories:
if subcat in self.SUBCATEGORY_PATTERNS:
patterns = self.SUBCATEGORY_PATTERNS[subcat]
score = sum(1 for pattern in patterns if pattern in text_lower)
subcat_scores[subcat] = score
# Return best match if any
if (subcat_scores and (max(subcat_scores.values()) > 0)):
best_subcat = max(subcat_scores, key = subcat_scores.get)
log_info(f"Detected subcategory: {best_subcat}",
category = primary_category,
score = subcat_scores[best_subcat],
)
return best_subcat
return None
def _generate_reasoning(self, contract_text: str, primary_category: str, subcategory: Optional[str], keyword_scores: Dict[str, float],
semantic_scores: Dict[str, float], legal_bert_scores: Dict[str, float], combined_scores: Dict[str, float]) -> List[str]:
"""
Generate human-readable reasoning for classification
Returns:
--------
{ list } : List of reasoning statements
"""
reasoning = list()
# Primary category reasoning
keyword_match = keyword_scores.get(primary_category, 0)
semantic_match = semantic_scores.get(primary_category, 0)
legal_bert_match = legal_bert_scores.get(primary_category, 0)
# Keyword-based reasoning
if (keyword_match > 0.6):
reasoning.append(f"Strong keyword indicators for {primary_category.replace('_', ' ')} category ({int(keyword_match * 100)}% keyword match)")
elif (keyword_match > 0.3):
reasoning.append(f"Moderate keyword presence for {primary_category.replace('_', ' ')} ({int(keyword_match * 100)}% keyword match)")
elif (keyword_match > 0.1):
reasoning.append(f"Limited keyword indicators for {primary_category.replace('_', ' ')} ({int(keyword_match * 100)}% keyword match)")
# Semantic similarity reasoning
if (semantic_match > 0.70):
reasoning.append(f"High semantic similarity to {primary_category.replace('_', ' ')} agreements (similarity: {semantic_match:.2f})")
elif (semantic_match > 0.55):
reasoning.append(f"Moderate semantic similarity to {primary_category.replace('_', ' ')} contracts (similarity: {semantic_match:.2f})"
)
# Legal-BERT reasoning
if (legal_bert_match > 0.65):
reasoning.append(f"Legal-BERT analysis strongly supports {primary_category.replace('_', ' ')} classification (similarity: {legal_bert_match:.2f})"
)
elif (legal_bert_match > 0.50):
reasoning.append(f"Legal-BERT analysis moderately supports {primary_category.replace('_', ' ')} classification (similarity: {legal_bert_match:.2f})"
)
# Subcategory reasoning
if subcategory:
reasoning.append(f"Specific subcategory identified: {subcategory.replace('_', ' ')}")
# Alternative categories (if close)
sorted_scores = sorted(combined_scores.items(), key = lambda x: x[1], reverse = True)
if ((len(sorted_scores) > 1) and (sorted_scores[1][1] > 0.30)):
alt_category, alt_score = sorted_scores[1]
reasoning.append(f"Also contains elements of {alt_category.replace('_', ' ')} (secondary match: {alt_score:.2f})")
# If no strong reasoning
if not reasoning:
reasoning.append("Classification based on general contract structure and terminology")
return reasoning
def _extract_detected_keywords(self, text: str, category: str) -> List[str]:
"""
Extract which specific keywords were found
Arguments:
----------
text { str } : Contract text
category { str } : Detected category
Returns:
--------
{ list } : List of detected keywords
"""
text_lower = text.lower()
keywords = self.CATEGORY_HIERARCHY[category]['keywords']
detected = [kw for kw in keywords if kw in text_lower]
# Return all detected keywords
return detected
@ContractAnalyzerLogger.log_execution_time("classify_multi_label")
def classify_multi_label(self, text: str, threshold: float = None) -> List[ContractCategory]:
    """
    Return every category whose combined score reaches the threshold

    All three scorers run on the full text passed in. When no category reaches the threshold, the
    single-label result of classify_contract is returned instead.

    Arguments:
    ----------
        text      { str }   : Contract text
        threshold { float } : Minimum combined score; None is replaced by MULTI_LABEL_THRESHOLD

    Returns:
    --------
        { list } : List of ContractCategory objects, highest confidence first
    """
    cutoff = self.MULTI_LABEL_THRESHOLD if threshold is None else threshold

    log_info("Starting multi-label classification", threshold = cutoff)

    # Score sources and their weighted blend
    by_keyword = self._score_keywords(text_lower = text.lower())
    by_embedding = self._semantic_similarity(text = text)
    by_legal_bert = self._legal_bert_similarity(text = text)

    blended = self._combine_scores(keyword_scores = by_keyword,
                                   semantic_scores = by_embedding,
                                   legal_bert_scores = by_legal_bert,
                                   )

    # One result object per category that clears the cutoff
    labelled = []

    for name, value in blended.items():
        if not (value >= cutoff):
            continue

        found_subcategory = self._detect_subcategory(text = text,
                                                     primary_category = name,
                                                     )

        explanation = self._generate_reasoning(contract_text = text,
                                               primary_category = name,
                                               subcategory = found_subcategory,
                                               keyword_scores = by_keyword,
                                               semantic_scores = by_embedding,
                                               legal_bert_scores = by_legal_bert,
                                               combined_scores = blended,
                                               )

        found_keywords = self._extract_detected_keywords(text = text,
                                                         category = name,
                                                         )

        labelled.append(ContractCategory(category = name,
                                         subcategory = found_subcategory,
                                         confidence = value,
                                         reasoning = explanation,
                                         detected_keywords = found_keywords,
                                         )
                        )

    # Highest confidence first
    labelled = sorted(labelled, key = lambda entry: entry.confidence, reverse = True)

    log_info(f"Multi-label classification found {len(labelled)} categories")

    if labelled:
        return labelled

    # Nothing cleared the cutoff: fall back to the single best category
    return [self.classify_contract(text)]
def get_category_description(self, category: str) -> str:
    """
    Look up the one-line description of a category

    Arguments:
    ----------
        category { str } : Category name

    Returns:
    --------
        { str } : Description of the category, or a generic description for an unknown name
    """
    known_descriptions = {
        'employment'  : 'Employment agreements governing employer-employee relationships',
        'consulting'  : 'Consulting and independent contractor agreements',
        'nda'         : 'Non-disclosure and confidentiality agreements',
        'software'    : 'Software licensing and technology service agreements',
        'service'     : 'Professional service and maintenance agreements',
        'partnership' : 'Partnership, joint venture, and corporate agreements',
        'lease'       : 'Property lease, rental, and equipment lease agreements',
        'purchase'    : 'Sales, purchase, and goods transfer agreements',
        'general'     : 'General contract agreements with standard terms and conditions',
    }

    if category in known_descriptions:
        return known_descriptions[category]

    return 'General contract agreement'
def get_all_categories(self) -> List[str]:
    """
    Return the names of all categories defined in CATEGORY_HIERARCHY

    Returns:
    --------
        { list } : New list of category names, in definition order
    """
    return [name for name in self.CATEGORY_HIERARCHY.keys()]
def get_subcategories(self, category: str) -> List[str]:
    """
    Get the subcategories of a specific category

    Arguments:
    ----------
        category { str } : Category name

    Returns:
    --------
        { list } : New list of subcategory names; empty for an unknown category
    """
    # Return a copy so callers cannot modify the shared class-level CATEGORY_HIERARCHY by accident
    return list(self.CATEGORY_HIERARCHY.get(category, {}).get('subcategories', []))