File size: 40,834 Bytes
1099afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdedf43
 
1099afe
 
522f7a0
1099afe
 
 
 
bdedf43
1099afe
 
 
 
 
bdedf43
1099afe
bdedf43
522f7a0
1099afe
 
bdedf43
522f7a0
1099afe
 
bdedf43
522f7a0
1099afe
bdedf43
 
1099afe
 
bdedf43
 
1099afe
 
bdedf43
 
1099afe
 
bdedf43
 
1099afe
 
bdedf43
 
1099afe
 
bdedf43
 
 
1099afe
 
 
 
bdedf43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1099afe
bdedf43
 
 
1099afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdedf43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1099afe
 
 
 
 
 
bdedf43
 
 
 
 
 
 
 
 
 
 
 
 
 
1099afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdedf43
1099afe
 
bdedf43
 
 
 
 
 
1099afe
 
 
 
 
 
522f7a0
bdedf43
522f7a0
bdedf43
 
522f7a0
 
bdedf43
522f7a0
 
1099afe
 
 
 
 
bdedf43
1099afe
 
 
 
 
 
 
 
 
 
 
 
bdedf43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1099afe
 
bdedf43
1099afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522f7a0
1099afe
 
 
522f7a0
1099afe
 
 
 
 
 
522f7a0
 
1099afe
bdedf43
522f7a0
 
bdedf43
 
 
 
 
 
 
 
 
522f7a0
bdedf43
 
522f7a0
1099afe
 
 
522f7a0
 
1099afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522f7a0
 
 
1099afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdedf43
 
1099afe
 
 
 
bdedf43
 
 
1099afe
bdedf43
 
 
1099afe
bdedf43
1099afe
 
 
bdedf43
 
 
 
 
 
 
1099afe
 
 
 
 
 
 
 
 
bdedf43
1099afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdedf43
 
1099afe
 
 
bdedf43
1099afe
 
 
 
 
 
 
 
 
 
 
 
 
bdedf43
 
 
 
1099afe
 
 
bdedf43
 
 
 
 
 
 
1099afe
 
 
 
 
 
bdedf43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1099afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdedf43
 
 
 
 
 
 
 
 
1099afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
# DEPENDENCIES
import re
import sys
import torch
import numpy as np
from typing import Any
from typing import List
from typing import Dict
from typing import Tuple
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
from sentence_transformers import util

# Import utilities
sys.path.append(str(Path(__file__).parent.parent))

from utils.logger import log_info
from utils.logger import log_error
from config.risk_rules import ContractType
from config.model_config import ModelConfig
from utils.text_processor import TextProcessor
from utils.logger import ContractAnalyzerLogger
from services.data_models import ContractCategory


class ContractClassifier:
    """
    Contract categorization using:
    1. Legal-BERT embeddings + semantic similarity
    2. Multi-label classification (a contract can be multiple types)
    3. Hierarchical categories (Employment -> Full-Time/Contract/Internship)
    4. Confidence scoring with explanations
    """
    # CATEGORY HIERARCHY WITH KEYWORDS - UPDATED TO MATCH YOUR CATEGORIES
    CATEGORY_HIERARCHY   = {'employment'            : {'subcategories' : ['full_time', 'part_time', 'contract_worker', 'internship', 'executive'],
                                                       'keywords'      : ['employee', 'employment', 'employer', 'job', 'position', 'staff', 'salary', 'wages', 'compensation', 'payroll', 'benefits', 'health insurance', 'retirement', 'pension', '401(k)', 'vacation', 'paid time off', 'sick leave', 'holidays', 'probation', 'performance review', 'promotion', 'termination', 'job description', 'duties', 'responsibilities', 'work hours', 'overtime', 'timekeeping', 'attendance', 'confidentiality', 'non-compete', 'non-solicitation', 'intellectual property', 'inventions', 'work product', 'severance', 'notice period', 'resignation', 'dismissal'],
                                                       'weight'        : 1.1,
                                                      },
                            'consulting'            : {'subcategories' : ['independent_contractor', 'advisory', 'professional_services', 'freelance'],
                                                       'keywords'      : ['consultant', 'consulting', 'independent contractor', 'statement of work', 'deliverables', 'professional services', 'hourly rate', 'project scope', 'milestone', 'acceptance criteria', 'work product', '1099', 'self-employed', 'contractor', 'consulting services', 'expert advice', 'advisory services', 'project basis', 'task order'],
                                                       'weight'        : 1.0,
                                                      },
                            'nda'                   : {'subcategories' : ['mutual_nda', 'unilateral_nda', 'confidentiality_agreement'],
                                                       'keywords'      : ['non-disclosure', 'confidentiality', 'proprietary information', 'nda', 'disclosure agreement', 'trade secret', 'confidential information', 'receiving party', 'disclosing party', 'confidentiality obligation', 'non-use', 'non-circumvention', 'secrecy', 'protected information', 'confidentiality period', 'return of information'],
                                                       'weight'        : 1.0, 
                                                      },
                            'software'              : {'subcategories' : ['software_license', 'saas', 'cloud_services', 'development', 'api_access'],
                                                       'keywords'      : ['software', 'license', 'saas', 'subscription', 'source code', 'object code', 'api', 'cloud', 'hosting', 'maintenance', 'updates', 'support', 'uptime', 'service level', 'software as a service', 'platform', 'application', 'user license', 'perpetual license', 'subscription fee', 'end user license agreement', 'eula'],
                                                       'weight'        : 1.1,
                                                      },
                            'service'               : {'subcategories' : ['master_services', 'maintenance', 'support', 'subscription'],
                                                       'keywords'      : ['service provider', 'services', 'sla', 'service level agreement', 'uptime', 'response time', 'support', 'maintenance', 'service credits', 'performance metrics', 'implementation', 'professional services', 'service description', 'service fees', 'service term', 'service delivery', 'service scope'],
                                                       'weight'        : 1.0,
                                                      },
                            'partnership'           : {'subcategories' : ['business_partnership', 'joint_venture', 'strategic_alliance'],
                                                       'keywords'      : ['partnership', 'joint venture', 'equity', 'shares', 'profit sharing', 'loss allocation', 'management', 'governance', 'voting rights', 'dissolution', 'capital contribution', 'distribution', 'membership interest', 'operating agreement', 'board of directors', 'partnership agreement'],
                                                       'weight'        : 1.0,
                                                      },
                            'lease'                 : {'subcategories' : ['residential_lease', 'commercial_lease', 'sublease', 'equipment_lease'],
                                                       'keywords'      : ['landlord', 'tenant', 'lease', 'premises', 'rent', 'property', 'security deposit', 'utilities', 'maintenance', 'repairs', 'eviction', 'lease term', 'renewal', 'square footage', 'rental agreement', 'lessor', 'lessee', 'property management', 'common areas', 'quiet enjoyment'],
                                                       'weight'        : 1.0,
                                                      },
                            'purchase'              : {'subcategories' : ['asset_purchase', 'stock_purchase', 'goods_purchase'],
                                                       'keywords'      : ['purchase', 'sale', 'buyer', 'seller', 'goods', 'products', 'delivery', 'shipment', 'payment terms', 'invoice', 'purchase price', 'quantity', 'specifications', 'purchase order', 'sales agreement', 'bill of sale', 'title transfer', 'risk of loss', 'closing date'],
                                                       'weight'        : 1.0,
                                                      },
                            'general'               : {'subcategories' : ['standard_agreement', 'basic_contract'],
                                                       'keywords'      : ['agreement', 'contract', 'party', 'parties', 'terms and conditions', 'governing law', 'jurisdiction', 'dispute resolution', 'force majeure', 'notice', 'amendment', 'assignment', 'severability', 'entire agreement'],
                                                       'weight'        : 0.8,
                                                      },
                           } 
    
    # SUBCATEGORY DETECTION PATTERNS
    SUBCATEGORY_PATTERNS = {'full_time'                 : ['full-time', 'full time', 'permanent', 'regular employee', '40 hours', 'exempt employee', 'salary basis'],
                            'part_time'                 : ['part-time', 'part time', 'hours per week', 'non-exempt', 'hourly employee', 'temporary', 'seasonal'],
                            'contract_worker'           : ['independent contractor', 'contract', 'fixed term', 'temporary', 'contract period', 'contract worker', 'contract employee'],
                            'internship'                : ['intern', 'internship', 'student', 'training program', 'educational', 'college credit', 'unpaid intern'],
                            'executive'                 : ['executive', 'ceo', 'cfo', 'cto', 'president', 'vice president', 'director', 'officer', 'executive compensation', 'stock options', 'golden parachute'],
                            'independent_contractor'    : ['independent contractor', '1099', 'contractor', 'self-employed', 'freelance', 'consultant agreement'],
                            'advisory'                  : ['advisor', 'advisory', 'counsel', 'consulting services', 'expert advice', 'advisory board', 'strategic advisory'],
                            'professional_services'     : ['professional services', 'consulting services', 'engagement', 'service provider', 'professional firm'],
                            'freelance'                 : ['freelance', 'freelancer', 'gig', 'project-based', 'freelance work', 'gig economy'],
                            'mutual_nda'                : ['mutual', 'both parties', 'each party', 'reciprocal', 'mutual confidentiality', 'two-way'],
                            'unilateral_nda'            : ['one-way', 'receiving party', 'disclosing party', 'unilateral', 'single party', 'one party'],
                            'confidentiality_agreement' : ['confidentiality agreement', 'secrecy agreement', 'proprietary information agreement'],
                            'software_license'          : ['software license', 'license key', 'perpetual license', 'end user license', 'software agreement'],
                            'saas'                      : ['software as a service', 'saas', 'subscription', 'cloud-based', 'web-based', 'online service'],
                            'cloud_services'            : ['cloud services', 'cloud computing', 'infrastructure', 'iaas', 'paas', 'cloud hosting'],
                            'development'               : ['software development', 'custom development', 'development services', 'programming', 'coding'],
                            'api_access'                : ['api', 'application programming interface', 'api access', 'api key', 'rest api', 'graphql'],
                            'master_services'           : ['master services agreement', 'msa', 'master agreement', 'framework agreement'],
                            'maintenance'               : ['maintenance agreement', 'maintenance services', 'preventive maintenance', 'repair services'],
                            'support'                   : ['support agreement', 'technical support', 'customer support', 'help desk'],
                            'subscription'              : ['subscription agreement', 'subscription service', 'recurring billing', 'subscription fee'],
                            'business_partnership'      : ['partnership', 'general partnership', 'limited partnership', 'partnership agreement'],
                            'joint_venture'             : ['joint venture', 'jv agreement', 'joint venture agreement', 'strategic alliance'],
                            'strategic_alliance'        : ['strategic alliance', 'collaboration agreement', 'cooperation agreement'],
                            'residential_lease'         : ['residential', 'apartment', 'house', 'dwelling', 'residential property', 'tenant', 'landlord', 'rental'],
                            'commercial_lease'          : ['commercial', 'office space', 'retail space', 'commercial property', 'business premises', 'commercial tenant'],
                            'sublease'                  : ['sublease', 'sublet', 'subtenant', 'sublessee', 'sublessor'],
                            'equipment_lease'           : ['equipment lease', 'equipment rental', 'lease equipment', 'leased property'],
                            'asset_purchase'            : ['asset purchase', 'business assets', 'asset sale', 'purchase assets'],
                            'stock_purchase'            : ['stock purchase', 'share purchase', 'equity purchase', 'stock sale'],
                            'goods_purchase'            : ['goods purchase', 'product purchase', 'merchandise', 'inventory purchase'],
                            'standard_agreement'        : ['standard agreement', 'template agreement', 'boilerplate contract'],
                            'basic_contract'            : ['basic contract', 'simple agreement', 'standard terms'],
                           }

    DEFAULT_CONFIDENCE_THRESHOLD = 0.65 
    MULTI_LABEL_THRESHOLD        = 0.55  
                        

    def __init__(self, model_loader):
        """
        Initialize contract classifier
        
        Arguments:
        ----------
            model_loader : ModelLoader instance for accessing Legal-BERT and embeddings
         """
        self.model_loader         = model_loader
        self.embedding_model      = None
        self.legal_bert_model     = None
        self.legal_bert_tokenizer = None
        self.device               = None
        
        # Category template embeddings (computed once)
        self.category_embeddings  = dict()
        
        # Text processor for preprocessing : Don't need spaCy for classification
        self.text_processor       = TextProcessor(use_spacy = False)  
        
        # Logger
        self.logger               = ContractAnalyzerLogger.get_logger()
        
        # Lazy load models
        self._lazy_load()

    
    def _lazy_load(self):
        """
        Lazy load models on first use
        """
        if self.embedding_model is None:
            try:
                log_info("Loading models for contract classification...")
                
                # Load embedding model
                self.embedding_model                             = self.model_loader.load_embedding_model()
                
                # Load Legal-BERT
                self.legal_bert_model, self.legal_bert_tokenizer = self.model_loader.load_legal_bert()
                self.device                                      = self.model_loader.device
                
                # Prepare category embeddings
                self._prepare_category_embeddings()
                
                log_info("Contract classifier models loaded successfully")
                
            except Exception as e:
                log_error(e, context = {"component" : "ContractClassifier", "operation" : "model_loading"})
                raise

    
    def _extract_classification_context(self, full_text: str) -> str:
        """
        Extract key legal sections for more accurate classification
        Focuses on preamble, definitions, and core agreement sections
        
        Arguments:
        ----------
            full_text { str } : Full contract text
        
        Returns:
        --------
                { str }    : Context-rich excerpt for classification
        """
        sections = list()
        
        # First 2000 chars (usually contains parties, effective date, preamble)
        sections.append(full_text[:2000])
        
        # WHEREAS clauses (recitals - explains purpose and background)
        whereas_section = self._extract_section_between(full_text, "WHEREAS", "NOW THEREFORE")
        
        if whereas_section:
            sections.append(whereas_section)
        
        # AGREEMENT section (core contractual terms)
        agreement_section = self._extract_section_between(full_text, "AGREEMENT", "TERMS AND CONDITIONS")
        
        if not agreement_section:
            agreement_section = self._extract_section_containing(full_text, ["AGREES AS FOLLOWS", "HEREBY AGREES"])
        
        if agreement_section:
            sections.append(agreement_section)
        
        # Key definition sections
        definitions_section = self._extract_section_containing(full_text, ["DEFINITIONS", "MEANING OF TERMS"])
        
        if definitions_section:
            sections.append(definitions_section)
        
        # Combine and clean
        context = " ".join([section.strip() for section in sections if section and section.strip()])
        
        # Fallback to original text if context extraction failed
        return context if (len(context) > 500) else full_text


    def _extract_section_between(self, text: str, start_marker: str, end_marker: str) -> Optional[str]:
        """
        Extract text between two markers (case-insensitive)
        """
        try:
            pattern = re.compile(f"{re.escape(start_marker)}(.*?){re.escape(end_marker)}", re.IGNORECASE | re.DOTALL)
            match   = pattern.search(text)
            
            return match.group(1).strip() if match else None
        
        except Exception:
            return None


    def _extract_section_containing(self, text: str, markers: List[str]) -> Optional[str]:
        """
        Extract section containing any of the markers
        """
        for marker in markers:
            if marker.lower() in text.lower():
                # Extract 500 chars around the marker
                idx   = text.lower().find(marker.lower())
                start = max(0, idx - 250)
                end   = min(len(text), idx + len(marker) + 250)
                
                return text[start:end]
        
        return None

    
    def _prepare_category_embeddings(self):
        """
        Pre-compute embeddings for each category template
        """
        log_info("Preparing category embeddings...")
        
        # More specific templates for each category
        category_templates = {
            'employment': "Employment agreement between employer and employee covering salary benefits job duties work hours vacation sick leave performance reviews termination conditions confidentiality and intellectual property rights",
            'consulting': "Consulting services agreement with independent contractor statement of work deliverables hourly rate project scope milestones acceptance criteria work product ownership and payment terms for professional services",
            'nda': "Non-disclosure agreement protecting confidential information trade secrets proprietary data between parties with confidentiality obligations non-use provisions and return of information requirements",
            'software': "Software license agreement or SaaS subscription for technology services including source code access updates maintenance support service level agreements uptime guarantees and API access",
            'service': "Service level agreement for professional services maintenance support with performance metrics service credits response times uptime guarantees and implementation requirements",
            'partnership': "Business partnership joint venture agreement covering equity shares profit distribution management governance voting rights dissolution terms and capital contributions",
            'lease': "Real estate lease agreement for property rental covering premises description rent payments security deposits maintenance responsibilities utilities and eviction terms",
            'purchase': "Sales purchase agreement for goods products with buyer seller terms covering delivery shipment payment terms invoices purchase price quantity specifications and title transfer",
            'general': "General contract agreement with standard terms and conditions governing law jurisdiction dispute resolution force majeure notice provisions and general legal framework"
        }
        
        for category, template in category_templates.items():
            # Encode template
            embedding                          = self.embedding_model.encode(template, convert_to_tensor = True)
            self.category_embeddings[category] = embedding
        
        log_info(f"Prepared embeddings for {len(self.category_embeddings)} categories")
    

    # MAIN CLASSIFICATION METHOD
    @ContractAnalyzerLogger.log_execution_time("classify_contract")
    def classify_contract(self, contract_text: str, min_confidence: float = 0.50) -> ContractCategory:
        """
        Classify contract into granular categories with confidence scoring
        
        Process:
        1. Keyword-based initial scoring
        2. Semantic similarity with embeddings
        3. Legal-BERT enhanced classification
        4. Subcategory detection
        5. Confidence calibration
        
        Arguments:
        ----------
            contract_text   { str }  : Full contract text

            min_confidence { float } : Minimum confidence threshold (0.0-1.0)
        
        Returns:
        --------

            { ContractCategory }     : ContractCategory object with classification results
        """
        # Validate input
        if (not contract_text or (len(contract_text) < 100)):
            raise ValueError("Contract text too short for classification")
        
        # Use default threshold if not specified
        if min_confidence is None:
            min_confidence = self.DEFAULT_CONFIDENCE_THRESHOLD

        # Preprocess text 
        text_excerpt = self._extract_classification_context(full_text = contract_text)
        
        log_info("Starting contract classification", 
                 text_length    = len(contract_text),
                 excerpt_length = len(text_excerpt),
                )
        
        # Keyword scoring
        keyword_scores    = self._score_keywords(text_lower = contract_text.lower())

        # Semantic similarity
        semantic_scores   = self._semantic_similarity(text = text_excerpt)

        # Legal-BERT semantic similarity (enhanced)
        legal_bert_scores = self._legal_bert_similarity(text = text_excerpt)

        # Combine scores (weighted average)
        combined_scores   = self._combine_scores(keyword_scores    = keyword_scores,
                                                 semantic_scores   = semantic_scores,
                                                 legal_bert_scores = legal_bert_scores,
                                                )
        
        # Get primary category
        if not combined_scores:
            log_info("No categories detected, defaulting to 'general'")
            return ContractCategory(category          = "general",
                                    subcategory       = None,
                                    confidence        = 0.5,
                                    reasoning         = ["Unable to determine specific contract type"],
                                    detected_keywords = [],
                                   )
        
        primary_category       = max(combined_scores, key = combined_scores.get)
        confidence             = combined_scores[primary_category]
        
        # Detect subcategory
        subcategory            = self._detect_subcategory(text             = contract_text, 
                                                          primary_category = primary_category,
                                                         )
        
        # Generate reasoning
        reasoning              = self._generate_reasoning(contract_text     = contract_text,
                                                          primary_category  = primary_category,
                                                          subcategory       = subcategory,
                                                          keyword_scores    = keyword_scores,
                                                          semantic_scores   = semantic_scores,
                                                          legal_bert_scores = legal_bert_scores,
                                                          combined_scores   = combined_scores,
                                                         )
        
        # Extract detected keywords
        detected_keywords      = self._extract_detected_keywords(contract_text, primary_category)
        
        # Get alternative categories: Top 3 alternatives
        alternative_categories = sorted([(cat, score) for cat, score in combined_scores.items() if cat != primary_category],
                                        key     = lambda x: x[1],
                                        reverse = True,
                                       )[:3]
        
        result                 = ContractCategory(category               = primary_category,
                                                  subcategory            = subcategory,
                                                  confidence             = confidence,
                                                  reasoning              = reasoning,
                                                  detected_keywords      = detected_keywords,
                                                  alternative_categories = alternative_categories,
                                                 )
        
        log_info("Contract classified successfully",
                 category    = primary_category,
                 subcategory = subcategory,
                 confidence  = confidence,
                )
        
        return result
    
    
    def _score_keywords(self, text_lower: str) -> Dict[str, float]:
        """
        Score each category based on keyword presence

        Arguments:
        ----------
            text_lower { str } : Lowercase contract text

        Returns:
        --------
               { dict }        : Dictionary of {category: score}
        """
        scores = dict()
        for category, config in self.CATEGORY_HIERARCHY.items():
            keywords      = config['keywords']
            weight        = config['weight']
            
            # Count keyword matches with partial matching for multi-word terms
            keyword_count = 0
            
            for keyword in keywords:
                # Check for exact match or partial match for multi-word terms
                if ' ' in keyword:
                    # For multi-word terms, check if all words appear in text
                    words = keyword.split()
                    if all(word in text_lower for word in words):
                        keyword_count += 1
                
                else:
                    # For single words, exact word boundary match
                    if re.search(rf'\b{re.escape(keyword)}\b', text_lower):
                        keyword_count += 1

            # Normalize by number of keywords and apply weight
            normalized_score = (keyword_count / len(keywords)) * weight
            
            # Cap at 1.0
            scores[category] = min(normalized_score, 1.0)
        
        return scores
    

    def _semantic_similarity(self, text: str) -> Dict[str, float]:
        """
        Calculate semantic similarity to category templates using embeddings
        
        Arguments:
        ----------
            text { str } : Contract text excerpt
        
        Returns:
        --------
            { dict }     : Dictionary of {category: similarity_score}
        """
        # Encode contract text
        text_embedding = self.embedding_model.encode(text, convert_to_tensor = True)
        
        # Calculate similarity to each category
        similarities   = dict()

        for category, cat_embedding in self.category_embeddings.items():
            similarity             = util.cos_sim(text_embedding, cat_embedding)[0][0].item()
            similarities[category] = similarity
        
        return similarities

    
    def _legal_bert_similarity(self, text: str) -> Dict[str, float]:
        """
        Use Legal-BERT for semantic similarity calculation
        
        Arguments:
        ----------
            text { str } : Contract text excerpt
        
        Returns:
        --------
            { dict }     : Dictionary of {category: similarity_score} using Legal-BERT embeddings
        """
        # Get Legal-BERT embedding for the text
        text_embedding = self._get_legal_bert_embedding(text)
        
        # Calculate similarity to each category's Legal-BERT embedding
        similarities   = dict()
        
        for category in self.CATEGORY_HIERARCHY.keys():
            # Get pre-computed category embedding
            cat_embedding          = self._get_legal_bert_embedding(f"This is a {category.replace('_', ' ')} contract agreement")
            
            # Calculate cosine similarity
            similarity             = torch.nn.functional.cosine_similarity(torch.tensor(text_embedding).unsqueeze(0), torch.tensor(cat_embedding).unsqueeze(0)).item()
            
            similarities[category] = similarity
        
        return similarities
    
    
    def _get_legal_bert_embedding(self, text: str) -> np.ndarray:
        """
        Get Legal-BERT embedding for text using [CLS] token
        
        Arguments:
        ----------
            text { str }   : Input text
        
        Returns:
        --------
            { np.ndarray } : Embedding vector
        """
        # Tokenize
        inputs = self.legal_bert_tokenizer(text,
                                           return_tensors = "pt",
                                           padding        = True,
                                           truncation     = True,
                                           max_length     = 512,
                                          ).to(self.device)
        
        # Get embeddings
        with torch.no_grad():
            outputs       = self.legal_bert_model(**inputs)
            # Use [CLS] token embedding (first token)
            cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
        
        return cls_embedding
    

    def _combine_scores(self, keyword_scores: Dict[str, float], semantic_scores: Dict[str, float], legal_bert_scores: Dict[str, float] = None) -> Dict[str, float]:
        """
        Combine scores from different methods (weighted average)
        
        Arguments:
        ----------
            keyword_scores    { dict } : Keyword-based scores

            semantic_scores   { dict } : Semantic similarity scores
            
            legal_bert_scores { dict } : Legal-BERT similarity scores (optional)
        
        Returns:
        --------
                   { dict }            : Combined scores dictionary
        """
        combined          = dict()
        
        # Weights for each method
        keyword_weight    = 0.35  
        semantic_weight   = 0.35 
        legal_bert_weight = 0.30 
        
        for category in self.CATEGORY_HIERARCHY.keys():
            score = (keyword_scores.get(category, 0) * keyword_weight + 
                     semantic_scores.get(category, 0) * semantic_weight + 
                     legal_bert_scores.get(category, 0) * legal_bert_weight
                    )
            
            combined[category] = score
        
        return combined
    
    
    def _detect_subcategory(self, text: str, primary_category: str) -> Optional[str]:
        """
        Detect specific subcategory within primary category
        
        Arguments:
        ----------
            text             { str } : Full contract text

            primary_category { str } : Detected primary category
        
        Returns:
        --------
                  { str }            : Subcategory name or None
        """
        text_lower    = text.lower()
        
        # Get subcategories for this category
        subcategories = self.CATEGORY_HIERARCHY[primary_category]['subcategories']
        
        # Score each subcategory
        subcat_scores = dict()

        for subcat in subcategories:
            if subcat in self.SUBCATEGORY_PATTERNS:
                patterns              = self.SUBCATEGORY_PATTERNS[subcat]
                score                 = sum(1 for pattern in patterns if pattern in text_lower)
                subcat_scores[subcat] = score
        
        # Return best match if any
        if (subcat_scores and (max(subcat_scores.values()) > 0)):
            best_subcat = max(subcat_scores, key = subcat_scores.get)
            log_info(f"Detected subcategory: {best_subcat}", 
                     category = primary_category,
                     score    = subcat_scores[best_subcat],
                    )

            return best_subcat
        
        return None
    

    def _generate_reasoning(self, contract_text: str, primary_category: str, subcategory: Optional[str], keyword_scores: Dict[str, float], 
                            semantic_scores: Dict[str, float], legal_bert_scores: Dict[str, float], combined_scores: Dict[str, float]) -> List[str]:
        """
        Generate human-readable reasoning for classification
        
        Returns:
        --------
            { list } : List of reasoning statements
        """
        reasoning        = list()
        
        # Primary category reasoning
        keyword_match    = keyword_scores.get(primary_category, 0)
        semantic_match   = semantic_scores.get(primary_category, 0)
        legal_bert_match = legal_bert_scores.get(primary_category, 0)
        
        # Keyword-based reasoning
        if (keyword_match > 0.6):
            reasoning.append(f"Strong keyword indicators for {primary_category.replace('_', ' ')} category ({int(keyword_match * 100)}% keyword match)")

        elif (keyword_match > 0.3):
            reasoning.append(f"Moderate keyword presence for {primary_category.replace('_', ' ')} ({int(keyword_match * 100)}% keyword match)")

        elif (keyword_match > 0.1):
            reasoning.append(f"Limited keyword indicators for {primary_category.replace('_', ' ')} ({int(keyword_match * 100)}% keyword match)")
        
        # Semantic similarity reasoning
        if (semantic_match > 0.70):
            reasoning.append(f"High semantic similarity to {primary_category.replace('_', ' ')} agreements (similarity: {semantic_match:.2f})")

        elif (semantic_match > 0.55):
            reasoning.append(f"Moderate semantic similarity to {primary_category.replace('_', ' ')} contracts (similarity: {semantic_match:.2f})"
                            )
        
        # Legal-BERT reasoning
        if (legal_bert_match > 0.65):
            reasoning.append(f"Legal-BERT analysis strongly supports {primary_category.replace('_', ' ')} classification (similarity: {legal_bert_match:.2f})"
                            )

        elif (legal_bert_match > 0.50):
            reasoning.append(f"Legal-BERT analysis moderately supports {primary_category.replace('_', ' ')} classification (similarity: {legal_bert_match:.2f})"
                            )
        
        # Subcategory reasoning
        if subcategory:
            reasoning.append(f"Specific subcategory identified: {subcategory.replace('_', ' ')}")
        
        # Alternative categories (if close)
        sorted_scores = sorted(combined_scores.items(), key = lambda x: x[1], reverse = True)
        
        if ((len(sorted_scores) > 1) and (sorted_scores[1][1] > 0.30)):
            alt_category, alt_score = sorted_scores[1]
            
            reasoning.append(f"Also contains elements of {alt_category.replace('_', ' ')} (secondary match: {alt_score:.2f})")
        
        # If no strong reasoning
        if not reasoning:
            reasoning.append("Classification based on general contract structure and terminology")
        
        return reasoning

    
    def _extract_detected_keywords(self, text: str, category: str) -> List[str]:
        """
        Extract which specific keywords were found
        
        Arguments:
        ----------
            text     { str } : Contract text

            category { str } : Detected category
        
        Returns:
        --------
                { list }     : List of detected keywords
        """
        text_lower = text.lower()
        keywords   = self.CATEGORY_HIERARCHY[category]['keywords']
        
        detected   = [kw for kw in keywords if kw in text_lower]

        # Return all detected keywords
        return detected
    
    
    @ContractAnalyzerLogger.log_execution_time("classify_multi_label")
    def classify_multi_label(self, text: str, threshold: float = None) -> List[ContractCategory]:
        """
        Classify as multiple categories if applicable (e.g., Employment + NDA, Consulting + IP Assignment)
        
        Arguments:
        ----------
            text       { str }  : Contract text

            threshold { float } : Minimum confidence threshold for multi-label
        
        Returns:
        --------
                 { list }       : List of ContractCategory objects (sorted by confidence)
        """
        # Use multi-label threshold if not specified
        if threshold is None:
            threshold = self.MULTI_LABEL_THRESHOLD

        log_info("Starting multi-label classification", threshold = threshold)
        
        # Get scores
        keyword_scores    = self._score_keywords(text_lower = text.lower())
        semantic_scores   = self._semantic_similarity(text = text)
        legal_bert_scores = self._legal_bert_similarity(text = text)
        combined_scores   = self._combine_scores(keyword_scores    = keyword_scores, 
                                                 semantic_scores   = semantic_scores, 
                                                 legal_bert_scores = legal_bert_scores,
                                                )
        
        # Get all categories above threshold
        matches         = list()

        for category, score in combined_scores.items():
            if (score >= threshold):
                subcategory = self._detect_subcategory(text             = text, 
                                                       primary_category = category,
                                                      )
                                                  
                reasoning   = self._generate_reasoning(contract_text     = text, 
                                                       primary_category  = category, 
                                                       subcategory       = subcategory, 
                                                       keyword_scores    = keyword_scores, 
                                                       semantic_scores   = semantic_scores, 
                                                       legal_bert_scores = legal_bert_scores, 
                                                       combined_scores   = combined_scores,
                                                      )

                keywords    = self._extract_detected_keywords(text     = text, 
                                                              category = category,
                                                             )
                
                matches.append(ContractCategory(category          = category,
                                                subcategory       = subcategory,
                                                confidence        = score,
                                                reasoning         = reasoning,
                                                detected_keywords = keywords,
                                               )
                              )
        
        # Sort by confidence
        matches.sort(key = lambda x: x.confidence, reverse = True)
        
        log_info(f"Multi-label classification found {len(matches)} categories")
        
        return matches if matches else [self.classify_contract(text)]
    

    def get_category_description(self, category: str) -> str:
        """
        Get human-readable description of a category
        """
        descriptions = {'employment'  : 'Employment agreements governing employer-employee relationships',
                        'consulting'  : 'Consulting and independent contractor agreements',
                        'nda'         : 'Non-disclosure and confidentiality agreements',
                        'software'    : 'Software licensing and technology service agreements',
                        'service'     : 'Professional service and maintenance agreements',
                        'partnership' : 'Partnership, joint venture, and corporate agreements',
                        'lease'       : 'Property lease, rental, and equipment lease agreements',
                        'purchase'    : 'Sales, purchase, and goods transfer agreements',
                        'general'     : 'General contract agreements with standard terms and conditions',
                       }

        return descriptions.get(category, 'General contract agreement')

    
    def get_all_categories(self) -> List[str]:
        """
        Get list of all supported categories
        """
        return list(self.CATEGORY_HIERARCHY.keys())
    

    def get_subcategories(self, category: str) -> List[str]:
        """
        Get subcategories for a specific category
        """
        return self.CATEGORY_HIERARCHY.get(category, {}).get('subcategories', [])