File size: 4,842 Bytes
1099afe
 
 
 
 
 
 
 
 
 
 
 
 
 
bdedf43
1099afe
 
 
 
 
 
bdedf43
1099afe
 
bdedf43
1099afe
bdedf43
1099afe
 
 
 
bdedf43
1099afe
bdedf43
1099afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdedf43
 
1099afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# DEPENDENCIES
from pathlib import Path


class ModelConfig:
    """
    Model-specific configurations - FOR AI MODEL SETTINGS ONLY

    A namespace of class-level constants, grouped into one dictionary per
    component, plus two classmethods:

    - ensure_directories() : create the on-disk directories named by this config
    - get_model_config()   : look a settings dictionary up by a short string key

    The class is never instantiated by the code in this file; everything is
    accessed through the class itself.
    """
    # Directory Settings
    # Both paths are relative, so they resolve against the current working
    # directory of the process at the time they are used.
    MODEL_DIR = Path("models")
    CACHE_DIR = Path("cache/models")
    
    # Model Architecture Settings
    LEGAL_BERT        = {"model_name"      : "nlpaueb/legal-bert-base-uncased",
                         "local_path"      : MODEL_DIR / "nlpaueb" / "legal-bert-base-uncased",
                         "task"            : "clause-extraction",
                         "max_length"      : 512,
                         "batch_size"      : 16,
                         "hidden_dim"      : 768,
                         "num_layers"      : 12,
                         "attention_heads" : 12,
                         "force_download"  : False, 
                        }
    
    # Embedding Model Settings  
    # NOTE(review): force_download is True here but False for LEGAL_BERT - confirm this is intentional
    EMBEDDING_MODEL   = {"model_name"           : "sentence-transformers/all-MiniLM-L6-v2",
                         "local_path"           : MODEL_DIR / "sentence-transformers" / "all-MiniLM-L6-v2",
                         "dimension"            : 384,
                         "pooling"              : "mean",
                         "normalize"            : True,
                         "similarity_threshold" : 0.7,
                         "force_download"       : True,  
                        }

    
    # Classification Model Settings
    # embedding_dim carries the same value as EMBEDDING_MODEL["dimension"]; keep the two in step
    CLASSIFIER_MODEL  = {"embedding_dim"    : 384,
                         "hidden_dim"       : 256,
                         "num_categories"   : 12,
                         "dropout_rate"     : 0.1,
                         "learning_rate"    : 2e-5,
                         "max_seq_length"   : 512,
                        }
    
    # Clause Extraction Settings
    CLAUSE_EXTRACTION = {"min_clause_length"    : 50,
                         "max_clause_length"    : 2000,
                         "confidence_threshold" : 0.7,
                         "overlap_threshold"    : 0.3,
                         "max_clauses_per_doc"  : 50,
                        }
    
    # Risk Analysis Settings
    # NOTE(review): adjacent score ranges share their boundary value (40, 60, 80);
    # which bucket a boundary score falls into is decided by the consumer - confirm
    RISK_ANALYSIS     = {"score_ranges"     : {"low"      : (0, 40),
                                               "medium"   : (40, 60),
                                               "high"     : (60, 80),
                                               "critical" : (80, 100),
                                              },
                         "weight_decay"     : 0.1,
                         "smoothing_factor" : 0.5,
                        }
    
    # Market Comparison Settings
    MARKET_COMPARISON = {"similarity_threshold" : 0.75,
                         "min_matches_required" : 3,
                         "max_comparisons"      : 20,
                         "embedding_cache_size" : 1000,
                        }
    
    # LLM Generation Settings
    # NOTE(review): "\n\n" is listed as a stop sequence alongside max_tokens = 5000;
    # verify against the caller that stopping at the first blank line is intended
    LLM_GENERATION    = {"max_tokens"        : 5000,
                         "temperature"       : 0.1,
                         "top_p"             : 0.9,
                         "frequency_penalty" : 0.1,
                         "presence_penalty"  : 0.1,
                         "stop_sequences"    : ["\n\n", "###", "---"],
                        }
    
    # Text Processing Settings
    TEXT_PROCESSING   = {"chunk_size"          : 512,
                         "chunk_overlap"       : 50,
                         "min_sentence_length" : 10,
                         "max_sentence_length" : 200,
                         "entity_confidence"   : 0.8,
                        }

    @classmethod
    def ensure_directories(cls) -> None:
        """
        Ensure all required directories exist

        Creates MODEL_DIR, CACHE_DIR and the "local_path" directory of each
        downloadable model (LEGAL_BERT and EMBEDDING_MODEL). Missing parent
        directories are created as well, and directories that already exist
        are left untouched.

        The model directories are read from the "local_path" entries rather
        than being spelled out a second time, so changing or overriding a
        "local_path" automatically changes the directory created here.
        """
        directories = [cls.MODEL_DIR,
                       cls.CACHE_DIR,
                       cls.LEGAL_BERT["local_path"],
                       cls.EMBEDDING_MODEL["local_path"],
                      ]
                    
        for directory in directories:
            # Path() is a no-op for Path values and lets an override supply a plain string
            Path(directory).mkdir(parents = True, exist_ok = True)


    @classmethod
    def get_model_config(cls, model_type: str) -> dict:
        """
        Get configuration for specific model type

        Arguments:
        ----------
            model_type { str } : One of "legal_bert", "embedding", "classifier",
                                 "clause_extraction", "risk_analysis",
                                 "market_comparison", "llm_generation" or
                                 "text_processing"

        Returns:
        --------
                { dict }       : The matching class-level settings dictionary. This is
                                 the shared object itself, not a copy, so mutating it
                                 changes the configuration for every other user of the
                                 class. An unrecognised model_type yields a new empty
                                 dictionary instead of raising
        """
        config_map = {"legal_bert"        : cls.LEGAL_BERT,
                      "embedding"         : cls.EMBEDDING_MODEL,
                      "classifier"        : cls.CLASSIFIER_MODEL,
                      "clause_extraction" : cls.CLAUSE_EXTRACTION,
                      "risk_analysis"     : cls.RISK_ANALYSIS,
                      "market_comparison" : cls.MARKET_COMPARISON,
                      "llm_generation"    : cls.LLM_GENERATION,
                      "text_processing"   : cls.TEXT_PROCESSING,
                     }
        
        return config_map.get(model_type, {})