contract-guard-ai / config /model_config.py
satyakimitra's picture
Everything updated
bdedf43
# DEPENDENCIES
import copy
from pathlib import Path
class ModelConfig:
    """
    Model-specific configurations - FOR AI MODEL SETTINGS ONLY

    Every setting is a plain class attribute, so values can be read without
    instantiating the class (e.g. ``ModelConfig.LEGAL_BERT["max_length"]``).
    ``get_model_config`` looks a settings group up by name and returns an
    independent copy; ``ensure_directories`` creates the model/cache folders.
    """

    # Directory Settings
    # Both paths are relative, so they resolve against the working directory
    # of the running process.
    MODEL_DIR = Path("models")
    CACHE_DIR = Path("cache/models")

    # Model Architecture Settings
    LEGAL_BERT = {
        "model_name": "nlpaueb/legal-bert-base-uncased",
        "local_path": MODEL_DIR / "nlpaueb" / "legal-bert-base-uncased",
        "task": "clause-extraction",
        "max_length": 512,
        "batch_size": 16,
        "hidden_dim": 768,
        "num_layers": 12,
        "attention_heads": 12,
        "force_download": False,
    }

    # Embedding Model Settings
    # NOTE(review): "force_download" is True here but False for LEGAL_BERT -
    # confirm the difference is intentional.
    EMBEDDING_MODEL = {
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
        "local_path": MODEL_DIR / "sentence-transformers" / "all-MiniLM-L6-v2",
        "dimension": 384,
        "pooling": "mean",
        "normalize": True,
        "similarity_threshold": 0.7,
        "force_download": True,
    }

    # Classification Model Settings
    # "embedding_dim" carries the same value as EMBEDDING_MODEL["dimension"];
    # presumably the two must stay in sync - verify against the consumer.
    CLASSIFIER_MODEL = {
        "embedding_dim": 384,
        "hidden_dim": 256,
        "num_categories": 12,
        "dropout_rate": 0.1,
        "learning_rate": 2e-5,
        "max_seq_length": 512,
    }

    # Clause Extraction Settings
    CLAUSE_EXTRACTION = {
        "min_clause_length": 50,
        "max_clause_length": 2000,
        "confidence_threshold": 0.7,
        "overlap_threshold": 0.3,
        "max_clauses_per_doc": 50,
    }

    # Risk Analysis Settings
    # Each score range is a (lower, upper) pair. Adjacent ranges share their
    # boundary value (40, 60, 80).
    # NOTE(review): confirm which side the consumer treats as inclusive.
    RISK_ANALYSIS = {
        "score_ranges": {
            "low": (0, 40),
            "medium": (40, 60),
            "high": (60, 80),
            "critical": (80, 100),
        },
        "weight_decay": 0.1,
        "smoothing_factor": 0.5,
    }

    # Market Comparison Settings
    MARKET_COMPARISON = {
        "similarity_threshold": 0.75,
        "min_matches_required": 3,
        "max_comparisons": 20,
        "embedding_cache_size": 1000,
    }

    # LLM Generation Settings
    LLM_GENERATION = {
        "max_tokens": 5000,
        "temperature": 0.1,
        "top_p": 0.9,
        "frequency_penalty": 0.1,
        "presence_penalty": 0.1,
        "stop_sequences": ["\n\n", "###", "---"],
    }

    # Text Processing Settings
    TEXT_PROCESSING = {
        "chunk_size": 512,
        "chunk_overlap": 50,
        "min_sentence_length": 10,
        "max_sentence_length": 200,
        "entity_confidence": 0.8,
    }

    @classmethod
    def ensure_directories(cls):
        """
        Ensure all required directories exist

        Creates ``MODEL_DIR``, ``CACHE_DIR`` and the two per-model folders
        below ``MODEL_DIR``. Missing parent directories are created as well,
        and directories that already exist are accepted without error.
        """
        directories = [
            cls.MODEL_DIR,
            cls.CACHE_DIR,
            cls.MODEL_DIR / "nlpaueb" / "legal-bert-base-uncased",
            cls.MODEL_DIR / "sentence-transformers" / "all-MiniLM-L6-v2",
        ]

        for directory in directories:
            directory.mkdir(parents=True, exist_ok=True)

    @classmethod
    def get_model_config(cls, model_type: str) -> dict:
        """
        Get configuration for specific model type

        Args:
            model_type: One of "legal_bert", "embedding", "classifier",
                "clause_extraction", "risk_analysis", "market_comparison",
                "llm_generation" or "text_processing".

        Returns:
            A deep copy of the matching settings dict, or an empty dict when
            ``model_type`` is not recognised.
        """
        config_map = {
            "legal_bert": cls.LEGAL_BERT,
            "embedding": cls.EMBEDDING_MODEL,
            "classifier": cls.CLASSIFIER_MODEL,
            "clause_extraction": cls.CLAUSE_EXTRACTION,
            "risk_analysis": cls.RISK_ANALYSIS,
            "market_comparison": cls.MARKET_COMPARISON,
            "llm_generation": cls.LLM_GENERATION,
            "text_processing": cls.TEXT_PROCESSING,
        }

        # Hand out a deep copy: the groups are shared class attributes and
        # some contain nested dicts/lists, so returning them directly would
        # let one caller's modification leak into every other reader.
        return copy.deepcopy(config_map.get(model_type, {}))