Spaces:

satyakimitra
/

contract-guard-ai

Running

App Files Files Community

contract-guard-ai / config /model_config.py

satyakimitra

Everything updated

bdedf43 about 1 month ago

raw

history blame contribute delete

4.84 kB

	# DEPENDENCIES
	from pathlib import Path


	class ModelConfig:
	    """
	    Model-specific configurations - FOR AI MODEL SETTINGS ONLY

	    Every setting group is a plain class-level dictionary, so it can be read
	    without instantiating the class. Two classmethods are provided:

	    * ``ensure_directories`` creates the model and cache directories.
	    * ``get_model_config`` returns one setting group by its string key.

	    All paths are relative ``Path`` objects, so they resolve against the
	    current working directory of the process.
	    """

	    # Directory Settings
	    MODEL_DIR = Path("models")
	    CACHE_DIR = Path("cache/models")

	    # Model Architecture Settings
	    LEGAL_BERT = {
	        "model_name": "nlpaueb/legal-bert-base-uncased",
	        "local_path": MODEL_DIR / "nlpaueb" / "legal-bert-base-uncased",
	        "task": "clause-extraction",
	        "max_length": 512,
	        "batch_size": 16,
	        "hidden_dim": 768,
	        "num_layers": 12,
	        "attention_heads": 12,
	        "force_download": False,
	    }

	    # Embedding Model Settings
	    # NOTE(review): force_download is True here but False for LEGAL_BERT;
	    # confirm the difference is intentional.
	    EMBEDDING_MODEL = {
	        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
	        "local_path": MODEL_DIR / "sentence-transformers" / "all-MiniLM-L6-v2",
	        "dimension": 384,
	        "pooling": "mean",
	        "normalize": True,
	        "similarity_threshold": 0.7,
	        "force_download": True,
	    }

	    # Classification Model Settings
	    # NOTE(review): embedding_dim equals EMBEDDING_MODEL["dimension"] (384);
	    # presumably the two must stay in sync - confirm against the classifier.
	    CLASSIFIER_MODEL = {
	        "embedding_dim": 384,
	        "hidden_dim": 256,
	        "num_categories": 12,
	        "dropout_rate": 0.1,
	        "learning_rate": 2e-5,
	        "max_seq_length": 512,
	    }

	    # Clause Extraction Settings
	    CLAUSE_EXTRACTION = {
	        "min_clause_length": 50,
	        "max_clause_length": 2000,
	        "confidence_threshold": 0.7,
	        "overlap_threshold": 0.3,
	        "max_clauses_per_doc": 50,
	    }

	    # Risk Analysis Settings
	    # NOTE(review): adjacent score ranges share their boundary value
	    # (40, 60, 80); which side is inclusive is up to the consumer - confirm.
	    RISK_ANALYSIS = {
	        "score_ranges": {
	            "low": (0, 40),
	            "medium": (40, 60),
	            "high": (60, 80),
	            "critical": (80, 100),
	        },
	        "weight_decay": 0.1,
	        "smoothing_factor": 0.5,
	    }

	    # Market Comparison Settings
	    MARKET_COMPARISON = {
	        "similarity_threshold": 0.75,
	        "min_matches_required": 3,
	        "max_comparisons": 20,
	        "embedding_cache_size": 1000,
	    }

	    # LLM Generation Settings
	    LLM_GENERATION = {
	        "max_tokens": 5000,
	        "temperature": 0.1,
	        "top_p": 0.9,
	        "frequency_penalty": 0.1,
	        "presence_penalty": 0.1,
	        "stop_sequences": ["\n\n", "###", "---"],
	    }

	    # Text Processing Settings
	    TEXT_PROCESSING = {
	        "chunk_size": 512,
	        "chunk_overlap": 50,
	        "min_sentence_length": 10,
	        "max_sentence_length": 200,
	        "entity_confidence": 0.8,
	    }

	    # Lookup key accepted by get_model_config -> name of the class attribute
	    # that holds the settings. Attribute names (not the dicts themselves) are
	    # stored so the lookup goes through ``cls`` and respects subclass overrides.
	    _CONFIG_ATTRIBUTES = {
	        "legal_bert": "LEGAL_BERT",
	        "embedding": "EMBEDDING_MODEL",
	        "classifier": "CLASSIFIER_MODEL",
	        "clause_extraction": "CLAUSE_EXTRACTION",
	        "risk_analysis": "RISK_ANALYSIS",
	        "market_comparison": "MARKET_COMPARISON",
	        "llm_generation": "LLM_GENERATION",
	        "text_processing": "TEXT_PROCESSING",
	    }

	    @classmethod
	    def ensure_directories(cls) -> None:
	        """
	        Ensure all required directories exist

	        Creates ``MODEL_DIR``, ``CACHE_DIR`` and the ``local_path`` of every
	        model that declares one (``LEGAL_BERT`` and ``EMBEDDING_MODEL``).
	        Missing parent directories are created as well, and directories that
	        already exist are left untouched.
	        """
	        directories = [cls.MODEL_DIR, cls.CACHE_DIR]

	        # Take the per-model directories from the settings instead of spelling
	        # the paths out a second time, so the created directories always match
	        # the configured ``local_path`` values.
	        for model_settings in (cls.LEGAL_BERT, cls.EMBEDDING_MODEL):
	            local_path = model_settings.get("local_path")
	            if local_path is not None:
	                directories.append(local_path)

	        for directory in directories:
	            Path(directory).mkdir(parents=True, exist_ok=True)

	    @classmethod
	    def get_model_config(cls, model_type: str) -> dict:
	        """
	        Get configuration for specific model type

	        Arguments:
	            model_type : one of "legal_bert", "embedding", "classifier",
	                         "clause_extraction", "risk_analysis",
	                         "market_comparison", "llm_generation" or
	                         "text_processing"

	        Returns:
	            The matching class-level settings dictionary. This is the shared
	            object itself, not a copy, so changes made by the caller are
	            visible everywhere. An unknown ``model_type`` yields a new empty
	            dictionary rather than raising.
	        """
	        attribute_name = cls._CONFIG_ATTRIBUTES.get(model_type)

	        if attribute_name is None:
	            return {}

	        return getattr(cls, attribute_name)