Spaces:

jukofyork
/

merge-lora

Running

App Files Files Community

merge-lora / app.py

jukofyork

Removed scale options

e60537d verified 11 days ago

raw

history blame contribute delete

13.4 kB

	import gc
	import gradio as gr
	import torch
	from huggingface_hub import hf_hub_download, HfApi, login, list_repo_files
	from safetensors import safe_open
	from safetensors.torch import save_file, load_file
	import os
	import shutil
	import json

	# Shared Hub client; the functions below use it to create the output
	# repository and to upload files into it.
	api = HfApi()

	def info_fn(text: str) -> None:
	    """Show ``text`` to the user as a Gradio info notification."""
	    gr.Info(text)

	def warning_fn(text: str) -> None:
	    """Show ``text`` to the user as a Gradio warning notification."""
	    gr.Warning(text)

	def load_lora_state(lora_model_name):
	    """Download a LoRA adapter from the Hub and load its weights on CPU.

	    Args:
	        lora_model_name: Hub repository id of the adapter. The repository
	            must contain ``adapter_config.json`` and either
	            ``adapter_model.safetensors`` or ``adapter_model.bin``.

	    Returns:
	        A ``(lora_state, scale, temp_lora_dir)`` tuple: the adapter state
	        dict, the scaling factor derived from the adapter config, and the
	        directory the files were downloaded into. The caller is
	        responsible for removing that directory.

	    Raises:
	        ValueError: if the adapter config has no usable ``r`` /
	            ``lora_alpha`` values.
	        Exception: whatever the download or the loaders raise when neither
	            weights file can be obtained.
	    """
	    temp_lora_dir = "/tmp/lora_adapter"
	    os.makedirs(temp_lora_dir, exist_ok=True)

	    succeeded = False
	    try:
	        # Download adapter config
	        config_path = hf_hub_download(
	            repo_id=lora_model_name,
	            filename="adapter_config.json",
	            local_dir=temp_lora_dir,
	            local_dir_use_symlinks=False
	        )

	        with open(config_path, 'r', encoding='utf-8') as f:
	            lora_config = json.load(f)

	        rank = lora_config.get('r')
	        alpha = lora_config.get('lora_alpha')
	        if not rank or alpha is None:
	            raise ValueError(
	                "adapter_config.json must define a non-zero 'r' and a 'lora_alpha'"
	            )

	        # Rank-stabilised LoRA scales by alpha / sqrt(r) instead of alpha / r.
	        if lora_config.get('use_rslora'):
	            scale = alpha / (rank ** 0.5)
	        else:
	            scale = alpha / rank

	        # Download adapter weights, preferring safetensors and falling back
	        # to the legacy pickle checkpoint if that fails for any reason.
	        try:
	            adapter_path = hf_hub_download(
	                repo_id=lora_model_name,
	                filename="adapter_model.safetensors",
	                local_dir=temp_lora_dir,
	                local_dir_use_symlinks=False
	            )
	            lora_state = load_file(adapter_path, device='cpu')
	        except Exception:
	            adapter_path = hf_hub_download(
	                repo_id=lora_model_name,
	                filename="adapter_model.bin",
	                local_dir=temp_lora_dir,
	                local_dir_use_symlinks=False
	            )
	            # SECURITY: a .bin checkpoint is a pickle from a user-chosen
	            # repository; weights_only=True restricts unpickling to tensor
	            # data instead of arbitrary objects.
	            lora_state = torch.load(
	                adapter_path, map_location='cpu', weights_only=True
	            )

	        succeeded = True
	        return lora_state, scale, temp_lora_dir
	    finally:
	        # On failure the caller never learns the directory name, so the
	        # partial download has to be removed here.
	        if not succeeded:
	            shutil.rmtree(temp_lora_dir, ignore_errors=True)

	def find_lora_weights(lora_state, key):
	    """Find the LoRA A and B weights that belong to a base-model tensor.

	    The module name is ``key`` without a trailing ``.weight``. A LoRA entry
	    matches only when that module name (or the same name with
	    ``language_model.`` removed) appears in the LoRA key as a run of whole
	    dot-separated components immediately followed by ``lora_A`` /
	    ``lora_B``. Matching whole components prevents ``mlp.gate`` from
	    picking up the adapter of ``mlp.gate_proj``.

	    Args:
	        lora_state: mapping of LoRA tensor names to tensors.
	        key: tensor name from the base model, e.g. ``"...q_proj.weight"``.

	    Returns:
	        ``(lora_A, lora_B)``; ``(None, None)`` when no complete pair exists.
	    """
	    suffix = '.weight'
	    # Strip only a trailing ".weight"; replace() would also remove
	    # occurrences in the middle of the name.
	    module_name = key[:-len(suffix)] if key.endswith(suffix) else key

	    candidates = [module_name]
	    stripped_name = module_name.replace('language_model.', '')
	    if stripped_name != module_name:
	        candidates.append(stripped_name)
	    # An empty name would otherwise match every LoRA entry.
	    candidates = [name for name in candidates if name]

	    def refers_to_module(lora_key, marker):
	        # Pad with dots so the first and last components get the same
	        # boundary treatment as the inner ones.
	        padded = f".{lora_key}."
	        return any(f".{name}.{marker}." in padded for name in candidates)

	    lora_A = None
	    lora_B = None

	    for lora_key, lora_weight in lora_state.items():
	        if refers_to_module(lora_key, 'lora_A'):
	            lora_A = lora_weight
	        elif refers_to_module(lora_key, 'lora_B'):
	            lora_B = lora_weight

	    # Half a pair cannot be merged; treat it as no match at all.
	    if (lora_A is None) != (lora_B is None):
	        return None, None

	    return lora_A, lora_B

	def download_and_upload_non_model_files(base_model_name, output_repo_name):
	    """Copy every non-weight file from the base repo into the output repo.

	    Files are fetched one at a time into a scratch directory and uploaded
	    under the same path. A file that fails to download or upload is
	    reported through ``info_fn`` and skipped. The scratch directory is
	    removed when the function exits, whether or not it succeeded.
	    """
	    staging_dir = "/tmp/config_files"
	    os.makedirs(staging_dir, exist_ok=True)

	    def is_weight_file(name):
	        # Safetensors shards are merged and uploaded separately.
	        if name.startswith('model') and name.endswith('.safetensors'):
	            return True
	        # Other model formats are not carried over at all.
	        return name.endswith(('.gguf', '.bin')) and 'model' in name

	    try:
	        for repo_file in list_repo_files(repo_id=base_model_name):
	            if is_weight_file(repo_file):
	                continue

	            try:
	                local_path = hf_hub_download(
	                    repo_id=base_model_name,
	                    filename=repo_file,
	                    local_dir=staging_dir,
	                    local_dir_use_symlinks=False
	                )
	                api.upload_file(
	                    path_or_fileobj=local_path,
	                    path_in_repo=repo_file,
	                    repo_id=output_repo_name,
	                    repo_type="model"
	                )
	            except Exception as err:
	                info_fn(f"Skipping {repo_file}: {err}")
	    finally:
	        shutil.rmtree(staging_dir, ignore_errors=True)

	def _apply_lora_delta(key, tensor, lora_A, lora_B, scale, multiplicative):
	    """Return ``tensor`` with the LoRA update folded in, in its original dtype."""
	    original_dtype = tensor.dtype
	    # Do the arithmetic in float32 and convert back at the end.
	    weight = tensor.to(torch.float32)
	    delta = (scale * lora_B.to(torch.float32)) @ lora_A.to(torch.float32)

	    if multiplicative:
	        # Multiplicative-LoRA: W = W + scale * B @ A @ W
	        delta = delta @ weight
	    # else standard LoRA: W = W + scale * B @ A

	    # Without this check a mismatched delta could broadcast silently.
	    if delta.shape != weight.shape:
	        raise ValueError(
	            f"LoRA update for {key} has shape {tuple(delta.shape)}, "
	            f"expected {tuple(weight.shape)}"
	        )

	    weight += delta
	    return weight.to(original_dtype)


	def _merge_shard(shard_path, output_path, lora_state, scale, multiplicative_lora):
	    """Merge LoRA weights into one safetensors shard, write it to ``output_path``, and return the merged-tensor count."""
	    lora_type = "Multiplicative" if multiplicative_lora else "Additive"
	    tensors = {}
	    merged_count = 0

	    with safe_open(shard_path, framework='pt', device='cpu') as f:
	        # Get metadata if available
	        metadata = f.metadata() if hasattr(f, 'metadata') else {}

	        for key in f.keys():
	            tensor = f.get_tensor(key)

	            # Try to find corresponding LoRA weights
	            lora_A, lora_B = find_lora_weights(lora_state, key)

	            if lora_A is not None and lora_B is not None:
	                info_fn(f"Merging {lora_type} LoRA weights for {key}")
	                tensor = _apply_lora_delta(
	                    key, tensor, lora_A, lora_B, scale, multiplicative_lora
	                )
	                merged_count += 1

	            tensors[key] = tensor

	    # NOTE(review): some loaders expect a "format" entry in the header, so
	    # supply one when the source shard carried no metadata - confirm.
	    if not metadata:
	        metadata = {"format": "pt"}

	    save_file(tensors, output_path, metadata=metadata)
	    return merged_count


	def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo_name,
	                         multiplicative_lora, progress=gr.Progress()):
	    """Merge a LoRA adapter into a base model shard by shard and upload the result.

	    Each safetensors shard of the base model is downloaded, merged, uploaded
	    to the output repository and deleted before the next one is fetched.

	    Args:
	        hf_token: Hugging Face token used to log in before any Hub call.
	        base_model_name: Hub repository id of the base model.
	        lora_model_name: Hub repository id of the LoRA adapter.
	        output_repo_name: Hub repository id to create and upload into.
	        multiplicative_lora: apply ``W + scale * B @ A @ W`` instead of the
	            standard ``W + scale * B @ A``.
	        progress: Gradio progress tracker.

	    Returns:
	        A status string: a success summary, or a message starting with
	        ``"✗ Error during merge:"``. Exceptions are reported through the
	        returned string and a warning notification rather than raised.
	    """
	    temp_lora_dir = None
	    logged_in = False
	    try:
	        # An empty token would send login() down its interactive prompt
	        # path, so reject blank inputs before touching the Hub.
	        required = (
	            ("Hugging Face token", hf_token),
	            ("base model repository", base_model_name),
	            ("LoRA adapter repository", lora_model_name),
	            ("output repository name", output_repo_name),
	        )
	        missing = [label for label, value in required if not (value and value.strip())]
	        if missing:
	            raise ValueError("Missing required input(s): " + ", ".join(missing))

	        # Pasted values often carry stray whitespace.
	        hf_token = hf_token.strip()
	        base_model_name = base_model_name.strip()
	        lora_model_name = lora_model_name.strip()
	        output_repo_name = output_repo_name.strip()

	        login(hf_token)
	        logged_in = True

	        progress(0.1, desc="Loading LoRA adapter...")
	        info_fn("Loading LoRA adapter...")

	        # Load LoRA state (this downloads the adapter)
	        lora_state, scale, temp_lora_dir = load_lora_state(lora_model_name)
	        info_fn(f"Using LoRA scale: {scale}")

	        progress(0.2, desc="Creating output repository...")

	        # Create repository
	        try:
	            repo_url = api.create_repo(repo_id=output_repo_name, exist_ok=True)
	            info_fn(f"Repository created/updated: {repo_url}")
	        except Exception as e:
	            warning_fn(f"Repository might already exist: {e}")

	        progress(0.3, desc="Uploading configuration files...")
	        info_fn("Uploading configuration files...")

	        # Download and upload non-model files
	        download_and_upload_non_model_files(base_model_name, output_repo_name)

	        progress(0.4, desc="Finding model shards...")
	        info_fn("Finding model shards...")

	        # Get list of all safetensors files
	        all_files = list_repo_files(repo_id=base_model_name)
	        shard_files = [f for f in all_files if f.startswith('model') and f.endswith('.safetensors')]

	        if not shard_files:
	            raise FileNotFoundError("No model safetensors files found in the repository")

	        info_fn(f"Found {len(shard_files)} model shards to process")

	        merged_tensors = 0
	        total_shards = len(shard_files)

	        # Process each shard individually
	        for i, shard_filename in enumerate(shard_files):
	            progress(0.4 + (i / total_shards) * 0.5,
	                     desc=f"Processing {shard_filename} ({i+1}/{total_shards})")
	            info_fn(f"Processing shard {i+1}/{total_shards}: {shard_filename}")

	            # Create temporary directory for this shard only
	            temp_shard_dir = f"/tmp/shard_{i}"
	            os.makedirs(temp_shard_dir, exist_ok=True)

	            try:
	                # Download the current shard
	                shard_path = hf_hub_download(
	                    repo_id=base_model_name,
	                    filename=shard_filename,
	                    local_dir=temp_shard_dir,
	                    local_dir_use_symlinks=False
	                )

	                # basename keeps the scratch file directly inside
	                # temp_shard_dir even if the shard sits in a subfolder.
	                output_shard_path = os.path.join(
	                    temp_shard_dir, f"processed_{os.path.basename(shard_filename)}"
	                )
	                shard_merged_count = _merge_shard(
	                    shard_path, output_shard_path, lora_state, scale, multiplicative_lora
	                )
	                merged_tensors += shard_merged_count

	                info_fn(f"Shard {shard_filename}: Merged {shard_merged_count} tensors")

	                # Upload the processed shard
	                api.upload_file(
	                    path_or_fileobj=output_shard_path,
	                    path_in_repo=shard_filename,
	                    repo_id=output_repo_name,
	                    repo_type="model"
	                )

	                # The shard's tensors went out of scope with _merge_shard.
	                gc.collect()

	            finally:
	                # Always clean up the temporary shard directory
	                shutil.rmtree(temp_shard_dir, ignore_errors=True)

	        # Point out adapter weights that never found a base tensor instead
	        # of reporting an incomplete merge as a plain success.
	        expected_pairs = sum(1 for name in lora_state if 'lora_A' in name)
	        if merged_tensors < expected_pairs:
	            warning_fn(
	                f"Only {merged_tensors} of {expected_pairs} LoRA weight pairs "
	                "matched a tensor in the base model"
	            )

	        progress(1.0, desc="Upload completed!")

	        success_msg = f"✓ Successfully merged and uploaded model!\nModel URL: https://huggingface.co/{output_repo_name}\nProcessed {total_shards} shards\nMerged {merged_tensors} layers with LoRA weights"
	        info_fn("Merge completed successfully!")

	        return success_msg

	    except Exception as e:
	        error_msg = f"✗ Error during merge: {str(e)}"
	        warning_fn(error_msg)
	        return error_msg

	    finally:
	        # Cleanup LoRA directory
	        if temp_lora_dir and os.path.exists(temp_lora_dir):
	            shutil.rmtree(temp_lora_dir, ignore_errors=True)
	        gc.collect()

	        # login() stores the token; clear it so one user's write token does
	        # not outlive their request.
	        if logged_in:
	            try:
	                from huggingface_hub import logout
	                logout()
	            except Exception as e:
	                warning_fn(f"Could not clear the stored Hugging Face token: {e}")

	# Markdown for the introduction; passed to gr.Markdown as the first
	# element of the Blocks layout.
	INTRODUCTION_TEXT = """
	## Memory-Efficient LoRA Merge

	This tool merges LoRA (Low-Rank Adaptation) adapters with base models using a memory-efficient approach that processes model files individually, significantly reducing memory requirements compared to traditional methods.

	### Key Features
	- Minimal Memory Usage: Processes one model shard at a time instead of loading the entire model
	- Streaming Processing: Downloads → Processes → Uploads → Deletes each shard sequentially
	- Automatic Cleanup: Temporary files are automatically removed after processing
	- Progress Tracking: Real-time status updates throughout the merge process
	- Advanced Options: Multiplicative LoRA support
	"""

	# Markdown for the details section; passed to gr.Markdown as the last
	# element of the Blocks layout. The formulas mirror what the merge code
	# computes: scale * B @ A, with no transpose.
	DETAILS_TEXT = """
	### How It Works
	LoRA enables efficient fine-tuning by adding small adapter weights rather than modifying the entire model. This tool applies the LoRA transformation:

	- Standard Additive-LoRA: `W_new = W + scale × B @ A`
	- Multiplicative LoRA: `W_new = W + scale × B @ A @ W`

	### Memory Efficiency
	- Traditional approach: Loads entire model (~15GB+ for 7B parameter models)
	- This approach: Peak usage determined by largest shard size, not total model size
	- Result: Enables merging of much larger models on limited hardware

	### Example Usage
	- Base Model: `microsoft/DialoGPT-medium`
	- LoRA Adapter: `username/my-trained-lora`
	- Output Name: `username/dialogpt-merged`

	### Attribution
	This tool builds upon excellent work from the community:

	- Base implementation: [Weyaxi/merge-lora](https://huggingface.co/spaces/Weyaxi/merge-lora)
	- Memory-efficient method: [qlora-pipe](https://github.com/tdrussell/qlora-pipe/blob/main/tools/merge_lora.py) by tdrussell
	"""

	# Build the page: introduction, a two-column row (inputs on the left,
	# status on the right), the submit button, then the details section.
	with gr.Blocks(title="Memory-Efficient LoRA Merge", theme=gr.themes.Soft()) as demo:
	    gr.Markdown(INTRODUCTION_TEXT)

	    with gr.Row():
	        # Left column: the four text inputs plus the advanced option.
	        with gr.Column(scale=1):
	            gr.Markdown("### Configuration")
	            hf_token = gr.Textbox(
	                label="Hugging Face Token",
	                placeholder="hf_...",
	                type="password",
	                info="Token with write access to create repositories"
	            )
	            base_model_name = gr.Textbox(
	                label="Base Model Repository",
	                placeholder="microsoft/DialoGPT-medium",
	                info="The original model to merge LoRA into"
	            )
	            lora_model_name = gr.Textbox(
	                label="LoRA Adapter Repository",
	                placeholder="username/my-lora-adapter",
	                info="Repository containing adapter_model.safetensors"
	            )
	            output_repo_name = gr.Textbox(
	                label="Output Repository Name",
	                placeholder="username/my-merged-model",
	                info="Name for the new merged model repository"
	            )

	            gr.Markdown("### Advanced Options")
	            multiplicative_lora = gr.Checkbox(
	                label="Multiplicative LoRA",
	                value=False,
	                info="Apply a \"multiplicative-LoRA\" instead of a standard \"additive-LoRA\""
	            )

	        # Right column: read-only box that receives the string returned
	        # by merge_lora_efficient.
	        with gr.Column(scale=1):
	            gr.Markdown("### Status")
	            output_text = gr.Textbox(
	                label="Merge Progress & Results",
	                lines=20,
	                interactive=False,
	                show_copy_button=True
	            )

	    with gr.Row():
	        submit_btn = gr.Button("Start LoRA Merge", variant="primary", size="lg")

	    # The inputs list is in the same order as merge_lora_efficient's
	    # positional parameters.
	    submit_btn.click(
	        fn=merge_lora_efficient,
	        inputs=[hf_token, base_model_name, lora_model_name, output_repo_name, multiplicative_lora],
	        outputs=output_text
	    )

	    gr.Markdown(DETAILS_TEXT)

	# Enable the request queue, then start the server; show_error=True is
	# passed to launch().
	demo.queue()
	demo.launch(show_error=True)