Update app.py
app.py
CHANGED
@@ -20,7 +20,7 @@ def load_lora_state(lora_model_name):
     """Download and load LoRA adapter weights"""
     temp_lora_dir = "/tmp/lora_adapter"
     os.makedirs(temp_lora_dir, exist_ok=True)
-
+
     # Download adapter config
     config_path = hf_hub_download(
         repo_id=lora_model_name,
@@ -28,12 +28,12 @@ def load_lora_state(lora_model_name):
         local_dir=temp_lora_dir,
         local_dir_use_symlinks=False
     )
-
+
     with open(config_path, 'r') as f:
         lora_config = json.load(f)
-
+
     scale = lora_config['lora_alpha'] / lora_config['r']
-
+
     # Download adapter weights
     try:
         adapter_path = hf_hub_download(
@@ -51,50 +51,50 @@ def load_lora_state(lora_model_name):
             local_dir_use_symlinks=False
         )
         lora_state = torch.load(adapter_path, map_location='cpu')
-
+
     return lora_state, scale, temp_lora_dir
 
 def find_lora_weights(lora_state, key):
     """Find corresponding LoRA A and B weights for a given key"""
     lora_A = None
     lora_B = None
-
+
     # Remove .weight suffix and handle potential prefixes
     clean_key = key.replace('.weight', '')
-
+
     for lora_key, lora_weight in lora_state.items():
         if clean_key in lora_key or clean_key.replace('language_model.', '') in lora_key:
             if 'lora_A' in lora_key:
                 lora_A = lora_weight
             elif 'lora_B' in lora_key:
                 lora_B = lora_weight
-
+
     # Both should be None or both should have values
     if (lora_A is None) != (lora_B is None):
         return None, None
-
+
     return lora_A, lora_B
 
 def download_and_upload_non_model_files(base_model_name, output_repo_name):
     """Download and upload non-model files (config, tokenizer, etc.)"""
     temp_config_dir = "/tmp/config_files"
     os.makedirs(temp_config_dir, exist_ok=True)
-
+
     try:
         # List all files in the repository
         files = list_repo_files(repo_id=base_model_name)
-
+
         # Filter non-model files
         non_model_files = [
-            f for f in files
+            f for f in files
             if not (f.startswith('model') and f.endswith('.safetensors'))
         ]
-
+
         # Download and upload each non-model file
         for filename in non_model_files:
             if filename.endswith(('.gguf', '.bin')) and 'model' in filename:
                 continue  # Skip other model formats
-
+
             try:
                 file_path = hf_hub_download(
                     repo_id=base_model_name,
@@ -102,7 +102,7 @@ def download_and_upload_non_model_files(base_model_name, output_repo_name):
                     local_dir=temp_config_dir,
                     local_dir_use_symlinks=False
                 )
-
+
                 # Upload to output repo
                 api.upload_file(
                     path_or_fileobj=file_path,
@@ -110,70 +110,70 @@ def download_and_upload_non_model_files(base_model_name, output_repo_name):
                     repo_id=output_repo_name,
                     repo_type="model"
                 )
-
+
             except Exception as e:
                 info_fn(f"Skipping {filename}: {e}")
-
+
     finally:
         shutil.rmtree(temp_config_dir, ignore_errors=True)
 
-def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo_name,
+def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo_name,
                          lora_scale, lm_head_scale, multiplicative_lora, progress=gr.Progress()):
     temp_lora_dir = None
     try:
         login(hf_token)
-
+
         progress(0.1, desc="Loading LoRA adapter...")
         info_fn("Loading LoRA adapter...")
-
+
         # Load LoRA state (this downloads the adapter)
         lora_state, base_scale, temp_lora_dir = load_lora_state(lora_model_name)
-
+
         # Apply LoRA scale multiplier
         scale = base_scale * lora_scale
         info_fn(f"Using LoRA scale: {scale} (base: {base_scale}, multiplier: {lora_scale})")
-
+
         progress(0.2, desc="Creating output repository...")
-
+
         # Create repository
         try:
             repo_url = api.create_repo(repo_id=output_repo_name, exist_ok=True)
             info_fn(f"Repository created/updated: {repo_url}")
         except Exception as e:
             warning_fn(f"Repository might already exist: {e}")
-
+
         progress(0.3, desc="Uploading configuration files...")
         info_fn("Uploading configuration files...")
-
+
         # Download and upload non-model files
         download_and_upload_non_model_files(base_model_name, output_repo_name)
-
+
         progress(0.4, desc="Finding model shards...")
         info_fn("Finding model shards...")
-
+
         # Get list of all safetensors files
         all_files = list_repo_files(repo_id=base_model_name)
         shard_files = [f for f in all_files if f.startswith('model') and f.endswith('.safetensors')]
-
+
         if not shard_files:
             raise FileNotFoundError("No model safetensors files found in the repository")
-
+
         info_fn(f"Found {len(shard_files)} model shards to process")
-
+
         merged_tensors = 0
         scaled_lm_heads = 0
         total_shards = len(shard_files)
-
+
         # Process each shard individually
         for i, shard_filename in enumerate(shard_files):
-            progress(0.4 + (i / total_shards) * 0.5,
+            progress(0.4 + (i / total_shards) * 0.5,
                      desc=f"Processing {shard_filename} ({i+1}/{total_shards})")
            info_fn(f"Processing shard {i+1}/{total_shards}: {shard_filename}")
-
+
            # Create temporary directory for this shard only
            temp_shard_dir = f"/tmp/shard_{i}"
            os.makedirs(temp_shard_dir, exist_ok=True)
-
+
            try:
                # Download the current shard
                shard_path = hf_hub_download(
@@ -182,19 +182,19 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
                     local_dir=temp_shard_dir,
                     local_dir_use_symlinks=False
                 )
-
+
                 # Process the shard
                 tensors = {}
                 shard_merged_count = 0
                 shard_lm_head_count = 0
-
+
                 with safe_open(shard_path, framework='pt', device='cpu') as f:
                     # Get metadata if available
                     metadata = f.metadata() if hasattr(f, 'metadata') else {}
-
+
                     for key in f.keys():
                         tensor = f.get_tensor(key)
-
+
                         # Apply lm_head scaling if applicable
                         if key.endswith('lm_head.weight') and lm_head_scale != 1.0:
                             info_fn(f"Scaling {key} by {lm_head_scale}")
@@ -204,45 +204,45 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
                             tensor = tensor.to(original_dtype)
                             shard_lm_head_count += 1
                             scaled_lm_heads += 1
-
+
                         # Try to find corresponding LoRA weights
                         lora_A, lora_B = find_lora_weights(lora_state, key)
-
+
                         if lora_A is not None and lora_B is not None:
                             lora_type = "Multiplicative" if multiplicative_lora else "Additive"
                             info_fn(f"Merging {lora_type} LoRA weights for {key}")
                             shard_merged_count += 1
                             merged_tensors += 1
-
+
                             # Convert to float32 for computation
                             original_dtype = tensor.dtype
                             tensor_f32 = tensor.to(torch.float32)
                             lora_A_f32 = lora_A.to(torch.float32)
                             lora_B_f32 = lora_B.to(torch.float32)
-
+
                             if multiplicative_lora:
                                 # Apply Multiplicative-LoRA: W = W + scale * B @ A @ W
                                 tensor_f32 += scale * lora_B_f32 @ lora_A_f32 @ tensor_f32
                             else:
                                 # Apply standard LoRA: W = W + scale * B @ A
                                 tensor_f32 += scale * lora_B_f32 @ lora_A_f32
-
+
                             # Convert back to original dtype
                             tensor = tensor_f32.to(original_dtype)
-
+
                             # Clean up intermediate tensors
                             del tensor_f32, lora_A_f32, lora_B_f32
                             if torch.cuda.is_available():
                                 torch.cuda.empty_cache()
-
+
                         tensors[key] = tensor
-
+
                 # Save processed shard to temporary file
                 output_shard_path = os.path.join(temp_shard_dir, f"processed_{shard_filename}")
                 save_file(tensors, output_shard_path, metadata=metadata)
-
+
                 info_fn(f"Shard {shard_filename}:\n- Merged {shard_merged_count} tensors\n- Scaled {shard_lm_head_count} lm_head tensors")
-
+
                 # Upload the processed shard
                 api.upload_file(
                     path_or_fileobj=output_shard_path,
@@ -250,27 +250,27 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
                     repo_id=output_repo_name,
                     repo_type="model"
                 )
-
+
                 # Clean up this shard's data
                 del tensors
                 gc.collect()
-
+
             finally:
                 # Always clean up the temporary shard directory
                 shutil.rmtree(temp_shard_dir, ignore_errors=True)
-
+
         progress(1.0, desc="Upload completed!")
-
+
         success_msg = f"✅ Successfully merged and uploaded model!\nModel URL: https://huggingface.co/{output_repo_name}\nProcessed {total_shards} shards\nMerged {merged_tensors} layers with LoRA weights\nScaled {scaled_lm_heads} lm_head layers"
         info_fn("Merge completed successfully!")
-
+
         return success_msg
-
+
     except Exception as e:
         error_msg = f"❌ Error during merge: {str(e)}"
         warning_fn(error_msg)
         return error_msg
-
+
     finally:
         # Cleanup LoRA directory
         if temp_lora_dir and os.path.exists(temp_lora_dir):
@@ -284,11 +284,13 @@ This tool merges LoRA (Low-Rank Adaptation) adapters with base models using a me
 
 ### Key Features
 - **Minimal Memory Usage**: Processes one model shard at a time instead of loading the entire model
-- **Streaming Processing**: Downloads → Processes → Uploads → Deletes each shard sequentially
+- **Streaming Processing**: Downloads → Processes → Uploads → Deletes each shard sequentially
 - **Automatic Cleanup**: Temporary files are automatically removed after processing
 - **Progress Tracking**: Real-time status updates throughout the merge process
-- **Advanced Options**: Configurable LoRA scaling, LM
+- **Advanced Options**: Configurable LoRA scaling, LM-head scaling, and multiplicative LoRA support
+"""
 
+DETAILS_TEXT = """
 ### How It Works
 LoRA enables efficient fine-tuning by adding small adapter weights rather than modifying the entire model. This tool applies the LoRA transformation with configurable scaling:
 
@@ -299,7 +301,7 @@ Additionally, the model's default temperature behavior can be adjusted by scalin
 
 - **Up-scaling**: Makes the model's outputs more peaked, requiring lower temperature settings for the same output distribution
 - **Down-scaling**: Makes the model's outputs flatter, requiring higher temperature settings for the same output distribution
-- **Examples**:
+- **Examples**:
   - Scaling `lm_head.weight` by `1.25` makes the new model with `temperature = 1.0` act like the old model with `temperature = 0.8`
   - Scaling `lm_head.weight` by `0.667` makes the new model with `temperature = 1.0` act like the old model with `temperature = 1.5`
 
@@ -309,8 +311,8 @@ Additionally, the model's default temperature behavior can be adjusted by scalin
 - **Result**: Enables merging of much larger models on limited hardware
 
 ### Example Usage
-- **Base Model:** `microsoft/DialoGPT-medium`
-- **LoRA Adapter:** `username/my-trained-lora`
+- **Base Model:** `microsoft/DialoGPT-medium`
+- **LoRA Adapter:** `username/my-trained-lora`
 - **Output Name:** `username/dialogpt-merged`
 
 ### Attribution
@@ -338,7 +340,7 @@ with gr.Blocks(title="Memory-Efficient LoRA Merge", theme=gr.themes.Soft()) as d
             info="The original model to merge LoRA into"
         )
         lora_model_name = gr.Textbox(
-            label="LoRA Adapter Repository",
+            label="LoRA Adapter Repository",
             placeholder="username/my-lora-adapter",
             info="Repository containing adapter_model.safetensors"
         )
@@ -347,7 +349,7 @@ with gr.Blocks(title="Memory-Efficient LoRA Merge", theme=gr.themes.Soft()) as d
             placeholder="username/my-merged-model",
             info="Name for the new merged model repository"
         )
-
+
         gr.Markdown("### Advanced Options")
         lora_scale = gr.Number(
             label="LoRA Scale",
@@ -382,13 +384,15 @@ with gr.Blocks(title="Memory-Efficient LoRA Merge", theme=gr.themes.Soft()) as d
 
     with gr.Row():
         submit_btn = gr.Button("Start LoRA Merge", variant="primary", size="lg")
-
+
     submit_btn.click(
         fn=merge_lora_efficient,
-        inputs=[hf_token, base_model_name, lora_model_name, output_repo_name,
+        inputs=[hf_token, base_model_name, lora_model_name, output_repo_name,
                 lora_scale, lm_head_scale, multiplicative_lora],
         outputs=output_text
     )
 
+    gr.Markdown(DETAILS_TEXT)
+
 demo.queue()
 demo.launch(show_error=True)
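A quick sanity check of the additive merge rule applied in the shard loop above (`W' = W + scale * B @ A`, where `scale = lora_alpha / r` times the user-supplied multiplier). This is a minimal sketch with illustrative toy shapes; none of the names below come from the app itself:

```python
import torch

# Toy shapes only: a 6x4 weight with a rank-2 adapter.
torch.manual_seed(0)
W = torch.randn(6, 4)        # base weight, (out_features, in_features)
lora_A = torch.randn(2, 4)   # lora_A, (r, in_features)
lora_B = torch.randn(6, 2)   # lora_B, (out_features, r)
lora_alpha, r = 16, 2
scale = lora_alpha / r       # base scale read from adapter_config.json

# The merge performed in the shard loop: W' = W + scale * B @ A
W_merged = W + scale * lora_B @ lora_A

# Running the adapter at inference time gives the same outputs.
x = torch.randn(3, 4)
y_adapter = x @ W.T + scale * (x @ lora_A.T) @ lora_B.T
y_merged = x @ W_merged.T
assert torch.allclose(y_adapter, y_merged, atol=1e-5)
```

Because the low-rank product folds into `W` exactly, the merged checkpoint reproduces the adapter's outputs with no PEFT wrapper needed at load time.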
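The multiplicative variant (`W' = W + scale * B @ A @ W`) can be read as left-multiplying the frozen weight by a low-rank perturbation of the identity. A minimal check of that identity, again with made-up shapes:

```python
import torch

torch.manual_seed(1)
W = torch.randn(6, 4)        # frozen weight
lora_A = torch.randn(2, 6)   # here A maps out_features -> r
lora_B = torch.randn(6, 2)
scale = 0.5

# As computed in the shard loop: W' = W + scale * B @ A @ W
W_mult = W + scale * lora_B @ lora_A @ W

# Equivalently, a rank-2 perturbation of the identity acting on W
eye = torch.eye(6)
W_factored = (eye + scale * lora_B @ lora_A) @ W
assert torch.allclose(W_mult, W_factored, atol=1e-5)
```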
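The temperature examples in the description are plain softmax algebra: multiplying `lm_head.weight` (and hence the logits) by a factor `s` matches sampling the unscaled head at temperature `1/s`. A small check, assuming nothing beyond the standard softmax:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(2)
logits = torch.randn(10)   # logits from an unscaled lm_head
s = 1.25                   # lm_head_scale

# Scaled head sampled at temperature 1.0 ...
p_scaled = F.softmax(logits * s, dim=-1)

# ... matches the original head sampled at temperature 1/s = 0.8
p_temp = F.softmax(logits / 0.8, dim=-1)
assert torch.allclose(p_scaled, p_temp, atol=1e-6)
```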
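To drive the Space headlessly, something along these lines should work with `gradio_client`; the Space id below is a placeholder and the endpoint name is an assumption (Gradio typically derives it from the function name), while the argument order mirrors the `inputs` list wired to `submit_btn.click`:

```python
from gradio_client import Client

# Hypothetical Space id; replace with the one actually hosting this app.
client = Client("username/lora-merge-space")

result = client.predict(
    "hf_...",                         # hf_token (needs write access)
    "microsoft/DialoGPT-medium",      # base_model_name
    "username/my-trained-lora",       # lora_model_name
    "username/dialogpt-merged",       # output_repo_name
    1.0,                              # lora_scale multiplier
    1.0,                              # lm_head_scale
    False,                            # multiplicative_lora
    api_name="/merge_lora_efficient"  # assumed endpoint name
)
print(result)
```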