Removed scale options
app.py CHANGED
@@ -118,7 +118,7 @@ def download_and_upload_non_model_files(base_model_name, output_repo_name):
         shutil.rmtree(temp_config_dir, ignore_errors=True)
 
 def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo_name,
-                         lora_scale, lm_head_scale, multiplicative_lora, progress=gr.Progress()):
+                         multiplicative_lora, progress=gr.Progress()):
     temp_lora_dir = None
     try:
         login(hf_token)
@@ -127,11 +127,8 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
         info_fn("Loading LoRA adapter...")
 
         # Load LoRA state (this downloads the adapter)
-        lora_state, base_scale, temp_lora_dir = load_lora_state(lora_model_name)
-
-        # Apply LoRA scale multiplier
-        scale = base_scale * lora_scale
-        info_fn(f"Using LoRA scale: {scale} (base: {base_scale}, multiplier: {lora_scale})")
+        lora_state, scale, temp_lora_dir = load_lora_state(lora_model_name)
+        info_fn(f"Using LoRA scale: {scale}")
 
         progress(0.2, desc="Creating output repository...")
 
@@ -161,7 +158,6 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
         info_fn(f"Found {len(shard_files)} model shards to process")
 
         merged_tensors = 0
-        scaled_lm_heads = 0
         total_shards = len(shard_files)
 
         # Process each shard individually
@@ -186,7 +182,6 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
             # Process the shard
             tensors = {}
             shard_merged_count = 0
-            shard_lm_head_count = 0
 
             with safe_open(shard_path, framework='pt', device='cpu') as f:
                 # Get metadata if available
@@ -195,16 +190,6 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
                 for key in f.keys():
                     tensor = f.get_tensor(key)
 
-                    # Apply lm_head scaling if applicable
-                    if key.endswith('lm_head.weight') and lm_head_scale != 1.0:
-                        info_fn(f"Scaling {key} by {lm_head_scale}")
-                        original_dtype = tensor.dtype
-                        tensor = tensor.to(torch.float32)
-                        tensor = tensor * lm_head_scale
-                        tensor = tensor.to(original_dtype)
-                        shard_lm_head_count += 1
-                        scaled_lm_heads += 1
-
                     # Try to find corresponding LoRA weights
                     lora_A, lora_B = find_lora_weights(lora_state, key)
 
@@ -241,7 +226,7 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
             output_shard_path = os.path.join(temp_shard_dir, f"processed_{shard_filename}")
             save_file(tensors, output_shard_path, metadata=metadata)
 
-            info_fn(f"Shard {shard_filename}: Merged {shard_merged_count} tensors, scaled {shard_lm_head_count} lm_heads")
+            info_fn(f"Shard {shard_filename}: Merged {shard_merged_count} tensors")
 
             # Upload the processed shard
             api.upload_file(
@@ -261,7 +246,7 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
 
         progress(1.0, desc="Upload completed!")
 
-        success_msg = f"✅ Successfully merged and uploaded model!\nModel URL: https://huggingface.co/{output_repo_name}\nProcessed {total_shards} shards\nMerged {merged_tensors} layers with LoRA weights\nScaled {scaled_lm_heads} lm_head tensors"
+        success_msg = f"✅ Successfully merged and uploaded model!\nModel URL: https://huggingface.co/{output_repo_name}\nProcessed {total_shards} shards\nMerged {merged_tensors} layers with LoRA weights"
         info_fn("Merge completed successfully!")
 
         return success_msg
@@ -287,24 +272,16 @@ This tool merges LoRA (Low-Rank Adaptation) adapters with base models using a me
 - **Streaming Processing**: Downloads → Processes → Uploads → Deletes each shard sequentially
 - **Automatic Cleanup**: Temporary files are automatically removed after processing
 - **Progress Tracking**: Real-time status updates throughout the merge process
-- **Advanced Options**: LoRA scale, LM head scale, and multiplicative LoRA support
+- **Advanced Options**: Multiplicative LoRA support
 """
 
 DETAILS_TEXT = """
 ### How It Works
-LoRA enables efficient fine-tuning by adding small adapter weights rather than modifying the entire model. This tool applies the LoRA transformation
+LoRA enables efficient fine-tuning by adding small adapter weights rather than modifying the entire model. This tool applies the LoRA transformation:
 
 - **Standard Additive-LoRA**: `W_new = W + scale × B^T @ A`
 - **Multiplicative LoRA**: `W_new = W + scale × B^T @ A @ W`
 
-Additionally, the model's default temperature behavior can be adjusted by scaling the `lm_head.weight` tensor:
-
-- **Up-scaling**: Makes the model's outputs more peaked, requiring lower temperature settings for the same output distribution
-- **Down-scaling**: Makes the model's outputs flatter, requiring higher temperature settings for the same output distribution
-- **Examples**:
-  - Scaling `lm_head.weight` by `1.25` makes the new model with `temperature = 1.0` act like the old model with `temperature = 0.8`
-  - Scaling `lm_head.weight` by `0.667` makes the new model with `temperature = 1.0` act like the old model with `temperature = 1.5`
-
 ### Memory Efficiency
 - **Traditional approach**: Loads entire model (~15GB+ for 7B parameter models)
 - **This approach**: Peak usage determined by largest shard size, not total model size
@@ -351,22 +328,6 @@ with gr.Blocks(title="Memory-Efficient LoRA Merge", theme=gr.themes.Soft()) as d
         )
 
         gr.Markdown("### Advanced Options")
-        lora_scale = gr.Number(
-            label="LoRA Scale",
-            value=1.0,
-            minimum=0.0,
-            maximum=10.0,
-            step=0.1,
-            info="Multiplier for LoRA strength (1.0 = default)"
-        )
-        lm_head_scale = gr.Number(
-            label="LM Head Scale",
-            value=1.0,
-            minimum=0.1,
-            maximum=5.0,
-            step=0.05,
-            info="Multiplier for lm_head weights (1.0 = default)"
-        )
         multiplicative_lora = gr.Checkbox(
             label="Multiplicative LoRA",
             value=False,
@@ -387,8 +348,7 @@ with gr.Blocks(title="Memory-Efficient LoRA Merge", theme=gr.themes.Soft()) as d
 
     submit_btn.click(
         fn=merge_lora_efficient,
-        inputs=[hf_token, base_model_name, lora_model_name, output_repo_name,
-                lora_scale, lm_head_scale, multiplicative_lora],
+        inputs=[hf_token, base_model_name, lora_model_name, output_repo_name, multiplicative_lora],
        outputs=output_text
    )
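A few of the pieces touched by this diff are worth a closer look. First, the merge rules quoted in DETAILS_TEXT. The sketch below is a minimal illustration of those two formulas, not the Space's actual merge code: the helper names are made up, and the shapes follow the `B^T @ A` convention of the formulas (B stored as (r, out_features), W as (out_features, in_features), with A shaped differently in the two modes).

import torch

# Minimal sketch of the two merge rules quoted in DETAILS_TEXT
# (illustrative helper names; shapes follow the `B^T @ A` convention).

def merge_additive(W, A, B, scale):
    # W_new = W + scale * B^T @ A, with A of shape (r, in_features)
    return W + scale * (B.T @ A)

def merge_multiplicative(W, A, B, scale):
    # W_new = W + scale * B^T @ A @ W, with A of shape (r, out_features):
    # the low-rank term multiplies W from the left
    return W + scale * (B.T @ A @ W)

r, out_f, in_f = 8, 32, 64
W = torch.randn(out_f, in_f)
B = torch.randn(r, out_f)
W_add = merge_additive(W, torch.randn(r, in_f), B, scale=2.0)
W_mul = merge_multiplicative(W, torch.randn(r, out_f), B, scale=2.0)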
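Second, the `lm_head.weight` scaling feature removed by this commit rested on a simple identity: softmax(s·z / T) = softmax(z / (T/s)), so multiplying the lm_head weights by s is equivalent to dividing the sampling temperature by s. A quick numeric check of the removed text's `1.25` example, using made-up logits:

import torch

# Scaling logits by s at temperature 1.0 matches unscaled logits at
# temperature 1/s. With s = 1.25, 1/s = 0.8, as the removed text claimed.
z = torch.tensor([2.0, 0.5, -1.0])            # made-up logits
p_scaled = torch.softmax(1.25 * z, dim=-1)    # scaled lm_head, T = 1.0
p_cooler = torch.softmax(z / 0.8, dim=-1)     # original lm_head, T = 0.8
assert torch.allclose(p_scaled, p_cooler)     # identical distributions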
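Finally, the shard-streaming pattern visible in the diff's context lines (download, rewrite, upload, delete, one shard at a time, so peak usage tracks the largest shard rather than the whole model) can be outlined as below. The repo names are placeholders and the per-tensor merge step is elided; this is a sketch of the pattern, not the Space's exact code.

import os
from huggingface_hub import HfApi, hf_hub_download
from safetensors import safe_open
from safetensors.torch import save_file

api = HfApi()
base_repo, out_repo = "org/base-model", "me/merged-model"  # placeholders

shards = [f for f in api.list_repo_files(base_repo) if f.endswith(".safetensors")]
for name in shards:
    path = hf_hub_download(repo_id=base_repo, filename=name)  # one shard at a time
    tensors = {}
    with safe_open(path, framework="pt", device="cpu") as f:
        metadata = f.metadata()
        for key in f.keys():
            tensors[key] = f.get_tensor(key)  # apply the LoRA merge per key here
    out_path = f"processed_{os.path.basename(name)}"
    save_file(tensors, out_path, metadata=metadata)
    api.upload_file(path_or_fileobj=out_path, path_in_repo=name, repo_id=out_repo)
    os.remove(out_path)  # drop the processed copy before the next shard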