jukofyork committed
Commit e60537d · verified · 1 Parent(s): cbbd0ce

Removed scale options
Files changed (1):
  app.py +8 -48
app.py CHANGED
@@ -118,7 +118,7 @@ def download_and_upload_non_model_files(base_model_name, output_repo_name):
     shutil.rmtree(temp_config_dir, ignore_errors=True)
 
 def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo_name,
-                         lora_scale, lm_head_scale, multiplicative_lora, progress=gr.Progress()):
+                         multiplicative_lora, progress=gr.Progress()):
     temp_lora_dir = None
     try:
         login(hf_token)
@@ -127,11 +127,8 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
         info_fn("Loading LoRA adapter...")
 
         # Load LoRA state (this downloads the adapter)
-        lora_state, base_scale, temp_lora_dir = load_lora_state(lora_model_name)
-
-        # Apply LoRA scale multiplier
-        scale = base_scale * lora_scale
-        info_fn(f"Using LoRA scale: {scale} (base: {base_scale}, multiplier: {lora_scale})")
+        lora_state, scale, temp_lora_dir = load_lora_state(lora_model_name)
+        info_fn(f"Using LoRA scale: {scale}")
 
         progress(0.2, desc="Creating output repository...")
 
@@ -161,7 +158,6 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
         info_fn(f"Found {len(shard_files)} model shards to process")
 
         merged_tensors = 0
-        scaled_lm_heads = 0
         total_shards = len(shard_files)
 
         # Process each shard individually
@@ -186,7 +182,6 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
             # Process the shard
             tensors = {}
             shard_merged_count = 0
-            shard_lm_head_count = 0
 
             with safe_open(shard_path, framework='pt', device='cpu') as f:
                 # Get metadata if available
@@ -195,16 +190,6 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
                 for key in f.keys():
                     tensor = f.get_tensor(key)
 
-                    # Apply lm_head scaling if applicable
-                    if key.endswith('lm_head.weight') and lm_head_scale != 1.0:
-                        info_fn(f"Scaling {key} by {lm_head_scale}")
-                        original_dtype = tensor.dtype
-                        tensor = tensor.to(torch.float32)
-                        tensor = tensor * lm_head_scale
-                        tensor = tensor.to(original_dtype)
-                        shard_lm_head_count += 1
-                        scaled_lm_heads += 1
-
                     # Try to find corresponding LoRA weights
                     lora_A, lora_B = find_lora_weights(lora_state, key)
 
@@ -241,7 +226,7 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
             output_shard_path = os.path.join(temp_shard_dir, f"processed_{shard_filename}")
             save_file(tensors, output_shard_path, metadata=metadata)
 
-            info_fn(f"Shard {shard_filename}:\n- Merged {shard_merged_count} tensors\n- Scaled {shard_lm_head_count} lm_head tensors")
+            info_fn(f"Shard {shard_filename}: Merged {shard_merged_count} tensors")
 
             # Upload the processed shard
             api.upload_file(
@@ -261,7 +246,7 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
 
         progress(1.0, desc="Upload completed!")
 
-        success_msg = f"✓ Successfully merged and uploaded model!\nModel URL: https://huggingface.co/{output_repo_name}\nProcessed {total_shards} shards\nMerged {merged_tensors} layers with LoRA weights\nScaled {scaled_lm_heads} lm_head layers"
+        success_msg = f"✓ Successfully merged and uploaded model!\nModel URL: https://huggingface.co/{output_repo_name}\nProcessed {total_shards} shards\nMerged {merged_tensors} layers with LoRA weights"
         info_fn("Merge completed successfully!")
 
         return success_msg
@@ -287,24 +272,16 @@ This tool merges LoRA (Low-Rank Adaptation) adapters with base models using a me
 - **Streaming Processing**: Downloads → Processes → Uploads → Deletes each shard sequentially
 - **Automatic Cleanup**: Temporary files are automatically removed after processing
 - **Progress Tracking**: Real-time status updates throughout the merge process
-- **Advanced Options**: Configurable LoRA scaling, LM-head scaling, and multiplicative LoRA support
+- **Advanced Options**: Multiplicative LoRA support
 """
 
 DETAILS_TEXT = """
 ### How It Works
-LoRA enables efficient fine-tuning by adding small adapter weights rather than modifying the entire model. This tool applies the LoRA transformation with configurable scaling:
+LoRA enables efficient fine-tuning by adding small adapter weights rather than modifying the entire model. This tool applies the LoRA transformation:
 
 - **Standard Additive-LoRA**: `W_new = W + scale × B^T @ A`
 - **Multiplicative LoRA**: `W_new = W + scale × B^T @ A @ W`
 
-Additionally, the model's default temperature behavior can be adjusted by scaling the `lm_head.weight` tensor:
-
-- **Up-scaling**: Makes the model's outputs more peaked, requiring lower temperature settings for the same output distribution
-- **Down-scaling**: Makes the model's outputs flatter, requiring higher temperature settings for the same output distribution
-- **Examples**:
-  - Scaling `lm_head.weight` by `1.25` makes the new model with `temperature = 1.0` act like the old model with `temperature = 0.8`
-  - Scaling `lm_head.weight` by `0.667` makes the new model with `temperature = 1.0` act like the old model with `temperature = 1.5`
-
 ### Memory Efficiency
 - **Traditional approach**: Loads entire model (~15GB+ for 7B parameter models)
 - **This approach**: Peak usage determined by largest shard size, not total model size
@@ -351,22 +328,6 @@ with gr.Blocks(title="Memory-Efficient LoRA Merge", theme=gr.themes.Soft()) as d
         )
 
         gr.Markdown("### Advanced Options")
-        lora_scale = gr.Number(
-            label="LoRA Scale",
-            value=1.0,
-            minimum=0.0,
-            maximum=10.0,
-            step=0.1,
-            info="Multiplier for LoRA strength (1.0 = default)"
-        )
-        lm_head_scale = gr.Number(
-            label="LM Head Scale",
-            value=1.0,
-            minimum=0.1,
-            maximum=5.0,
-            step=0.05,
-            info="Multiplier for lm_head weights (1.0 = default)"
-        )
         multiplicative_lora = gr.Checkbox(
             label="Multiplicative LoRA",
             value=False,
@@ -387,8 +348,7 @@ with gr.Blocks(title="Memory-Efficient LoRA Merge", theme=gr.themes.Soft()) as d
 
     submit_btn.click(
         fn=merge_lora_efficient,
-        inputs=[hf_token, base_model_name, lora_model_name, output_repo_name,
-                lora_scale, lm_head_scale, multiplicative_lora],
+        inputs=[hf_token, base_model_name, lora_model_name, output_repo_name, multiplicative_lora],
         outputs=output_text
     )
 
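For reference, the two merge rules quoted in `DETAILS_TEXT` translate directly into tensor code. The sketch below is illustrative rather than a copy of app.py's merge loop; it assumes the usual PEFT weight layout (`lora_A: (r, in_features)`, `lora_B: (out_features, r)`), under which `lora_B @ lora_A` is the `B^T @ A` of the formulas:

```python
import torch

def apply_lora(W, lora_A, lora_B, scale, multiplicative=False):
    """Merge one LoRA update into a single weight matrix on CPU.

    Additive case assumes lora_A is (r, in_features); for the multiplicative
    variant lora_A must be (r, out_features) so the (out, out) update can
    left-multiply W, which is (out_features, in_features).
    """
    original_dtype = W.dtype
    W32 = W.to(torch.float32)  # upcast for the merge, as the diff does
    delta = scale * (lora_B.to(torch.float32) @ lora_A.to(torch.float32))
    if multiplicative:
        W32 = W32 + delta @ W32   # W_new = W + scale * B^T @ A @ W
    else:
        W32 = W32 + delta         # W_new = W + scale * B^T @ A
    return W32.to(original_dtype)
```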
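The commit drops the user-facing multiplier and keeps only the base scale returned by `load_lora_state`. For PEFT adapters that base scale conventionally comes from `adapter_config.json` as `lora_alpha / r`; how `load_lora_state` actually derives it is not shown in this diff, but a plausible sketch under that assumption is:

```python
import json
import os

def read_base_scale(adapter_dir: str) -> float:
    # Assumption: mirrors what load_lora_state presumably reads.
    # PEFT convention: effective LoRA scale = lora_alpha / r.
    with open(os.path.join(adapter_dir, "adapter_config.json")) as fp:
        cfg = json.load(fp)
    return cfg["lora_alpha"] / cfg["r"]
```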
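The temperature equivalences in the removed `DETAILS_TEXT` bullets are plain softmax algebra: scaling `lm_head.weight` by `s` scales every logit by `s`, and `softmax(s * z)` equals sampling the original logits at temperature `1/s`, so `s = 1.25` mimics `temperature = 0.8` and `s = 0.667` mimics `temperature ≈ 1.5`. A quick numerical check:

```python
import torch

z = torch.randn(32000)  # mock next-token logits
p_scaled = torch.softmax(1.25 * z, dim=-1)  # lm_head scaled by 1.25, T = 1.0
p_temp = torch.softmax(z / 0.8, dim=-1)     # original lm_head, T = 0.8
assert torch.allclose(p_scaled, p_temp, atol=1e-6)
```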
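The memory-efficiency claim rests on the shard loop visible in the hunks above: each safetensors shard is downloaded, merged, re-saved, uploaded, and deleted before the next one is touched, so peak usage tracks the largest shard rather than the whole model. A self-contained sketch of that pattern (the `merge_fn` callback and repo-name parameters are placeholders, not app.py's actual API):

```python
import os
from huggingface_hub import HfApi, hf_hub_download
from safetensors import safe_open
from safetensors.torch import save_file

def stream_merge_shards(base_repo, output_repo, shard_files, merge_fn, work_dir):
    """Process one shard at a time: download, merge, upload, delete."""
    api = HfApi()
    for shard_name in shard_files:
        # Download a single shard of the base model
        shard_path = hf_hub_download(repo_id=base_repo, filename=shard_name,
                                     local_dir=work_dir)
        tensors = {}
        with safe_open(shard_path, framework="pt", device="cpu") as f:
            metadata = f.metadata()  # preserve the shard's metadata, if any
            for key in f.keys():
                tensors[key] = merge_fn(key, f.get_tensor(key))
        out_path = os.path.join(work_dir, f"processed_{shard_name}")
        save_file(tensors, out_path, metadata=metadata)
        # Upload the merged shard, then free local disk before the next one
        api.upload_file(path_or_fileobj=out_path, path_in_repo=shard_name,
                        repo_id=output_repo)
        os.remove(shard_path)
        os.remove(out_path)
```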