Removed scale options
app.py CHANGED
@@ -118,7 +118,7 @@ def download_and_upload_non_model_files(base_model_name, output_repo_name):
         shutil.rmtree(temp_config_dir, ignore_errors=True)
 
 def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo_name,
-                         lora_scale, lm_head_scale, multiplicative_lora, progress=gr.Progress()):
+                         multiplicative_lora, progress=gr.Progress()):
     temp_lora_dir = None
     try:
         login(hf_token)
@@ -127,11 +127,8 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
         info_fn("Loading LoRA adapter...")
 
         # Load LoRA state (this downloads the adapter)
-        lora_state, base_scale, temp_lora_dir = load_lora_state(lora_model_name)
-
-        # Apply LoRA scale multiplier
-        scale = base_scale * lora_scale
-        info_fn(f"Using LoRA scale: {scale} (base: {base_scale}, multiplier: {lora_scale})")
+        lora_state, scale, temp_lora_dir = load_lora_state(lora_model_name)
+        info_fn(f"Using LoRA scale: {scale}")
 
         progress(0.2, desc="Creating output repository...")
 
@@ -161,7 +158,6 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
         info_fn(f"Found {len(shard_files)} model shards to process")
 
         merged_tensors = 0
-        scaled_lm_heads = 0
         total_shards = len(shard_files)
 
         # Process each shard individually
@@ -186,7 +182,6 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
             # Process the shard
             tensors = {}
             shard_merged_count = 0
-            shard_lm_head_count = 0
 
             with safe_open(shard_path, framework='pt', device='cpu') as f:
                 # Get metadata if available
@@ -195,16 +190,6 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
                 for key in f.keys():
                     tensor = f.get_tensor(key)
 
-                    # Apply lm_head scaling if applicable
-                    if key.endswith('lm_head.weight') and lm_head_scale != 1.0:
-                        info_fn(f"Scaling {key} by {lm_head_scale}")
-                        original_dtype = tensor.dtype
-                        tensor = tensor.to(torch.float32)
-                        tensor = tensor * lm_head_scale
-                        tensor = tensor.to(original_dtype)
-                        shard_lm_head_count += 1
-                        scaled_lm_heads += 1
-
                     # Try to find corresponding LoRA weights
                     lora_A, lora_B = find_lora_weights(lora_state, key)
 
@@ -241,7 +226,7 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
             output_shard_path = os.path.join(temp_shard_dir, f"processed_{shard_filename}")
             save_file(tensors, output_shard_path, metadata=metadata)
 
-            info_fn(f"Shard {shard_filename}: Merged {shard_merged_count} tensors, scaled {shard_lm_head_count} lm_heads")
+            info_fn(f"Shard {shard_filename}: Merged {shard_merged_count} tensors")
 
             # Upload the processed shard
             api.upload_file(
@@ -261,7 +246,7 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
 
         progress(1.0, desc="Upload completed!")
 
-        success_msg = f"✅ Successfully merged and uploaded model!\nModel URL: https://huggingface.co/{output_repo_name}\nProcessed {total_shards} shards\nMerged {merged_tensors} layers with LoRA weights\nScaled {scaled_lm_heads} lm_head tensors"
+        success_msg = f"✅ Successfully merged and uploaded model!\nModel URL: https://huggingface.co/{output_repo_name}\nProcessed {total_shards} shards\nMerged {merged_tensors} layers with LoRA weights"
         info_fn("Merge completed successfully!")
 
         return success_msg
@@ -287,24 +272,16 @@ This tool merges LoRA (Low-Rank Adaptation) adapters with base models using a me
 - **Streaming Processing**: Downloads → Processes → Uploads → Deletes each shard sequentially
 - **Automatic Cleanup**: Temporary files are automatically removed after processing
 - **Progress Tracking**: Real-time status updates throughout the merge process
-- **Advanced Options**: LoRA scale, LM head scale, and multiplicative LoRA support
+- **Advanced Options**: Multiplicative LoRA support
 """
 
 DETAILS_TEXT = """
 ### How It Works
-LoRA enables efficient fine-tuning by adding small adapter weights rather than modifying the entire model. This tool applies the LoRA transformation
+LoRA enables efficient fine-tuning by adding small adapter weights rather than modifying the entire model. This tool applies the LoRA transformation:
 
 - **Standard Additive-LoRA**: `W_new = W + scale × B^T @ A`
 - **Multiplicative LoRA**: `W_new = W + scale × B^T @ A @ W`
 
-Additionally, the model's default temperature behavior can be adjusted by scaling the `lm_head.weight` tensor:
-
-- **Up-scaling**: Makes the model's outputs more peaked, requiring lower temperature settings for the same output distribution
-- **Down-scaling**: Makes the model's outputs flatter, requiring higher temperature settings for the same output distribution
-- **Examples**:
-  - Scaling `lm_head.weight` by `1.25` makes the new model with `temperature = 1.0` act like the old model with `temperature = 0.8`
-  - Scaling `lm_head.weight` by `0.667` makes the new model with `temperature = 1.0` act like the old model with `temperature = 1.5`
-
 ### Memory Efficiency
 - **Traditional approach**: Loads entire model (~15GB+ for 7B parameter models)
 - **This approach**: Peak usage determined by largest shard size, not total model size
@@ -351,22 +328,6 @@ with gr.Blocks(title="Memory-Efficient LoRA Merge", theme=gr.themes.Soft()) as d
         )
 
         gr.Markdown("### Advanced Options")
-        lora_scale = gr.Number(
-            label="LoRA Scale",
-            value=1.0,
-            minimum=0.0,
-            maximum=10.0,
-            step=0.1,
-            info="Multiplier for LoRA strength (1.0 = default)"
-        )
-        lm_head_scale = gr.Number(
-            label="LM Head Scale",
-            value=1.0,
-            minimum=0.1,
-            maximum=5.0,
-            step=0.05,
-            info="Multiplier for lm_head weights (1.0 = default)"
-        )
         multiplicative_lora = gr.Checkbox(
             label="Multiplicative LoRA",
             value=False,
@@ -387,8 +348,7 @@ with gr.Blocks(title="Memory-Efficient LoRA Merge", theme=gr.themes.Soft()) as d
 
     submit_btn.click(
         fn=merge_lora_efficient,
-        inputs=[hf_token, base_model_name, lora_model_name, output_repo_name,
-                lora_scale, lm_head_scale, multiplicative_lora],
+        inputs=[hf_token, base_model_name, lora_model_name, output_repo_name, multiplicative_lora],
        outputs=output_text
    )
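A few of the pieces touched by this diff are worth a closer look. First, the merge rules quoted in DETAILS_TEXT. The sketch below is a minimal illustration of those two formulas, not the Space's actual merge code: the helper names are made up, and the shapes follow the `B^T @ A` convention of the formulas (B stored as (r, out_features), W as (out_features, in_features), with A shaped differently in the two modes).

import torch

# Minimal sketch of the two merge rules quoted in DETAILS_TEXT
# (illustrative helper names; shapes follow the `B^T @ A` convention).

def merge_additive(W, A, B, scale):
    # W_new = W + scale * B^T @ A, with A of shape (r, in_features)
    return W + scale * (B.T @ A)

def merge_multiplicative(W, A, B, scale):
    # W_new = W + scale * B^T @ A @ W, with A of shape (r, out_features):
    # the low-rank term multiplies W from the left
    return W + scale * (B.T @ A @ W)

r, out_f, in_f = 8, 32, 64
W = torch.randn(out_f, in_f)
B = torch.randn(r, out_f)
W_add = merge_additive(W, torch.randn(r, in_f), B, scale=2.0)
W_mul = merge_multiplicative(W, torch.randn(r, out_f), B, scale=2.0)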
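Second, the `lm_head.weight` scaling feature removed by this commit rested on a simple identity: softmax(s·z / T) = softmax(z / (T/s)), so multiplying the lm_head weights by s is equivalent to dividing the sampling temperature by s. A quick numeric check of the removed text's `1.25` example, using made-up logits:

import torch

# Scaling logits by s at temperature 1.0 matches unscaled logits at
# temperature 1/s. With s = 1.25, 1/s = 0.8, as the removed text claimed.
z = torch.tensor([2.0, 0.5, -1.0])            # made-up logits
p_scaled = torch.softmax(1.25 * z, dim=-1)    # scaled lm_head, T = 1.0
p_cooler = torch.softmax(z / 0.8, dim=-1)     # original lm_head, T = 0.8
assert torch.allclose(p_scaled, p_cooler)     # identical distributions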
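Finally, the shard-streaming pattern visible in the diff's context lines (download, rewrite, upload, delete, one shard at a time, so peak usage tracks the largest shard rather than the whole model) can be outlined as below. The repo names are placeholders and the per-tensor merge step is elided; this is a sketch of the pattern, not the Space's exact code.

import os
from huggingface_hub import HfApi, hf_hub_download
from safetensors import safe_open
from safetensors.torch import save_file

api = HfApi()
base_repo, out_repo = "org/base-model", "me/merged-model"  # placeholders

shards = [f for f in api.list_repo_files(base_repo) if f.endswith(".safetensors")]
for name in shards:
    path = hf_hub_download(repo_id=base_repo, filename=name)  # one shard at a time
    tensors = {}
    with safe_open(path, framework="pt", device="cpu") as f:
        metadata = f.metadata()
        for key in f.keys():
            tensors[key] = f.get_tensor(key)  # apply the LoRA merge per key here
    out_path = f"processed_{os.path.basename(name)}"
    save_file(tensors, out_path, metadata=metadata)
    api.upload_file(path_or_fileobj=out_path, path_in_repo=name, repo_id=out_repo)
    os.remove(out_path)  # drop the processed copy before the next shard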