import gradio as gr
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import os

# ---------------------------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------------------------
# WARNING: Running LLMs on CPU is slow and memory-hungry; the 1B model below is
# close to the practical limit, and larger variants (3B/8B) will likely crash.
# Note: this checkpoint ships a bitsandbytes 4-bit quantization config, which may
# not load on CPU-only hardware; if loading fails, switch to the unquantized
# variant of the same base model.
BASE_MODEL_ID = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
LORA_ADAPTER_ID = "JPQ24/Natural-synthesis-llama-3.2-1b"

# ---------------------------------------------------------------------------
# LOAD MODEL (State Initialization)
# ---------------------------------------------------------------------------
print("System: Initializing CPU Load Sequence...")

# 1. Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL_ID,
    token=os.environ.get("HF_TOKEN")
)

# 2. Load Base Model
# low_cpu_mem_usage=True is critical here to load weights sequentially
print("System: Loading Base Model into RAM...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="cpu",
    torch_dtype=torch.float32,  # Safe default. Use torch.bfloat16 if your Space supports it for speed.
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    token=os.environ.get("HF_TOKEN")
)

# 3. Attach LoRA Adapter
print("System: Attaching LoRA Adapter...")
model = PeftModel.from_pretrained(
    base_model,
    LORA_ADAPTER_ID,
    token=os.environ.get("HF_TOKEN")
)
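
# Optional (editor's note): from_pretrained() already returns the model in eval
# mode, so this is effectively a no-op, but it makes the inference-only intent
# explicit and guards against accidental dropout.
model.eval()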
| print("System: Ready.") | |
| # --------------------------------------------------------------------------- | |
| # EXECUTION ENGINE | |
| # --------------------------------------------------------------------------- | |
| def run_inference(prompt, use_lora): | |
| """ | |
| Core computation unit. | |
| Accepts state configuration (use_lora) and executes transformation. | |
| """ | |
| # 1. Input Processing | |
| messages = [{"role": "user", "content": prompt}] | |
| try: | |
| inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") | |
| except Exception: | |
| inputs = tokenizer(prompt, return_tensors="pt").input_ids | |
| # Ensure inputs are on CPU | |
| inputs = inputs.to("cpu") | |

    # 2. Generation Config (Conservative for CPU)
    generate_kwargs = dict(
        input_ids=inputs,
        max_new_tokens=100,  # Keep short to prevent timeouts
        do_sample=True,
        temperature=0.7,
    )
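
    # Editor's sketch (optional): Llama tokenizers usually define no pad token, so
    # generate() may warn about pad_token_id. Uncomment to pin it to EOS explicitly:
    # generate_kwargs["pad_token_id"] = tokenizer.eos_token_id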

    # 3. Execution (With Context Switching)
    if not use_lora:
        # CONTEXT A: BASE MODEL
        # Temporarily disable the LoRA adapter so only the base weights are used
        with model.disable_adapter():
            outputs = model.generate(**generate_kwargs)
    else:
        # CONTEXT B: LORA MODEL
        # Generate with the active adapter
        outputs = model.generate(**generate_kwargs)

    # 4. Output Decoding
    response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
    return response
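
# Example (editor's sketch): run_inference can be exercised directly, e.g. from a
# Python shell, to compare both paths without the UI:
#   run_inference("Summarize LoRA in one sentence.", use_lora=False)
#   run_inference("Summarize LoRA in one sentence.", use_lora=True)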

def sequential_generation(prompt):
    """
    Orchestrator for Sequential Execution.
    Step 1 -> Cleanup -> Step 2
    """
    # --- PHASE 1: BASE MODEL ---
    yield "Generating Base Model response... (Please wait)", "Waiting for Base to finish..."
    base_result = run_inference(prompt, use_lora=False)

    # --- INTERMEDIATE: CLEANUP ---
    # Heuristic step to let the CPU-only Space breathe: generate() runs without
    # autograd, so there is no computation graph to clear, but forcing garbage
    # collection releases intermediate tensors before the second pass.
    gc.collect()

    # --- PHASE 2: LORA MODEL ---
    # We yield the first result so the user can read it while the second runs
    yield base_result, "Generating LoRA response... (Please wait)"
    lora_result = run_inference(prompt, use_lora=True)

    # --- FINAL: COMPLETE ---
    yield base_result, lora_result
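
# Note (editor): because sequential_generation is a generator, Gradio streams each
# yield to the UI, so the placeholder messages and the finished Base output appear
# before the LoRA pass completes.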

# ---------------------------------------------------------------------------
# INTERFACE
# ---------------------------------------------------------------------------
custom_css = """
.container { max-width: 1100px; margin: auto; }
.output-box { height: 400px; overflow-y: scroll; }
"""

with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Sequential Model Comparison (CPU)")
    gr.Markdown(f"**Architecture:** Base (`{BASE_MODEL_ID}`) + Adapter (`{LORA_ADAPTER_ID}`)")
    gr.Markdown("ℹ️ **Process:** This space runs the Base Model first, clears memory, and then runs the LoRA Model.")

    with gr.Row():
        input_text = gr.Textbox(label="Prompt", placeholder="e.g. Write a poem about rust...", lines=2)
        submit_btn = gr.Button("Start Comparison", variant="primary")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 1. Base Model Output")
            output_base = gr.Textbox(label="Base Result", lines=10, interactive=False)
        with gr.Column():
            gr.Markdown("### 2. LoRA Model Output")
            output_lora = gr.Textbox(label="Fine-Tuned Result", lines=10, interactive=False)

    submit_btn.click(
        fn=sequential_generation,
        inputs=input_text,
        outputs=[output_base, output_lora]
    )
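
# Note (editor): generator outputs are routed through Gradio's queue. Recent Gradio
# releases enable the queue by default; on older versions, enable it explicitly
# before launching:
# demo.queue()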

if __name__ == "__main__":
    demo.launch()