sagar007 committed
Commit 8d741e2 · verified · 1 Parent(s): 0612b41

Update app.py

Files changed (1):
  1. app.py +138 -117
app.py CHANGED
@@ -1,134 +1,155 @@
- import os
  import gradio as gr
  import torch
- from peft import LoraConfig, get_peft_model
- import torch.nn as nn
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from peft import PeftModel, PeftConfig
-
  from PIL import Image
- import clip
- import spaces
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

- class MultimodalPhi(nn.Module):
-     def __init__(self, phi_model):
-         super().__init__()
-         self.phi_model = phi_model
-         self.embedding_projection = nn.Linear(512, phi_model.config.hidden_size)
-
-     def forward(self, image_embeddings, input_ids, attention_mask):
-         projected_embeddings = self.embedding_projection(image_embeddings).unsqueeze(1)
-         inputs_embeds = self.phi_model.get_input_embeddings()(input_ids)
-         combined_embeds = torch.cat([projected_embeddings, inputs_embeds], dim=1)
-
-         extended_attention_mask = torch.cat([torch.ones(attention_mask.shape[0], 1).to(attention_mask.device), attention_mask], dim=1)
-
-         outputs = self.phi_model(inputs_embeds=combined_embeds, attention_mask=extended_attention_mask)
-         return outputs.logits[:, 1:, :]  # Exclude the image token from output
-
- def load_models():
-     try:
-         print("Loading models...")
-         peft_model_name = "sagar007/phi-1_5-finetuned"
-
-         # Manually load and create LoraConfig, ignoring unknown arguments
-         config_dict = LoraConfig.from_pretrained(peft_model_name).to_dict()
-         # Remove 'layer_replication' if present
-         config_dict.pop('layer_replication', None)
-         lora_config = LoraConfig(**config_dict)
-         print("PEFT config loaded")
-
-         base_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
-         print("Base model loaded")
-
-         phi_model = get_peft_model(base_model, lora_config)
-         phi_model.load_state_dict(torch.load(peft_model_name + '/adapter_model.bin', map_location=device), strict=False)
-         print("PEFT model loaded")
-
-         multimodal_model = MultimodalPhi(phi_model)
-         multimodal_model.load_state_dict(torch.load('multimodal_phi_small_gpu.pth', map_location=device))
-         multimodal_model.to(device)
-         multimodal_model.eval()
-         print("Multimodal model loaded")
-
-         tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
-         tokenizer.pad_token = tokenizer.eos_token
-         print("Tokenizer loaded")
-
-         audio_model = whisper.load_model("base").to(device)
-         print("Audio model loaded")
-
-         clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
-         print("CLIP model loaded")
-
-         return multimodal_model, tokenizer, audio_model, clip_model, clip_preprocess
-     except Exception as e:
-         print(f"Error in load_models: {str(e)}")
-         raise
-
- model, tokenizer, audio_model, clip_model, clip_preprocess = load_models()

- @spaces.GPU
- def get_clip_embedding(image):
-     image = clip_preprocess(Image.open(image)).unsqueeze(0).to(device)
-     with torch.no_grad():
-         image_features = clip_model.encode_image(image)
-     return image_features.squeeze(0)

- @spaces.GPU
- def process_text(text):
      try:
-         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding='max_length').to(device)
-         dummy_image_embedding = torch.zeros(512).to(device)  # Dummy image embedding for text-only input
          with torch.no_grad():
-             outputs = model(dummy_image_embedding.unsqueeze(0), inputs.input_ids, inputs.attention_mask)
-         return tokenizer.decode(outputs[0].argmax(dim=-1), skip_special_tokens=True)
      except Exception as e:
-         return f"Error in process_text: {str(e)}"

- @spaces.GPU
- def process_image(image):
      try:
-         clip_embedding = get_clip_embedding(image)
-         prompt = "Describe this image:"
-         inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=128, padding='max_length').to(device)
          with torch.no_grad():
-             outputs = model(clip_embedding.unsqueeze(0), inputs.input_ids, inputs.attention_mask)
-         return tokenizer.decode(outputs[0].argmax(dim=-1), skip_special_tokens=True)
-     except Exception as e:
-         return f"Error in process_image: {str(e)}"
-
- @spaces.GPU
- def process_audio(audio):
-     try:
-         result = audio_model.transcribe(audio)
-         transcription = result["text"]
-         return process_text(f"Transcription: {transcription}\nPlease respond to this:")
      except Exception as e:
-         return f"Error in process_audio: {str(e)}"

- def chat(message, image, audio):
-     if audio is not None:
-         return process_audio(audio)
-     elif image is not None:
-         return process_image(image)
-     else:
-         return process_text(message)
-
- iface = gr.Interface(
-     fn=chat,
-     inputs=[
-         gr.Textbox(placeholder="Enter text here..."),
-         gr.Image(type="pil"),
-         gr.Audio(type="filepath")
-     ],
-     outputs="text",
-     title="Multi-Modal Assistant",
-     description="Chat with an AI using text, images, or audio!"
- )

  if __name__ == "__main__":
-     print("Starting Gradio interface...")
-     iface.launch(share=True)
  import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
  import torch
  from PIL import Image
+ import os

+ # Check if CUDA is available, otherwise use CPU
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using device: {device}")
+
+ # Load model and tokenizer with optimizations for CPU deployment
+ def load_model():
+     print("Loading model and tokenizer...")
+     model = AutoModelForCausalLM.from_pretrained(
+         "sagar007/Lava_phi",
+         torch_dtype=torch.float32 if device == "cpu" else torch.bfloat16,
+         low_cpu_mem_usage=True,
+     )
+     model = model.to(device)

+     tokenizer = AutoTokenizer.from_pretrained("sagar007/Lava_phi")
+     processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
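+     # The CLIP processor resizes and normalizes uploaded images to the
+     # 224x224 pixel format that the ViT-B/32 vision encoder expects.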
+
+     print("Model and tokenizer loaded successfully!")
+     return model, tokenizer, processor

+ # Load models
+ model, tokenizer, processor = load_model()
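+ # Loading once at import time means every Gradio request reuses the same
+ # in-memory model rather than reloading it per call.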

+ # For text-only generation
+ def generate_text(prompt, max_length=128):
      try:
+         inputs = tokenizer(f"human: {prompt}\ngpt:", return_tensors="pt").to(device)
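+         # "human: ...\ngpt:" appears to be the conversation template the
+         # checkpoint was fine-tuned on; generation continues after "gpt:".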
+
+         # Generate with low memory footprint settings
          with torch.no_grad():
+             outputs = model.generate(
+                 **inputs,
+                 max_new_tokens=max_length,
+                 do_sample=True,
+                 temperature=0.7,
+                 top_p=0.9,
+             )
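+         # do_sample with temperature/top_p trades determinism for variety;
+         # max_new_tokens bounds both response length and memory use.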
+
+         generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+         # Extract only the model's response
+         if "gpt:" in generated_text:
+             generated_text = generated_text.split("gpt:", 1)[1].strip()
+
+         return generated_text
      except Exception as e:
+         return f"Error generating text: {str(e)}"

+ # For image and text processing
+ def process_image_and_prompt(image, prompt, max_length=128):
      try:
+         if image is None:
+             return "No image provided. Please upload an image."
+
+         # Process image
+         image_tensor = processor(images=image, return_tensors="pt").pixel_values.to(device)
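+         # pixel_values is a [1, 3, 224, 224] float tensor for this checkpoint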
+
+         # Tokenize input with image token
+         inputs = tokenizer(f"human: <image>\n{prompt}\ngpt:", return_tensors="pt").to(device)
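+         # <image> presumably marks where the model splices in the projected
+         # CLIP features supplied via the images= argument below.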
+
+         # Generate with memory optimizations
          with torch.no_grad():
+             outputs = model.generate(
+                 input_ids=inputs["input_ids"],
+                 attention_mask=inputs["attention_mask"],
+                 images=image_tensor,
+                 max_new_tokens=max_length,
+                 do_sample=True,
+                 temperature=0.7,
+                 top_p=0.9,
+             )
+
+         generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+         # Extract only the model's response
+         if "gpt:" in generated_text:
+             generated_text = generated_text.split("gpt:", 1)[1].strip()
+
+         return generated_text
      except Exception as e:
+         return f"Error processing image: {str(e)}"

+ # Create Gradio Interface
+ with gr.Blocks(title="LLaVA-Phi: Vision-Language Model") as demo:
+     gr.Markdown("# LLaVA-Phi: Vision-Language Model")
+     gr.Markdown("This model can generate text responses from text prompts or analyze images with text prompts.")
+
+     with gr.Tab("Text Generation"):
+         with gr.Row():
+             with gr.Column():
+                 text_input = gr.Textbox(label="Enter your prompt", lines=3, placeholder="What is artificial intelligence?")
+                 text_max_length = gr.Slider(minimum=16, maximum=512, value=128, step=8, label="Maximum response length")
+                 text_button = gr.Button("Generate")
+
+             text_output = gr.Textbox(label="Generated response", lines=8)
+
+         text_button.click(
+             fn=generate_text,
+             inputs=[text_input, text_max_length],
+             outputs=text_output
+         )
+
+     with gr.Tab("Image + Text Analysis"):
+         with gr.Row():
+             with gr.Column():
+                 image_input = gr.Image(type="pil", label="Upload an image")
+                 image_text_input = gr.Textbox(label="Enter your prompt about the image",
+                                               lines=2,
+                                               placeholder="Describe this image in detail.")
+                 image_max_length = gr.Slider(minimum=16, maximum=512, value=128, step=8, label="Maximum response length")
+                 image_button = gr.Button("Analyze")
+
+             image_output = gr.Textbox(label="Model response", lines=8)
+
+         image_button.click(
+             fn=process_image_and_prompt,
+             inputs=[image_input, image_text_input, image_max_length],
+             outputs=image_output
+         )
+
+     # Example inputs for each tab
+     gr.Examples(
+         examples=["What is the advantage of vision-language models?",
+                   "Explain how multimodal AI models work.",
+                   "Tell me a short story about robots."],
+         inputs=text_input
+     )
+
+     # Add examples for image tab if you have example images
+     # gr.Examples(
+     #     examples=[["example1.jpg", "What's in this image?"]],
+     #     inputs=[image_input, image_text_input]
+     # )

+ # Launch the app with memory optimizations
  if __name__ == "__main__":
+     # Memory cleanup before launch
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+
+     # Set low CPU thread usage to reduce memory
+     os.environ["OMP_NUM_THREADS"] = "4"
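+     # NOTE: OMP_NUM_THREADS is usually read when torch first initializes, so
+     # setting it this late may be a no-op; torch.set_num_threads(4) is the
+     # reliable runtime equivalent.
+     torch.set_num_threads(4)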
+
+     # Launch with minimal resource usage; queue() replaces the enable_queue
+     # launch argument, which newer Gradio releases no longer accept
+     demo.queue()
+     demo.launch(
+         share=True,  # Set to False in production
+         max_threads=4,
+         show_error=True
+     )