Spaces:

broadfield-dev
/

gemma_test

Sleeping

App Files Files Community

gemma_test / app.py

broadfield-dev

Update app.py

0cfbef9 verified 3 months ago

raw

history blame contribute delete

5.35 kB

	import gradio as gr
	from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
	from PIL import Image
	import requests
	import torch
	import io
	import os
	from huggingface_hub import login
	import tempfile
	from threading import Thread

	# Log in to Hugging Face for gated model access.
	# Only authenticate when HF_TOKEN is actually set: passing a placeholder
	# string to login() fails with an invalid-token error at startup, whereas
	# skipping the call lets any previously cached credentials be used.
	hf_token = os.getenv("HF_TOKEN")
	if hf_token:
	    login(token=hf_token)

	# Load the model once at import time, in bfloat16 with automatic device
	# placement, and switch it to eval mode. The matching processor is used
	# for chat templating and for decoding streamed text.
	model_id = "google/gemma-3-4b-it"
	model = Gemma3ForConditionalGeneration.from_pretrained(
	    model_id, device_map="auto", torch_dtype=torch.bfloat16
	).eval()
	processor = AutoProcessor.from_pretrained(model_id)

	# Seconds to wait when downloading an image from a user-supplied URL, so a
	# stalled server cannot block the request indefinitely.
	_IMAGE_FETCH_TIMEOUT = 30


	def _remove_quietly(path):
	    """Delete *path* if it is set; cleanup is best-effort and never raises."""
	    if not path:
	        return
	    try:
	        os.remove(path)
	    except OSError:
	        # Already gone or not removable: nothing useful to do during cleanup,
	        # and raising here would mask the error that triggered the cleanup.
	        pass


	def _save_temp_image(image):
	    """Save *image* to a new temporary ``.jpg`` file and return its path.

	    The caller owns the returned file and is responsible for deleting it.
	    If saving fails, the temporary file is removed before re-raising.
	    """
	    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
	        path = tmp.name
	    # Save after the handle is closed so the path can be reopened by name on
	    # every platform.
	    try:
	        image.save(path)
	    except Exception:
	        _remove_quietly(path)
	        raise
	    return path


	def process_input(chat_history, image_file, image_url, text_input):
	    """Stream the model's answer about an image into the chat history.

	    This is a generator: each yielded value is the full chat history (a list
	    of ``(user_message, bot_message)`` tuples) to display, with the last
	    entry growing as text is generated.

	    Args:
	        chat_history: Existing list of ``(user, bot)`` tuples; ``None`` is
	            treated as an empty history. The list is never mutated.
	        image_file: Path of an uploaded image, or ``None``.
	        image_url: URL of an image to download; used only when
	            ``image_file`` is ``None``.
	        text_input: The user's prompt.

	    Yields:
	        The updated chat history. Validation problems and runtime errors are
	        reported as a bot message starting with ``"Error:"`` rather than
	        being raised.
	    """
	    chat_history = list(chat_history or [])
	    image_path = None
	    try:
	        # Validate text input (blank / whitespace-only prompts are rejected)
	        if not text_input or not text_input.strip():
	            yield chat_history + [(None, "Error: Please enter a text prompt.")]
	            return

	        # Normalise the image to an RGB JPEG on disk, whichever source it
	        # came from, and hand its path to the chat template.
	        if image_file is not None:
	            with Image.open(image_file) as uploaded:
	                image_path = _save_temp_image(uploaded.convert("RGB"))
	        elif image_url:
	            # NOTE(review): the URL is user-controlled and fetched server-side;
	            # consider restricting allowed hosts if this runs on a private network.
	            response = requests.get(image_url, timeout=_IMAGE_FETCH_TIMEOUT)
	            response.raise_for_status()
	            with Image.open(io.BytesIO(response.content)) as downloaded:
	                image_path = _save_temp_image(downloaded.convert("RGB"))
	        else:
	            yield chat_history + [(text_input, "Error: Please provide an image file or a valid image URL.")]
	            return

	        # Prepare chat template: a fixed system prompt plus one user turn
	        # carrying the image and the prompt text.
	        messages = [
	            {
	                "role": "system",
	                "content": [{"type": "text", "text": "You are a helpful assistant capable of analyzing images."}],
	            },
	            {
	                "role": "user",
	                "content": [
	                    {"type": "image", "image": image_path},
	                    {"type": "text", "text": text_input},
	                ],
	            },
	        ]

	        # Process inputs with chat template
	        inputs = processor.apply_chat_template(
	            messages, add_generation_prompt=True, tokenize=True,
	            return_dict=True, return_tensors="pt"
	        ).to(model.device, dtype=torch.bfloat16)

	        # Sanity check: without pixel_values the image was not picked up.
	        if "pixel_values" not in inputs:
	            yield chat_history + [(text_input, "Error: Image data not processed correctly (missing pixel_values).")]
	            return

	        # Initialize streamer
	        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
	        generation_kwargs = {
	            "input_ids": inputs["input_ids"],
	            "attention_mask": inputs["attention_mask"],
	            "pixel_values": inputs["pixel_values"],
	            "max_new_tokens": 500,
	            "do_sample": True,
	            "streamer": streamer,
	        }

	        generation_errors = []

	        def _generate():
	            # Runs in the worker thread. If generation fails, record the
	            # error and close the stream; otherwise the consumer loop below
	            # would wait forever for text that never arrives.
	            try:
	                model.generate(**generation_kwargs)
	            except Exception as exc:
	                generation_errors.append(exc)
	                streamer.end()

	        # Show the user's message immediately with an empty reply.
	        bot_message = ""
	        new_history = chat_history + [(text_input, bot_message)]
	        yield new_history

	        # Run generation in a separate thread so text can be streamed here.
	        # No torch.inference_mode() wrapper: grad modes are thread-local, so
	        # wrapping Thread.start() in this thread never affected the worker.
	        thread = Thread(target=_generate)
	        thread.start()

	        for new_text in streamer:
	            bot_message += new_text
	            new_history[-1] = (text_input, bot_message)
	            yield new_history

	        thread.join()
	        if generation_errors:
	            # Surface worker failures through the common error path below.
	            raise generation_errors[0]

	    except Exception as e:
	        import traceback
	        yield chat_history + [(text_input, f"Error: {str(e)}\n{traceback.format_exc()}")]
	    finally:
	        # Clean up the temporary image on every exit path: success, early
	        # return, error, or the generator being closed by the caller.
	        _remove_quietly(image_path)

	# Create Gradio interface: image inputs on the left, chat on the right.
	with gr.Blocks() as demo:
	    gr.Markdown("# Multimodal Streaming Chat with Gemma-3-4b-it")
	    gr.Markdown("Upload an image or provide an image URL, then enter a text prompt (e.g., 'Describe this image in detail').")

	    with gr.Row():
	        with gr.Column():
	            # type="filepath" means process_input receives a path string.
	            image_file_input = gr.Image(label="Upload Image (Drag-and-Drop)", type="filepath")
	            image_url_input = gr.Textbox(label="Or Enter Image URL", placeholder="e.g., https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/bee.jpg")
	        with gr.Column():
	            chatbot = gr.Chatbot(label="Chat with Gemma", height=400)
	            # submit_btn=True gives the textbox its own submit button, so no
	            # separate gr.Button is needed.
	            text_input = gr.Textbox(
	                label="Enter your prompt here",
	                placeholder="e.g., Describe this image in detail",
	                lines=2,
	                submit_btn=True,
	            )

	    # process_input is a generator, so the chatbot updates as text streams in.
	    text_input.submit(
	        fn=process_input,
	        inputs=[chatbot, image_file_input, image_url_input, text_input],
	        outputs=chatbot,
	    )

	# Launch the app
	demo.launch()