from PIL import Image
import json
import gradio as gr
import requests
from transformers import CLIPProcessor, CLIPModel, pipeline, BlipProcessor, BlipForConditionalGeneration

model = CLIPModel.from_pretrained("model")
processor = CLIPProcessor.from_pretrained("tokenizer")
vqa_pipeline = pipeline("visual-question-answering", model="vqa")

space_type_labels = [
    "living room", "bedroom", "kitchen", "terrace", "closet", "bathroom",
    "dining room", "office", "garage", "garden", "balcony", "attic", "hallway",
    "gym", "playroom", "storage room", "studio", "is_exterior", "swimming pool", "others"
]

equipment_questions = [
    "Does the image show outdoor furniture?",
    "Does the image show a parasol?",
    "Does the image show a pergola?",
    "Does the image show a grill?",
    "Does the image show a heater?",
    "Does the image show outdoor lighting?",
    "Does the image show planters?",
    "Does the image show water features?",
    "Does the image show floor coverings?",
    "Does the image show decorative items?",
    "Does the image show entertainment equipment?",
    "Does the image show protective materials?"
]

weights = {
    "Does the image show outdoor furniture?": 0.15,
    "Does the image show a parasol?": 0.05,
    "Does the image show a pergola?": 0.1,
    "Does the image show a grill?": 0.15,
    "Does the image show a heater?": 0.1,
    "Does the image show outdoor lighting?": 0.1,
    "Does the image show planters?": 0.05,
    "Does the image show water features?": 0.1,
    "Does the image show floor coverings?": 0.05,
    "Does the image show decorative items?": 0.05,
    "Does the image show entertainment equipment?": 0.05,
    "Does the image show protective materials?": 0.05
}

luminosity_classes = [
    'A well-lit room with abundant natural light, showcasing windows or a balcony through which sunlight passes unobstructed.',
    'A room depicted in darkness, where there is minimal or no visible light source.',
    'A room illuminated by artificial light sources such as lamps or ceiling lights.'
]
# luminosity_classes = [
#     "A room filled with natural daylight.",
#     "A room lit by artificial lights.",
#     "A dark room with no lights."
# ]
luminosity_labels = ['natural_light', 'no_light', 'artificial_light']

# view_questions = [
#     "Is this a panoramic view?",
#     "Is this a city view?",
#     "Is this a view of greenery?",
#     "Is this a mountain view?",
#     "Is this a view of the sea?"
# ]
view_questions = [
    # "This is a panoramic view, showing a wide expanse of the surroundings.",
    "This is a city view, showing buildings, streets, and urban areas.",
    "This is a view of greenery, including trees, parks, or gardens.",
    "This is a mountain view, showing mountains and hilly landscapes.",
    "This is a view of the sea"
]
view_labels = ['city', 'greenery', 'mountain', 'sea']

certainty_classes = [
    'Windows, balconies, or terraces with an unobstructed outward view',
    'exterior view of a building or appearance of a house or apartment',
    'Artificial or fake view of any city or sea',
    'View obstructed by objects such as buildings, trees, or other structures',
    'Hallway or interior view with no outdoor visibility'
]
# certainty_classes = ['Windows, balconies, or terraces with an unobstructed outward view', 'Exterior view appearance of a house or apartment', 'unreal picture or fake of any city or sea view', 'view unfree from any obstructive objects such as buildings, trees, or other structures, and ideally seen through windows, balconies, or terraces', 'hallway']

render_classes = [
    "This is a realistic photo of an interior.",
    "This is a computer-generated render of an interior.",
    "This is a realistic photo of an exterior.",
    "This is a computer-generated render of an exterior."
]

threshold = 0


def calculate_equipment_score(image_results, weights):
    # Sum the weights of every equipment item the VQA model answered "yes" to.
    score = sum(weights[question] for question, present in image_results.items() if present)
    return score


def calculate_luminosity_score(processed_image):
    # Zero-shot CLIP classification over the luminosity prompts.
    inputs = processor(text=luminosity_classes, images=processed_image, return_tensors="pt", padding=True)
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)
    probabilities_list = probs.squeeze().tolist()
    luminosity_score = {class_name: probability for class_name, probability in zip(luminosity_labels, probabilities_list)}
    return luminosity_score


def calculate_space_type(processed_image):
    # Zero-shot CLIP classification over the room/space type labels.
    inputs = processor(text=space_type_labels, images=processed_image, return_tensors="pt", padding=True)
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)
    probabilities_list = probs.squeeze().tolist()
    space_type_score = {class_name: probability for class_name, probability in zip(space_type_labels, probabilities_list)}
    return space_type_score


def certainty(processed_image):
    # Zero-shot CLIP classification estimating whether the outward view is real and unobstructed.
    inputs = processor(text=certainty_classes, images=processed_image, return_tensors="pt", padding=True)
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)
    probabilities_list = probs.squeeze().tolist()
    is_fake_score = {class_name: probability for class_name, probability in zip(certainty_classes, probabilities_list)}
    return is_fake_score


def views(processed_image):
    # Zero-shot CLIP classification over the view-type prompts (city, greenery, mountain, sea).
    inputs = processor(text=view_questions, images=processed_image, return_tensors="pt", padding=True)
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)
    probabilities_list = probs.squeeze().tolist()
    views_score = {class_name: probability for class_name, probability in zip(view_labels, probabilities_list)}
    return views_score


def calculate_is_render(processed_image):
    # Zero-shot CLIP classification separating real photos from computer-generated renders.
    render_inputs = processor(text=render_classes, images=processed_image, return_tensors="pt", padding=True)
    render_outputs = model(**render_inputs)
    render_logits = render_outputs.logits_per_image
    render_probs = render_logits.softmax(dim=1)
    render_probabilities_list = render_probs.squeeze().tolist()
    render_score = {class_name: probability for class_name, probability in zip(render_classes, render_probabilities_list)}
    # Probability that the image is a real photograph (interior or exterior) rather than a render.
    is_render_prob = render_score["This is a realistic photo of an interior."] + render_score["This is a realistic photo of an exterior."]
    return is_render_prob


def generate_answer(image):
    processed_image = image
    image_data = {
        "image_context": None,
        "validation": None,
        "equipment_score": None,
        "luminosity_score": {"score": None},
        "view_type": {"views": None, "certainty_score": None}
    }

    # Room/space type classification.
    space_type_score = calculate_space_type(processed_image)
    max_space_type = max(space_type_score, key=space_type_score.get)
    if space_type_score[max_space_type] >= threshold:
        space_type = max_space_type.lower()
        if space_type == "patio":
            space_type = "terrace"
    image_data["image_context"] = space_type_score

    # Equipment detection via VQA, only for terraces.
    image_results = {}
    if max_space_type == "terrace":
        for question in equipment_questions:
            result = vqa_pipeline(processed_image, question, top_k=1)
            answer = result[0]['answer'].lower() == "yes"
            image_results[question] = answer
        equipment_score = calculate_equipment_score(image_results, weights)
        image_data["equipment_score"] = equipment_score

    # Window check; a confident "yes" gates the luminosity and view scoring below.
    result = vqa_pipeline(processed_image, "Is there a real window?", top_k=1)
    has_window = result[0]
    image_data["validation"] = "pass validation" if has_window['score'] > 0.9 else "No candidate"
    window_exists = has_window["answer"].lower() == "yes" and has_window["score"] > 0.9

    if max_space_type in ["bedroom", "living room", "kitchen"] and window_exists:
        luminosity_score = calculate_luminosity_score(processed_image)
        image_data["luminosity_score"]['score'] = luminosity_score['natural_light']
        view = views(processed_image)
        image_data["view_type"]["views"] = view
        certainty_score = certainty(processed_image)
        # Keep only the probability of the first class (unobstructed outward view).
        certainty_score = list(certainty_score.values())[0]
        image_data["view_type"]["certainty_score"] = certainty_score

    # is_render = calculate_is_render(processed_image)
    # image_data["is_render"] = is_render

    return json.dumps(image_data, indent=4)


image_input = gr.Image(type="pil", label="Upload Image")
iface = gr.Interface(
    fn=generate_answer,
    inputs=[image_input],
    outputs="text",
    title="Vision intelligence",
    description="Upload an image"
)
iface.launch()
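
# Minimal usage sketch (not part of the app): how generate_answer() could be
# exercised directly for local debugging, without the Gradio UI. The file name
# "sample_room.jpg" is only an assumption; any PIL-readable image works. It is
# left commented out because iface.launch() above blocks until the server stops.
#
#     img = Image.open("sample_room.jpg").convert("RGB")
#     print(generate_answer(img))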