import json
import os
from unittest.mock import patch

import torch
from clip_interrogator import Config, Interrogator
from transformers import AutoModelForCausalLM, AutoProcessor
from transformers.dynamic_module_utils import get_imports

from modules.util import TimeIt
from shared import path_manager, settings


def brainblip_look(image, prompt, gr):
    # Lazy import: BLIP weights are only pulled in when this interrogator runs.
    from transformers import BlipForConditionalGeneration

    gr.Info("BrainBlip is creating your prompt")
    print("Loading BrainBlip.")
    processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("braintacles/brainblip").to("cpu")

    print("Processing...")
    inputs = processor(image, return_tensors="pt").to("cpu")
    out = model.generate(
        **inputs,
        min_length=40,
        max_new_tokens=150,
        num_beams=5,
        repetition_penalty=1.40,
    )
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption


def clip_look(image, prompt, gr):
    # Unload models, if needed?
    # state["pipeline"] = None
    gr.Info("CLIP is creating your prompt")
    conf = Config(
        device=torch.device("cuda"),
        clip_model_name="ViT-L-14/openai",
        cache_path=path_manager.model_paths["clip_path"],
    )
    conf.apply_low_vram_defaults()
    return Interrogator(conf).interrogate(image)


def florence_look(image, prompt, gr):
    def fixed_get_imports(filename: str | os.PathLike) -> list[str]:
        """Workaround for https://huggingface.co/microsoft/Florence-2-large-ft/discussions/4"""
        if os.path.basename(filename) != "modeling_florence2.py":
            return get_imports(filename)
        imports = get_imports(filename)
        try:
            imports.remove("flash_attn")
        except ValueError:
            pass
        return imports

    print("Looking...")
    image = image.convert("RGB")
    # state["pipeline"] = None
    gr.Info("Florence is creating your prompt")
    with TimeIt(""):
        device = "cpu"
        torch_dtype = torch.float32
        # Florence-2 selects its output mode via a task token.
        task = "<MORE_DETAILED_CAPTION>"

        with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
            model = AutoModelForCausalLM.from_pretrained(
                "microsoft/Florence-2-large",
                torch_dtype=torch_dtype,
                trust_remote_code=True,
            ).to(device)
        processor = AutoProcessor.from_pretrained(
            "microsoft/Florence-2-large", trust_remote_code=True
        )

        inputs = processor(text=task, images=image, return_tensors="pt").to(device, torch_dtype)

        print("Judging...")
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=2048,
            num_beams=6,
            do_sample=False,
        )
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        result = processor.post_process_generation(
            generated_text, task=task, image_size=(image.width, image.height)
        )
        text = result[task]
    return text


looks = {
    "brainblip": brainblip_look,
    "clip": clip_look,
    "florence": florence_look,
}


def look(image, prompt, gr):
    # Strip whitespace and trailing colons once, and reuse the same key for
    # both the membership test and the lookup (the two previously disagreed,
    # so a prompt like "clip:" passed the check but raised KeyError).
    key = prompt.strip(" :")
    if key in looks:
        # The prompt names an interrogator, e.g. "florence" or "clip:".
        text = looks[key](image, prompt, gr)
    elif prompt != "":
        # A real prompt was supplied; pass it through unchanged.
        return prompt
    else:
        try:
            # Prefer generation parameters embedded in the image metadata.
            params = image.info.get("parameters", "")
            text = json.dumps(json.loads(params))
        except (AttributeError, TypeError, json.JSONDecodeError):
            # No usable metadata; fall back to the default interrogator.
            interrogator = settings.default_settings.get("interrogator", "florence")
            text = looks[interrogator](image, prompt, gr)
    return text
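

# --- Usage sketch (illustrative, not part of the module) ---
# A minimal example of how look() dispatches, assuming a Gradio-like object
# that exposes Info() for status messages. The _FakeGr stub and the
# "example.png" path below are hypothetical placeholders, not part of the
# real application.
if __name__ == "__main__":
    from PIL import Image

    class _FakeGr:
        @staticmethod
        def Info(msg):
            print(f"[info] {msg}")

    img = Image.open("example.png")  # placeholder path; any PIL-readable image

    # Empty prompt: look() first tries the "parameters" metadata embedded in
    # the image, then falls back to the configured default interrogator.
    print(look(img, "", _FakeGr))

    # A prompt naming an interrogator (e.g. "florence") runs that model directly.
    print(look(img, "florence", _FakeGr))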