Yollm committed
Commit f08d17a · 0 Parent(s)

Initial commit
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,14 @@
1
+ ---
2
+ title: IEAP
3
+ emoji: 👀
4
+ colorFrom: gray
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 5.32.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ short_description: A demo for IEAP
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,112 @@
1
+ import gradio as gr
2
+ from PIL import Image
3
+ from utils import encode_image_to_datauri, cot_with_gpt, extract_instructions, infer_with_DiT, roi_localization, fusion
4
+ import openai
5
+ import os
6
+ import uuid
7
+ from src.flux.generate import generate, seed_everything
8
+
9
+
10
+ def process_image(api_key, seed, image, prompt):
11
+ if not api_key:
12
+ raise gr.Error("❌ Please enter a valid OpenAI API key.")
13
+
14
+ openai.api_key = api_key
15
+
16
+ # Generate a unique image ID to avoid file name conflict
17
+ image_id = str(uuid.uuid4())
18
+ seed_everything(seed)
19
+ input_path = f"input_{image_id}.png"
20
+ image.save(input_path)
21
+
22
+ try:
23
+ uri = encode_image_to_datauri(input_path)
24
+ categories, instructions = cot_with_gpt(uri, prompt)
25
+ # categories = ['Tone Transfer', 'Style Change']
26
+ # instructions = ['Change the time to night', 'Change the style to watercolor']
27
+
28
+ if not categories or not instructions:
29
+ raise gr.Error("No editing steps returned by GPT. Try a more specific instruction.")
30
+
31
+ intermediate_images = []
32
+ current_image_path = input_path
33
+
34
+ for i, (category, instruction) in enumerate(zip(categories, instructions)):
35
+ print(f"[Step {i}] Category: {category} | Instruction: {instruction}")
36
+ step_prefix = f"{image_id}_{i}"
37
+
38
+ if category in ('Add', 'Remove', 'Replace'):
39
+ if category == 'Add':
40
+ edited_image = infer_with_DiT('RoI Editing', current_image_path, instruction, category)
41
+ else:
42
+ mask_image = roi_localization(current_image_path, instruction, category)
43
+ edited_image = infer_with_DiT('RoI Inpainting', mask_image, instruction, category)
44
+
45
+ elif category == 'Action Change':
46
+ mask_image = roi_localization(current_image_path, instruction, category)
47
+ inpainted = infer_with_DiT('RoI Inpainting', mask_image, instruction, 'Remove')
48
+ changed_instance, x0, y1, scale = infer_with_DiT('RoI Editing', current_image_path, instruction, category)
49
+ fusion_image = fusion(inpainted, changed_instance, x0, y1, scale)
50
+ edited_image = infer_with_DiT('RoI Compositioning', fusion_image, instruction, None)
51
+
52
+ elif category in ('Move', 'Resize'):
53
+ mask_image, changed_instance, x0, y1, scale = roi_localization(current_image_path, instruction, category)
54
+ inpainted = infer_with_DiT('RoI Inpainting', mask_image, instruction, 'Remove')
55
+ fusion_image = fusion(inpainted, changed_instance, x0, y1, scale)
56
+ edited_image = infer_with_DiT('RoI Compositioning', fusion_image, instruction, None)
57
+
58
+ elif category in ('Appearance Change', 'Background Change', 'Color Change', 'Material Change', 'Expression Change'):
59
+ edited_image = infer_with_DiT('RoI Editing', current_image_path, instruction, category)
60
+
61
+ elif category in ('Tone Transfer', 'Style Change'):
62
+ edited_image = infer_with_DiT('Global Transformation', current_image_path, instruction, category)
63
+
64
+ else:
65
+ raise gr.Error(f"Invalid category returned: '{category}'")
66
+
67
+ current_image_path = f"{step_prefix}.png"
68
+ edited_image.save(current_image_path)
69
+ intermediate_images.append(edited_image.copy())
70
+
71
+ final_result = intermediate_images[-1] if intermediate_images else image
72
+ return intermediate_images, final_result
73
+
74
+ except Exception as e:
75
+ raise gr.Error(f"Processing failed: {str(e)}")
76
+
77
+
78
+ # Gradio UI
79
+ with gr.Blocks() as demo:
80
+ gr.Markdown("## 🖼️ IEAP: Image Editing As Programs")
81
+
82
+ with gr.Row():
83
+ api_key_input = gr.Textbox(label="🔑 OpenAI API Key", type="password", placeholder="sk-...")
84
+
85
+ with gr.Row():
86
+ seed_slider = gr.Slider(
87
+ label="🎲 Random Seed",
88
+ minimum=0,
89
+ maximum=1000000,
90
+ value=3407,
91
+ step=1,
92
+ info="Drag to set the random seed for reproducibility"
93
+ )
94
+
95
+ with gr.Row():
96
+ with gr.Column():
97
+ image_input = gr.Image(type="pil", label="Upload Image")
98
+ prompt_input = gr.Textbox(label="Instruction", placeholder="e.g., Move the dog to the left and change its color to blue")
99
+ submit_button = gr.Button("Submit")
100
+ with gr.Column():
101
+ result_gallery = gr.Gallery(label="Intermediate Steps", columns=2, height="auto")
102
+ final_output = gr.Image(label="✅ Final Result")
103
+
104
+ submit_button.click(
105
+ fn=process_image,
106
+ inputs=[api_key_input, seed_slider, image_input, prompt_input],
107
+ outputs=[result_gallery, final_output]
108
+ )
109
+
110
+ if __name__ == "__main__":
111
+ demo.launch(
112
+ )
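The Gradio wiring above is the standard Blocks pattern: the click handler receives the four inputs and returns the gallery of intermediate steps plus the final image. A minimal, self-contained sketch of the same pattern with a hypothetical stand-in editor (fake_process just echoes the input; no GPT or DiT calls are made):

import gradio as gr
from PIL import Image

def fake_process(api_key: str, seed: int, image: Image.Image, prompt: str):
    # Stand-in for process_image: pretend each "step" is the unchanged input.
    if not api_key:
        raise gr.Error("Please enter an API key.")
    steps = [image, image]           # intermediate results
    return steps, steps[-1]          # (gallery, final image)

with gr.Blocks() as demo:
    api_key = gr.Textbox(label="API Key", type="password")
    seed = gr.Slider(0, 1_000_000, value=3407, step=1, label="Seed")
    image = gr.Image(type="pil", label="Upload Image")
    prompt = gr.Textbox(label="Instruction")
    run = gr.Button("Submit")
    gallery = gr.Gallery(label="Intermediate Steps")
    final = gr.Image(label="Final Result")
    run.click(fake_process, inputs=[api_key, seed, image, prompt], outputs=[gallery, final])

if __name__ == "__main__":
    demo.launch()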
instructions.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "categories": ["Move", "Resize"],
3
+ "instructions": ["Move the woman to the right", "Minify the woman"]
4
+ }
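The file holds two parallel arrays that main_json.py (below) zips into editing steps, so they must have the same length. A small sketch that writes a plan in this schema (file name and steps are illustrative):

import json

plan = {
    "categories": ["Move", "Resize"],
    "instructions": ["Move the woman to the right", "Minify the woman"],
}
# main_json.py requires the two arrays to be non-empty and aligned one-to-one.
assert len(plan["categories"]) == len(plan["instructions"])

with open("instructions.json", "w") as f:
    json.dump(plan, f, indent=2)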
main.py ADDED
@@ -0,0 +1,95 @@
1
+ import os
2
+ import argparse
3
+ from PIL import Image
4
+ import openai
5
+ from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
6
+ from utils import encode_image_to_datauri, cot_with_gpt, extract_instructions, infer_with_DiT, roi_localization, fusion
7
+ from src.flux.generate import generate, seed_everything
8
+
9
+ def main():
10
+ parser = argparse.ArgumentParser(description="Evaluate single image + instruction using GPT-4o")
11
+ parser.add_argument("image_path", help="Path to input image")
12
+ parser.add_argument("prompt", help="Original instruction")
13
+ parser.add_argument("--seed", type=int, default=3407, help="Random seed for reproducibility")
14
+ args = parser.parse_args()
15
+
16
+ seed_everything(args.seed)
17
+
18
+ openai.api_key = "YOUR_API_KEY"
19
+
20
+ if not openai.api_key:
21
+ raise ValueError("OPENAI_API_KEY environment variable not set.")
22
+
23
+ os.makedirs("results", exist_ok=True)
24
+
25
+
26
+ ###########################################
27
+ ### CoT -> instructions ###
28
+ ###########################################
29
+
30
+ uri = encode_image_to_datauri(args.image_path)
31
+ categories, instructions = cot_with_gpt(uri, args.prompt)
32
+ print(categories)
33
+ print(instructions)
34
+
35
+ # categories = ['Move', 'Resize']
36
+ # instructions = ['Move the woman to the right', 'Minify the woman']
37
+
38
+ ###########################################
39
+ ### Neural Program Interpreter ###
40
+ ###########################################
41
+ for i in range(len(categories)):
42
+ if i == 0:
43
+ image = args.image_path
44
+ else:
45
+ image = f"results/{i-1}.png"
46
+ category = categories[i]
47
+ instruction = instructions[i]
48
+ if category in ('Add', 'Remove', 'Replace', 'Action Change', 'Move', 'Resize'):
49
+ if category in ('Add', 'Remove', 'Replace'):
50
+ if category == 'Add':
51
+ edited_image = infer_with_DiT('RoI Editing', image, instruction, category)
52
+ else:
53
+ ### RoI Localization
54
+ mask_image = roi_localization(image, instruction, category)
55
+ # mask_image.save("mask.png")
56
+ ### RoI Inpainting
57
+ edited_image = infer_with_DiT('RoI Inpainting', mask_image, instruction, category)
58
+ elif category == 'Action Change':
59
+ ### RoI Localization
60
+ mask_image = roi_localization(image, instruction, category)
61
+ ### RoI Inpainting
62
+ edited_image = infer_with_DiT('RoI Inpainting', mask_image, instruction, 'Remove') # inpainted bg
63
+ ### RoI Editing
64
+ changed_instance, x0, y1, scale = infer_with_DiT('RoI Editing', image, instruction, category) # action change
65
+ fusion_image = fusion(edited_image, changed_instance, x0, y1, scale)
66
+ ### RoI Compositioning
67
+ edited_image = infer_with_DiT('RoI Compositioning', fusion_image, instruction, None)
68
+ elif category in ('Move', 'Resize'):
69
+ ### RoI Localization
70
+ mask_image, changed_instance, x0, y1, scale = roi_localization(image, instruction, category)
71
+ ### RoI Inpainting
72
+ edited_image = infer_with_DiT('RoI Inpainting', mask_image, instruction, 'Remove') # inpainted bg
73
+ # changed_instance, bottom_left, scale = layout_change(image, instruction) # move/resize
74
+ fusion_image = fusion(edited_image, changed_instance, x0, y1, scale)
75
+ fusion_image.save("fusion.png")
76
+ ### RoI Compositioning
77
+ edited_image = infer_with_DiT('RoI Compositioning', fusion_image, instruction, None)
78
+
79
+ elif category in ('Appearance Change', 'Background Change', 'Color Change', 'Material Change', 'Expression Change'):
80
+ ### RoI Editing
81
+ edited_image = infer_with_DiT('RoI Editing', image, instruction, category)
82
+
83
+ elif category in ('Tone Transfer', 'Style Change'):
84
+ ### Global Transformation
85
+ edited_image = infer_with_DiT('Global Transformation', image, instruction, category)
86
+
87
+ else:
88
+ raise ValueError(f"Invalid category: '{category}'")
89
+
90
+ image = edited_image
91
+ image.save(f"results/{i}.png")
92
+
93
+
94
+ if __name__ == "__main__":
95
+ main()
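The interpreter loop above routes each (category, instruction) pair to a fixed sequence of atomic operations. A compact sketch of that routing as a dispatch table (CATEGORY_PROGRAMS and plan are illustrative names, not part of the repo; operation names are taken from the branches above, with "Fusion" standing for the fusion() helper):

# Category -> ordered list of atomic operations, as dispatched above.
CATEGORY_PROGRAMS = {
    "Add":               ["RoI Editing"],
    "Remove":            ["RoI Localization", "RoI Inpainting"],
    "Replace":           ["RoI Localization", "RoI Inpainting"],
    "Action Change":     ["RoI Localization", "RoI Inpainting", "RoI Editing", "Fusion", "RoI Compositioning"],
    "Move":              ["RoI Localization", "RoI Inpainting", "Fusion", "RoI Compositioning"],
    "Resize":            ["RoI Localization", "RoI Inpainting", "Fusion", "RoI Compositioning"],
    "Appearance Change": ["RoI Editing"],
    "Background Change": ["RoI Editing"],
    "Color Change":      ["RoI Editing"],
    "Material Change":   ["RoI Editing"],
    "Expression Change": ["RoI Editing"],
    "Tone Transfer":     ["Global Transformation"],
    "Style Change":      ["Global Transformation"],
}

def plan(categories, instructions):
    """Yield (instruction, operation) pairs in execution order."""
    for category, instruction in zip(categories, instructions):
        if category not in CATEGORY_PROGRAMS:
            raise ValueError(f"Invalid category: '{category}'")
        for op in CATEGORY_PROGRAMS[category]:
            yield instruction, op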
main_json.py ADDED
@@ -0,0 +1,92 @@
1
+ import os
2
+ import argparse
3
+ import json
4
+ from PIL import Image
5
+ import openai
6
+ from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
7
+ from utils import encode_image_to_datauri, cot_with_gpt, extract_instructions, infer_with_DiT, roi_localization, fusion
8
+
9
+
10
+ def main():
11
+ parser = argparse.ArgumentParser(description="Evaluate single image + instruction using GPT-4o")
12
+ parser.add_argument("image_path", help="Path to input image")
13
+ parser.add_argument("json_path", help="Path to JSON file containing categories and instructions")
14
+ args = parser.parse_args()
15
+
16
+ openai.api_key = "YOUR_API_KEY"
17
+
18
+ if not openai.api_key:
19
+ raise ValueError("OPENAI_API_KEY environment variable not set.")
20
+
21
+ os.makedirs("results", exist_ok=True)
22
+
23
+
24
+ #######################################################
25
+ ### Load instructions from JSON ###
26
+ #######################################################
27
+ try:
28
+ with open(args.json_path, 'r') as f:
29
+ data = json.load(f)
30
+ categories = data.get('categories', [])
31
+ instructions = data.get('instructions', [])
32
+
33
+ if not categories or not instructions:
34
+ raise ValueError("JSON file must contain 'categories' and 'instructions' arrays.")
35
+
36
+ if len(categories) != len(instructions):
37
+ raise ValueError("Length of 'categories' and 'instructions' must match.")
38
+
39
+ print("Loaded instructions from JSON:")
40
+ for i, (cat, instr) in enumerate(zip(categories, instructions)):
41
+ print(f"Step {i+1}: [{cat}] {instr}")
42
+
43
+ except Exception as e:
44
+ raise ValueError(f"Failed to load JSON file: {str(e)}")
45
+
46
+ ###################################################
47
+ ### Neural Program Interpreter ###
48
+ ###################################################
49
+ for i in range(len(categories)):
50
+ if i == 0:
51
+ image = args.image_path
52
+ else:
53
+ image = f"results/{i-1}.png"
54
+ category = categories[i]
55
+ instruction = instructions[i]
56
+
57
+ if category in ('Add', 'Remove', 'Replace', 'Action Change', 'Move', 'Resize'):
58
+ if category in ('Add', 'Remove', 'Replace'):
59
+ if category == 'Add':
60
+ edited_image = infer_with_DiT('RoI Editing', image, instruction, category)
61
+ else:
62
+ mask_image = roi_localization(image, instruction, category)
63
+ edited_image = infer_with_DiT('RoI Inpainting', mask_image, instruction, category)
64
+ elif category == 'Action Change':
65
+ mask_image = roi_localization(image, instruction, category)
66
+ edited_image = infer_with_DiT('RoI Inpainting', mask_image, instruction, 'Remove')
67
+ changed_instance, x0, y1, scale = infer_with_DiT('RoI Editing', image, instruction, category)
68
+ fusion_image = fusion(edited_image, changed_instance, x0, y1, scale)
69
+ edited_image = infer_with_DiT('RoI Compositioning', fusion_image, instruction, None)
70
+ elif category in ('Move', 'Resize'):
71
+ mask_image, changed_instance, x0, y1, scale = roi_localization(image, instruction, category)
72
+ edited_image = infer_with_DiT('RoI Inpainting', mask_image, instruction, 'Remove')
73
+ fusion_image = fusion(edited_image, changed_instance, x0, y1, scale)
74
+ fusion_image.save("fusion.png")
75
+ edited_image = infer_with_DiT('RoI Compositioning', fusion_image, instruction, None)
76
+
77
+ elif category in ('Appearance Change', 'Background Change', 'Color Change', 'Material Change', 'Expression Change'):
78
+ edited_image = infer_with_DiT('RoI Editing', image, instruction, category)
79
+
80
+ elif category in ('Tone Transfer', 'Style Change'):
81
+ edited_image = infer_with_DiT('Global Transformation', image, instruction, category)
82
+
83
+ else:
84
+ raise ValueError(f"Invalid category: '{category}'")
85
+
86
+ image = edited_image
87
+ image.save(f"results/{i}.png")
88
+ print(f"Step {i+1} completed: {category} - {instruction}")
89
+
90
+
91
+ if __name__ == "__main__":
92
+ main()
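A hedged usage sketch of the two CLI entry points, driven from Python via subprocess (input.png and instructions.json are placeholder paths; both scripts write their per-step outputs to results/<i>.png):

import subprocess

# Plan the edit steps with GPT at run time:
subprocess.run(["python", "main.py", "input.png",
                "Move the woman to the right and minify her", "--seed", "3407"], check=True)

# Or replay a pre-written plan from JSON (see instructions.json above):
subprocess.run(["python", "main_json.py", "input.png", "instructions.json"], check=True)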
requirements.txt ADDED
@@ -0,0 +1,15 @@
1
+ diffusers==0.32.0
2
+ transformers==4.42.3
3
+ xtuner[deepspeed]==0.1.23
4
+ timm==1.0.9
5
+ mmdet==3.3.0
6
+ hydra-core==1.3.2
7
+ ninja==1.11.1
8
+ decord==0.6.0
9
+ peft==0.11.1
10
+ protobuf==5.29.4
11
+ sentencepiece==0.2.0
12
+ tornado==6.4.2
13
+ openai==0.28.0
14
+ gradio==5.32.0
15
+ opencv-python
src/flux/block.py ADDED
@@ -0,0 +1,339 @@
1
+ import torch
2
+ from typing import List, Union, Optional, Dict, Any, Callable
3
+ from diffusers.models.attention_processor import Attention, F
4
+ from .lora_controller import enable_lora
5
+
6
+
7
+ def attn_forward(
8
+ attn: Attention,
9
+ hidden_states: torch.FloatTensor,
10
+ encoder_hidden_states: torch.FloatTensor = None,
11
+ condition_latents: torch.FloatTensor = None,
12
+ attention_mask: Optional[torch.FloatTensor] = None,
13
+ image_rotary_emb: Optional[torch.Tensor] = None,
14
+ cond_rotary_emb: Optional[torch.Tensor] = None,
15
+ model_config: Optional[Dict[str, Any]] = {},
16
+ ) -> torch.FloatTensor:
17
+ batch_size, _, _ = (
18
+ hidden_states.shape
19
+ if encoder_hidden_states is None
20
+ else encoder_hidden_states.shape
21
+ )
22
+
23
+ with enable_lora(
24
+ (attn.to_q, attn.to_k, attn.to_v), model_config.get("latent_lora", False)
25
+ ):
26
+ # `sample` projections.
27
+ query = attn.to_q(hidden_states)
28
+ key = attn.to_k(hidden_states)
29
+ value = attn.to_v(hidden_states)
30
+
31
+ inner_dim = key.shape[-1]
32
+ head_dim = inner_dim // attn.heads
33
+
34
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
35
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
36
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
37
+
38
+ if attn.norm_q is not None:
39
+ query = attn.norm_q(query)
40
+ if attn.norm_k is not None:
41
+ key = attn.norm_k(key)
42
+
43
+ # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
44
+ if encoder_hidden_states is not None:
45
+ # `context` projections.
46
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
47
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
48
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
49
+
50
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
51
+ batch_size, -1, attn.heads, head_dim
52
+ ).transpose(1, 2)
53
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
54
+ batch_size, -1, attn.heads, head_dim
55
+ ).transpose(1, 2)
56
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
57
+ batch_size, -1, attn.heads, head_dim
58
+ ).transpose(1, 2)
59
+
60
+ if attn.norm_added_q is not None:
61
+ encoder_hidden_states_query_proj = attn.norm_added_q(
62
+ encoder_hidden_states_query_proj
63
+ )
64
+ if attn.norm_added_k is not None:
65
+ encoder_hidden_states_key_proj = attn.norm_added_k(
66
+ encoder_hidden_states_key_proj
67
+ )
68
+
69
+ # attention
70
+ query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
71
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
72
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
73
+
74
+ if image_rotary_emb is not None:
75
+ from diffusers.models.embeddings import apply_rotary_emb
76
+
77
+ query = apply_rotary_emb(query, image_rotary_emb)
78
+ key = apply_rotary_emb(key, image_rotary_emb)
79
+
80
+ if condition_latents is not None:
81
+ cond_query = attn.to_q(condition_latents)
82
+ cond_key = attn.to_k(condition_latents)
83
+ cond_value = attn.to_v(condition_latents)
84
+
85
+ cond_query = cond_query.view(batch_size, -1, attn.heads, head_dim).transpose(
86
+ 1, 2
87
+ )
88
+ cond_key = cond_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
89
+ cond_value = cond_value.view(batch_size, -1, attn.heads, head_dim).transpose(
90
+ 1, 2
91
+ )
92
+ if attn.norm_q is not None:
93
+ cond_query = attn.norm_q(cond_query)
94
+ if attn.norm_k is not None:
95
+ cond_key = attn.norm_k(cond_key)
96
+
97
+ if cond_rotary_emb is not None:
98
+ cond_query = apply_rotary_emb(cond_query, cond_rotary_emb)
99
+ cond_key = apply_rotary_emb(cond_key, cond_rotary_emb)
100
+
101
+ if condition_latents is not None:
102
+ query = torch.cat([query, cond_query], dim=2)
103
+ key = torch.cat([key, cond_key], dim=2)
104
+ value = torch.cat([value, cond_value], dim=2)
105
+
106
+ if not model_config.get("union_cond_attn", True):
107
+ # If we don't want to use the union condition attention, we need to mask the attention
108
+ # between the hidden states and the condition latents
109
+ attention_mask = torch.ones(
110
+ query.shape[2], key.shape[2], device=query.device, dtype=torch.bool
111
+ )
112
+ condition_n = cond_query.shape[2]
113
+ attention_mask[-condition_n:, :-condition_n] = False
114
+ attention_mask[:-condition_n, -condition_n:] = False
115
+ elif model_config.get("independent_condition", False):
116
+ attention_mask = torch.ones(
117
+ query.shape[2], key.shape[2], device=query.device, dtype=torch.bool
118
+ )
119
+ condition_n = cond_query.shape[2]
120
+ attention_mask[-condition_n:, :-condition_n] = False
121
+ if hasattr(attn, "c_factor"):
122
+ attention_mask = torch.zeros(
123
+ query.shape[2], key.shape[2], device=query.device, dtype=query.dtype
124
+ )
125
+ condition_n = cond_query.shape[2]
126
+ bias = torch.log(attn.c_factor[0])
127
+ attention_mask[-condition_n:, :-condition_n] = bias
128
+ attention_mask[:-condition_n, -condition_n:] = bias
129
+ hidden_states = F.scaled_dot_product_attention(
130
+ query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attention_mask
131
+ )
132
+ hidden_states = hidden_states.transpose(1, 2).reshape(
133
+ batch_size, -1, attn.heads * head_dim
134
+ )
135
+ hidden_states = hidden_states.to(query.dtype)
136
+
137
+ if encoder_hidden_states is not None:
138
+ if condition_latents is not None:
139
+ encoder_hidden_states, hidden_states, condition_latents = (
140
+ hidden_states[:, : encoder_hidden_states.shape[1]],
141
+ hidden_states[
142
+ :, encoder_hidden_states.shape[1] : -condition_latents.shape[1]
143
+ ],
144
+ hidden_states[:, -condition_latents.shape[1] :],
145
+ )
146
+ else:
147
+ encoder_hidden_states, hidden_states = (
148
+ hidden_states[:, : encoder_hidden_states.shape[1]],
149
+ hidden_states[:, encoder_hidden_states.shape[1] :],
150
+ )
151
+
152
+ with enable_lora((attn.to_out[0],), model_config.get("latent_lora", False)):
153
+ # linear proj
154
+ hidden_states = attn.to_out[0](hidden_states)
155
+ # dropout
156
+ hidden_states = attn.to_out[1](hidden_states)
157
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
158
+
159
+ if condition_latents is not None:
160
+ condition_latents = attn.to_out[0](condition_latents)
161
+ condition_latents = attn.to_out[1](condition_latents)
162
+
163
+ return (
164
+ (hidden_states, encoder_hidden_states, condition_latents)
165
+ if condition_latents is not None
166
+ else (hidden_states, encoder_hidden_states)
167
+ )
168
+ elif condition_latents is not None:
169
+ # if there are condition_latents, we need to separate the hidden_states and the condition_latents
170
+ hidden_states, condition_latents = (
171
+ hidden_states[:, : -condition_latents.shape[1]],
172
+ hidden_states[:, -condition_latents.shape[1] :],
173
+ )
174
+ return hidden_states, condition_latents
175
+ else:
176
+ return hidden_states
177
+
178
+
179
+ def block_forward(
180
+ self,
181
+ hidden_states: torch.FloatTensor,
182
+ encoder_hidden_states: torch.FloatTensor,
183
+ condition_latents: torch.FloatTensor,
184
+ temb: torch.FloatTensor,
185
+ cond_temb: torch.FloatTensor,
186
+ cond_rotary_emb=None,
187
+ image_rotary_emb=None,
188
+ model_config: Optional[Dict[str, Any]] = {},
189
+ ):
190
+ use_cond = condition_latents is not None
191
+ with enable_lora((self.norm1.linear,), model_config.get("latent_lora", False)):
192
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
193
+ hidden_states, emb=temb
194
+ )
195
+
196
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = (
197
+ self.norm1_context(encoder_hidden_states, emb=temb)
198
+ )
199
+
200
+ if use_cond:
201
+ (
202
+ norm_condition_latents,
203
+ cond_gate_msa,
204
+ cond_shift_mlp,
205
+ cond_scale_mlp,
206
+ cond_gate_mlp,
207
+ ) = self.norm1(condition_latents, emb=cond_temb)
208
+
209
+ # Attention.
210
+ result = attn_forward(
211
+ self.attn,
212
+ model_config=model_config,
213
+ hidden_states=norm_hidden_states,
214
+ encoder_hidden_states=norm_encoder_hidden_states,
215
+ condition_latents=norm_condition_latents if use_cond else None,
216
+ image_rotary_emb=image_rotary_emb,
217
+ cond_rotary_emb=cond_rotary_emb if use_cond else None,
218
+ )
219
+ attn_output, context_attn_output = result[:2]
220
+ cond_attn_output = result[2] if use_cond else None
221
+
222
+ # Process attention outputs for the `hidden_states`.
223
+ # 1. hidden_states
224
+ attn_output = gate_msa.unsqueeze(1) * attn_output
225
+ hidden_states = hidden_states + attn_output
226
+ # 2. encoder_hidden_states
227
+ context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
228
+ encoder_hidden_states = encoder_hidden_states + context_attn_output
229
+ # 3. condition_latents
230
+ if use_cond:
231
+ cond_attn_output = cond_gate_msa.unsqueeze(1) * cond_attn_output
232
+ condition_latents = condition_latents + cond_attn_output
233
+ if model_config.get("add_cond_attn", False):
234
+ hidden_states += cond_attn_output
235
+
236
+ # LayerNorm + MLP.
237
+ # 1. hidden_states
238
+ norm_hidden_states = self.norm2(hidden_states)
239
+ norm_hidden_states = (
240
+ norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
241
+ )
242
+ # 2. encoder_hidden_states
243
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
244
+ norm_encoder_hidden_states = (
245
+ norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
246
+ )
247
+ # 3. condition_latents
248
+ if use_cond:
249
+ norm_condition_latents = self.norm2(condition_latents)
250
+ norm_condition_latents = (
251
+ norm_condition_latents * (1 + cond_scale_mlp[:, None])
252
+ + cond_shift_mlp[:, None]
253
+ )
254
+
255
+ # Feed-forward.
256
+ with enable_lora((self.ff.net[2],), model_config.get("latent_lora", False)):
257
+ # 1. hidden_states
258
+ ff_output = self.ff(norm_hidden_states)
259
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
260
+ # 2. encoder_hidden_states
261
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
262
+ context_ff_output = c_gate_mlp.unsqueeze(1) * context_ff_output
263
+ # 3. condition_latents
264
+ if use_cond:
265
+ cond_ff_output = self.ff(norm_condition_latents)
266
+ cond_ff_output = cond_gate_mlp.unsqueeze(1) * cond_ff_output
267
+
268
+ # Process feed-forward outputs.
269
+ hidden_states = hidden_states + ff_output
270
+ encoder_hidden_states = encoder_hidden_states + context_ff_output
271
+ if use_cond:
272
+ condition_latents = condition_latents + cond_ff_output
273
+
274
+ # Clip to avoid overflow.
275
+ if encoder_hidden_states.dtype == torch.float16:
276
+ encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
277
+
278
+ return encoder_hidden_states, hidden_states, condition_latents if use_cond else None
279
+
280
+
281
+ def single_block_forward(
282
+ self,
283
+ hidden_states: torch.FloatTensor,
284
+ temb: torch.FloatTensor,
285
+ image_rotary_emb=None,
286
+ condition_latents: torch.FloatTensor = None,
287
+ cond_temb: torch.FloatTensor = None,
288
+ cond_rotary_emb=None,
289
+ model_config: Optional[Dict[str, Any]] = {},
290
+ ):
291
+
292
+ using_cond = condition_latents is not None
293
+ residual = hidden_states
294
+ with enable_lora(
295
+ (
296
+ self.norm.linear,
297
+ self.proj_mlp,
298
+ ),
299
+ model_config.get("latent_lora", False),
300
+ ):
301
+ norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
302
+ mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
303
+ if using_cond:
304
+ residual_cond = condition_latents
305
+ norm_condition_latents, cond_gate = self.norm(condition_latents, emb=cond_temb)
306
+ mlp_cond_hidden_states = self.act_mlp(self.proj_mlp(norm_condition_latents))
307
+
308
+ attn_output = attn_forward(
309
+ self.attn,
310
+ model_config=model_config,
311
+ hidden_states=norm_hidden_states,
312
+ image_rotary_emb=image_rotary_emb,
313
+ **(
314
+ {
315
+ "condition_latents": norm_condition_latents,
316
+ "cond_rotary_emb": cond_rotary_emb if using_cond else None,
317
+ }
318
+ if using_cond
319
+ else {}
320
+ ),
321
+ )
322
+ if using_cond:
323
+ attn_output, cond_attn_output = attn_output
324
+
325
+ with enable_lora((self.proj_out,), model_config.get("latent_lora", False)):
326
+ hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
327
+ gate = gate.unsqueeze(1)
328
+ hidden_states = gate * self.proj_out(hidden_states)
329
+ hidden_states = residual + hidden_states
330
+ if using_cond:
331
+ condition_latents = torch.cat([cond_attn_output, mlp_cond_hidden_states], dim=2)
332
+ cond_gate = cond_gate.unsqueeze(1)
333
+ condition_latents = cond_gate * self.proj_out(condition_latents)
334
+ condition_latents = residual_cond + condition_latents
335
+
336
+ if hidden_states.dtype == torch.float16:
337
+ hidden_states = hidden_states.clip(-65504, 65504)
338
+
339
+ return hidden_states if not using_cond else (hidden_states, condition_latents)
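attn_forward above appends the condition tokens to the text/image token sequence and, when a c_factor is set, biases the attention between the two groups by log(c_factor) via an additive attn_mask. A shape-level sketch of that mechanism with toy tensors (all dimensions are made up; no diffusers dependency):

import torch
import torch.nn.functional as F

batch, heads, dim = 1, 2, 8
n_img, n_cond = 6, 3          # text/image tokens vs. condition tokens
c_factor = 1.5                # condition strength, as in module.c_factor

q = torch.randn(batch, heads, n_img + n_cond, dim)
k = torch.randn(batch, heads, n_img + n_cond, dim)
v = torch.randn(batch, heads, n_img + n_cond, dim)

# Additive bias: log(c_factor) on the image<->condition blocks, 0 elsewhere.
bias = torch.zeros(n_img + n_cond, n_img + n_cond)
bias[-n_cond:, :-n_cond] = torch.log(torch.tensor(c_factor))
bias[:-n_cond, -n_cond:] = torch.log(torch.tensor(c_factor))

out = F.scaled_dot_product_attention(q, k, v, attn_mask=bias)
print(out.shape)  # torch.Size([1, 2, 9, 8])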
src/flux/condition.py ADDED
@@ -0,0 +1,133 @@
1
+ import torch
2
+ from typing import Optional, Union, List, Tuple
3
+ from diffusers.pipelines import FluxPipeline
4
+ from PIL import Image, ImageFilter
5
+ import numpy as np
6
+ import cv2
7
+
8
+ from .pipeline_tools import encode_images
9
+
10
+ condition_dict = {
11
+ "depth": 0,
12
+ "canny": 1,
13
+ "subject": 4,
14
+ "coloring": 6,
15
+ "deblurring": 7,
16
+ "depth_pred": 8,
17
+ "fill": 9,
18
+ "sr": 10,
19
+ "cartoon": 11,
20
+ "scene": 12
21
+ }
22
+
23
+
24
+ class Condition(object):
25
+ def __init__(
26
+ self,
27
+ condition_type: str,
28
+ raw_img: Union[Image.Image, torch.Tensor] = None,
29
+ condition: Union[Image.Image, torch.Tensor] = None,
30
+ mask=None,
31
+ position_delta=None,
32
+ position_scale=1.0,
33
+ ) -> None:
34
+ self.condition_type = condition_type
35
+ assert raw_img is not None or condition is not None
36
+ if raw_img is not None:
37
+ self.condition = self.get_condition(condition_type, raw_img)
38
+ else:
39
+ self.condition = condition
40
+ self.position_delta = position_delta
41
+ self.position_scale = position_scale
42
+ # TODO: Add mask support
43
+ assert mask is None, "Mask not supported yet"
44
+
45
+ def get_condition(
46
+ self, condition_type: str, raw_img: Union[Image.Image, torch.Tensor]
47
+ ) -> Union[Image.Image, torch.Tensor]:
48
+ """
49
+ Returns the condition image.
50
+ """
51
+ if condition_type == "depth":
52
+ from transformers import pipeline
53
+
54
+ depth_pipe = pipeline(
55
+ task="depth-estimation",
56
+ model="LiheYoung/depth-anything-small-hf",
57
+ device="cuda",
58
+ )
59
+ source_image = raw_img.convert("RGB")
60
+ condition_img = depth_pipe(source_image)["depth"].convert("RGB")
61
+ return condition_img
62
+ elif condition_type == "canny":
63
+ img = np.array(raw_img)
64
+ edges = cv2.Canny(img, 100, 200)
65
+ edges = Image.fromarray(edges).convert("RGB")
66
+ return edges
67
+ elif condition_type == "subject":
68
+ return raw_img
69
+ elif condition_type == "coloring":
70
+ return raw_img.convert("L").convert("RGB")
71
+ elif condition_type == "deblurring":
72
+ condition_image = (
73
+ raw_img.convert("RGB")
74
+ .filter(ImageFilter.GaussianBlur(10))
75
+ .convert("RGB")
76
+ )
77
+ return condition_image
78
+ elif condition_type == "fill":
79
+ return raw_img.convert("RGB")
80
+ elif condition_type == "cartoon":
81
+ return raw_img.convert("RGB")
82
+ elif condition_type == "scene":
83
+ return raw_img.convert("RGB")
84
+ return self.condition
85
+
86
+ @property
87
+ def type_id(self) -> int:
88
+ """
89
+ Returns the type id of the condition.
90
+ """
91
+ return condition_dict[self.condition_type]
92
+
93
+ @classmethod
94
+ def get_type_id(cls, condition_type: str) -> int:
95
+ """
96
+ Returns the type id of the condition.
97
+ """
98
+ return condition_dict[condition_type]
99
+
100
+ def encode(self, pipe: FluxPipeline) -> Tuple[torch.Tensor, torch.Tensor, int]:
101
+ """
102
+ Encodes the condition into tokens, ids and type_id.
103
+ """
104
+ if self.condition_type in [
105
+ "depth",
106
+ "canny",
107
+ "subject",
108
+ "coloring",
109
+ "deblurring",
110
+ "depth_pred",
111
+ "fill",
112
+ "sr",
113
+ "cartoon",
114
+ "scene"
115
+ ]:
116
+ tokens, ids = encode_images(pipe, self.condition)
117
+ else:
118
+ raise NotImplementedError(
119
+ f"Condition type {self.condition_type} not implemented"
120
+ )
121
+ if self.position_delta is None and self.condition_type == "subject":
122
+ self.position_delta = [0, -self.condition.size[0] // 16]
123
+ if self.position_delta is not None:
124
+ ids[:, 1] += self.position_delta[0]
125
+ ids[:, 2] += self.position_delta[1]
126
+ if self.position_scale != 1.0:
127
+ scale_bias = (self.position_scale - 1.0) / 2
128
+ ids[:, 1] *= self.position_scale
129
+ ids[:, 2] *= self.position_scale
130
+ ids[:, 1] += scale_bias
131
+ ids[:, 2] += scale_bias
132
+ type_id = torch.ones_like(ids[:, :1]) * self.type_id
133
+ return tokens, ids, type_id
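Condition.get_condition derives the conditioning image directly from the raw image; for the "canny" type this is just an OpenCV edge map. A minimal standalone sketch of that branch (the input path is a placeholder):

import numpy as np
import cv2
from PIL import Image

raw_img = Image.open("input.png").convert("RGB")   # placeholder path
edges = cv2.Canny(np.array(raw_img), 100, 200)     # same thresholds as above
condition_img = Image.fromarray(edges).convert("RGB")
condition_img.save("canny_condition.png")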
src/flux/generate.py ADDED
@@ -0,0 +1,322 @@
1
+ import torch
2
+ import yaml, os
3
+ from diffusers.pipelines import FluxPipeline
4
+ from typing import List, Union, Optional, Dict, Any, Callable
5
+ from .transformer import tranformer_forward
6
+ from .condition import Condition
7
+
8
+ from diffusers.pipelines.flux.pipeline_flux import (
9
+ FluxPipelineOutput,
10
+ calculate_shift,
11
+ retrieve_timesteps,
12
+ np,
13
+ )
14
+
15
+
16
+ def get_config(config_path: str = None):
17
+ config_path = config_path or os.environ.get("XFL_CONFIG")
18
+ if not config_path:
19
+ return {}
20
+ with open(config_path, "r") as f:
21
+ config = yaml.safe_load(f)
22
+ return config
23
+
24
+
25
+ def prepare_params(
26
+ prompt: Union[str, List[str]] = None,
27
+ prompt_2: Optional[Union[str, List[str]]] = None,
28
+ height: Optional[int] = 512,
29
+ width: Optional[int] = 512,
30
+ num_inference_steps: int = 28,
31
+ timesteps: List[int] = None,
32
+ guidance_scale: float = 3.5,
33
+ num_images_per_prompt: Optional[int] = 1,
34
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
35
+ latents: Optional[torch.FloatTensor] = None,
36
+ prompt_embeds: Optional[torch.FloatTensor] = None,
37
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
38
+ output_type: Optional[str] = "pil",
39
+ return_dict: bool = True,
40
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
41
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
42
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
43
+ max_sequence_length: int = 512,
44
+ **kwargs: dict,
45
+ ):
46
+ return (
47
+ prompt,
48
+ prompt_2,
49
+ height,
50
+ width,
51
+ num_inference_steps,
52
+ timesteps,
53
+ guidance_scale,
54
+ num_images_per_prompt,
55
+ generator,
56
+ latents,
57
+ prompt_embeds,
58
+ pooled_prompt_embeds,
59
+ output_type,
60
+ return_dict,
61
+ joint_attention_kwargs,
62
+ callback_on_step_end,
63
+ callback_on_step_end_tensor_inputs,
64
+ max_sequence_length,
65
+ )
66
+
67
+
68
+ def seed_everything(seed: int = 42):
69
+ torch.backends.cudnn.deterministic = True
70
+ torch.manual_seed(seed)
71
+ np.random.seed(seed)
72
+
73
+
74
+ @torch.no_grad()
75
+ def generate(
76
+ pipeline: FluxPipeline,
77
+ conditions: List[Condition] = None,
78
+ config_path: str = None,
79
+ model_config: Optional[Dict[str, Any]] = {},
80
+ condition_scale: float = 1.0,
81
+ default_lora: bool = False,
82
+ image_guidance_scale: float = 1.0,
83
+ **params: dict,
84
+ ):
85
+ model_config = model_config or get_config(config_path).get("model", {})
86
+ # print(model_config)
87
+ if condition_scale != 1:
88
+ for name, module in pipeline.transformer.named_modules():
89
+ if not name.endswith(".attn"):
90
+ continue
91
+ module.c_factor = torch.ones(1, 1) * condition_scale
92
+
93
+ self = pipeline
94
+ (
95
+ prompt,
96
+ prompt_2,
97
+ height,
98
+ width,
99
+ num_inference_steps,
100
+ timesteps,
101
+ guidance_scale,
102
+ num_images_per_prompt,
103
+ generator,
104
+ latents,
105
+ prompt_embeds,
106
+ pooled_prompt_embeds,
107
+ output_type,
108
+ return_dict,
109
+ joint_attention_kwargs,
110
+ callback_on_step_end,
111
+ callback_on_step_end_tensor_inputs,
112
+ max_sequence_length,
113
+ ) = prepare_params(**params)
114
+
115
+ height = height or self.default_sample_size * self.vae_scale_factor
116
+ width = width or self.default_sample_size * self.vae_scale_factor
117
+
118
+ # 1. Check inputs. Raise error if not correct
119
+ self.check_inputs(
120
+ prompt,
121
+ prompt_2,
122
+ height,
123
+ width,
124
+ prompt_embeds=prompt_embeds,
125
+ pooled_prompt_embeds=pooled_prompt_embeds,
126
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
127
+ max_sequence_length=max_sequence_length,
128
+ )
129
+
130
+ self._guidance_scale = guidance_scale
131
+ self._joint_attention_kwargs = joint_attention_kwargs
132
+ self._interrupt = False
133
+
134
+ # 2. Define call parameters
135
+ if prompt is not None and isinstance(prompt, str):
136
+ batch_size = 1
137
+ elif prompt is not None and isinstance(prompt, list):
138
+ batch_size = len(prompt)
139
+ else:
140
+ batch_size = prompt_embeds.shape[0]
141
+
142
+ device = self._execution_device
143
+
144
+ lora_scale = (
145
+ self.joint_attention_kwargs.get("scale", None)
146
+ if self.joint_attention_kwargs is not None
147
+ else None
148
+ )
149
+ (
150
+ prompt_embeds,
151
+ pooled_prompt_embeds,
152
+ text_ids,
153
+ ) = self.encode_prompt(
154
+ prompt=prompt,
155
+ prompt_2=prompt_2,
156
+ prompt_embeds=prompt_embeds,
157
+ pooled_prompt_embeds=pooled_prompt_embeds,
158
+ device=device,
159
+ num_images_per_prompt=num_images_per_prompt,
160
+ max_sequence_length=max_sequence_length,
161
+ lora_scale=lora_scale,
162
+ )
163
+
164
+ # 4. Prepare latent variables
165
+ num_channels_latents = self.transformer.config.in_channels // 4
166
+ latents, latent_image_ids = self.prepare_latents(
167
+ batch_size * num_images_per_prompt,
168
+ num_channels_latents,
169
+ height,
170
+ width,
171
+ prompt_embeds.dtype,
172
+ device,
173
+ generator,
174
+ latents,
175
+ )
176
+
177
+ # 4.1. Prepare conditions
178
+ condition_latents, condition_ids, condition_type_ids = ([] for _ in range(3))
179
+ use_condition = conditions is not None and len(conditions) > 0
180
+ if use_condition:
181
+ assert len(conditions) <= 1, "Only one condition is supported for now."
182
+ if not default_lora:
183
+ pipeline.set_adapters(conditions[0].condition_type)
184
+ for condition in conditions:
185
+ tokens, ids, type_id = condition.encode(self)
186
+ condition_latents.append(tokens) # [batch_size, token_n, token_dim]
187
+ condition_ids.append(ids) # [token_n, id_dim(3)]
188
+ condition_type_ids.append(type_id) # [token_n, 1]
189
+ condition_latents = torch.cat(condition_latents, dim=1)
190
+ condition_ids = torch.cat(condition_ids, dim=0)
191
+ condition_type_ids = torch.cat(condition_type_ids, dim=0)
192
+
193
+ # 5. Prepare timesteps
194
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
195
+ image_seq_len = latents.shape[1]
196
+ mu = calculate_shift(
197
+ image_seq_len,
198
+ self.scheduler.config.base_image_seq_len,
199
+ self.scheduler.config.max_image_seq_len,
200
+ self.scheduler.config.base_shift,
201
+ self.scheduler.config.max_shift,
202
+ )
203
+ timesteps, num_inference_steps = retrieve_timesteps(
204
+ self.scheduler,
205
+ num_inference_steps,
206
+ device,
207
+ timesteps,
208
+ sigmas,
209
+ mu=mu,
210
+ )
211
+ num_warmup_steps = max(
212
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0
213
+ )
214
+ self._num_timesteps = len(timesteps)
215
+
216
+ # 6. Denoising loop
217
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
218
+ for i, t in enumerate(timesteps):
219
+ if self.interrupt:
220
+ continue
221
+
222
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
223
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
224
+
225
+ # handle guidance
226
+ if self.transformer.config.guidance_embeds:
227
+ guidance = torch.tensor([guidance_scale], device=device)
228
+ guidance = guidance.expand(latents.shape[0])
229
+ else:
230
+ guidance = None
231
+ noise_pred = tranformer_forward(
232
+ self.transformer,
233
+ model_config=model_config,
234
+ # Inputs of the condition (new feature)
235
+ condition_latents=condition_latents if use_condition else None,
236
+ condition_ids=condition_ids if use_condition else None,
237
+ condition_type_ids=condition_type_ids if use_condition else None,
238
+ # Inputs to the original transformer
239
+ hidden_states=latents,
240
+ # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it, but I want to keep the inputs the same for the model for testing)
241
+ timestep=timestep / 1000,
242
+ guidance=guidance,
243
+ pooled_projections=pooled_prompt_embeds,
244
+ encoder_hidden_states=prompt_embeds,
245
+ txt_ids=text_ids,
246
+ img_ids=latent_image_ids,
247
+ joint_attention_kwargs=self.joint_attention_kwargs,
248
+ return_dict=False,
249
+ )[0]
250
+
251
+ if image_guidance_scale != 1.0:
252
+ uncondition_latents = condition.encode(self, empty=True)[0]
253
+ unc_pred = tranformer_forward(
254
+ self.transformer,
255
+ model_config=model_config,
256
+ # Inputs of the condition (new feature)
257
+ condition_latents=uncondition_latents if use_condition else None,
258
+ condition_ids=condition_ids if use_condition else None,
259
+ condition_type_ids=condition_type_ids if use_condition else None,
260
+ # Inputs to the original transformer
261
+ hidden_states=latents,
262
+ # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it, but I want to keep the inputs the same for the model for testing)
263
+ timestep=timestep / 1000,
264
+ guidance=torch.ones_like(guidance),
265
+ pooled_projections=pooled_prompt_embeds,
266
+ encoder_hidden_states=prompt_embeds,
267
+ txt_ids=text_ids,
268
+ img_ids=latent_image_ids,
269
+ joint_attention_kwargs=self.joint_attention_kwargs,
270
+ return_dict=False,
271
+ )[0]
272
+
273
+ noise_pred = unc_pred + image_guidance_scale * (noise_pred - unc_pred)
274
+
275
+ # compute the previous noisy sample x_t -> x_t-1
276
+ latents_dtype = latents.dtype
277
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
278
+
279
+ if latents.dtype != latents_dtype:
280
+ if torch.backends.mps.is_available():
281
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
282
+ latents = latents.to(latents_dtype)
283
+
284
+ if callback_on_step_end is not None:
285
+ callback_kwargs = {}
286
+ for k in callback_on_step_end_tensor_inputs:
287
+ callback_kwargs[k] = locals()[k]
288
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
289
+
290
+ latents = callback_outputs.pop("latents", latents)
291
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
292
+
293
+ # call the callback, if provided
294
+ if i == len(timesteps) - 1 or (
295
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
296
+ ):
297
+ progress_bar.update()
298
+
299
+ if output_type == "latent":
300
+ image = latents
301
+
302
+ else:
303
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
304
+ latents = (
305
+ latents / self.vae.config.scaling_factor
306
+ ) + self.vae.config.shift_factor
307
+ image = self.vae.decode(latents, return_dict=False)[0]
308
+ image = self.image_processor.postprocess(image, output_type=output_type)
309
+
310
+ # Offload all models
311
+ self.maybe_free_model_hooks()
312
+
313
+ if condition_scale != 1:
314
+ for name, module in pipeline.transformer.named_modules():
315
+ if not name.endswith(".attn"):
316
+ continue
317
+ del module.c_factor
318
+
319
+ if not return_dict:
320
+ return (image,)
321
+
322
+ return FluxPipelineOutput(images=image)
src/flux/lora_controller.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from peft.tuners.tuners_utils import BaseTunerLayer
2
+ from typing import List, Any, Optional, Type
3
+ from .condition import condition_dict
4
+
5
+ class enable_lora:
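+ # Context manager: when `activated` is False, entering it zeroes the LoRA scaling
+ # of every active adapter registered in `condition_dict` and restores the saved
+ # scales on exit; when `activated` is True it is a no-op.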
6
+ def __init__(self, lora_modules: List[BaseTunerLayer], activated: bool) -> None:
7
+ self.activated: bool = activated
8
+ if activated:
9
+ return
10
+ self.lora_modules: List[BaseTunerLayer] = [
11
+ each for each in lora_modules if isinstance(each, BaseTunerLayer)
12
+ ]
13
+ self.scales = [
14
+ {
15
+ active_adapter: lora_module.scaling[active_adapter]
16
+ for active_adapter in lora_module.active_adapters
17
+ }
18
+ for lora_module in self.lora_modules
19
+ ]
20
+
21
+ def __enter__(self) -> None:
22
+ if self.activated:
23
+ return
24
+
25
+ for lora_module in self.lora_modules:
26
+ if not isinstance(lora_module, BaseTunerLayer):
27
+ continue
28
+ for active_adapter in lora_module.active_adapters:
29
+ if active_adapter in condition_dict.keys():
30
+ lora_module.scaling[active_adapter] = 0.0
31
+
32
+ def __exit__(
33
+ self,
34
+ exc_type: Optional[Type[BaseException]],
35
+ exc_val: Optional[BaseException],
36
+ exc_tb: Optional[Any],
37
+ ) -> None:
38
+ if self.activated:
39
+ return
40
+ for i, lora_module in enumerate(self.lora_modules):
41
+ if not isinstance(lora_module, BaseTunerLayer):
42
+ continue
43
+ for active_adapter in lora_module.active_adapters:
44
+ lora_module.scaling[active_adapter] = self.scales[i][active_adapter]
45
+
46
+
47
+ class set_lora_scale:
48
+ def __init__(self, lora_modules: List[BaseTunerLayer], scale: float) -> None:
49
+ self.lora_modules: List[BaseTunerLayer] = [
50
+ each for each in lora_modules if isinstance(each, BaseTunerLayer)
51
+ ]
52
+ self.scales = [
53
+ {
54
+ active_adapter: lora_module.scaling[active_adapter]
55
+ for active_adapter in lora_module.active_adapters
56
+ }
57
+ for lora_module in self.lora_modules
58
+ ]
59
+ self.scale = scale
60
+
61
+ def __enter__(self) -> None:
62
+ for lora_module in self.lora_modules:
63
+ if not isinstance(lora_module, BaseTunerLayer):
64
+ continue
65
+ lora_module.scale_layer(self.scale)
66
+
67
+ def __exit__(
68
+ self,
69
+ exc_type: Optional[Type[BaseException]],
70
+ exc_val: Optional[BaseException],
71
+ exc_tb: Optional[Any],
72
+ ) -> None:
73
+ for i, lora_module in enumerate(self.lora_modules):
74
+ if not isinstance(lora_module, BaseTunerLayer):
75
+ continue
76
+ for active_adapter in lora_module.active_adapters:
77
+ lora_module.scaling[active_adapter] = self.scales[i][active_adapter]
src/flux/pipeline_tools.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from diffusers.pipelines import FluxPipeline
2
+ from diffusers.utils import logging
3
+ from diffusers.pipelines.flux.pipeline_flux import logger
4
+ from torch import Tensor
5
+
6
+
7
+ def encode_images(pipeline: FluxPipeline, images: Tensor):
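+ # Encode images with the Flux VAE, normalize the latents with the VAE shift/scaling
+ # factors, pack them into the transformer's token layout, and build the matching
+ # latent image ids (falling back to half-resolution ids if the token count and id
+ # count disagree).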
8
+ images = pipeline.image_processor.preprocess(images)
9
+ images = images.to(pipeline.device).to(pipeline.dtype)
10
+ images = pipeline.vae.encode(images).latent_dist.sample()
11
+ images = (
12
+ images - pipeline.vae.config.shift_factor
13
+ ) * pipeline.vae.config.scaling_factor
14
+ images_tokens = pipeline._pack_latents(images, *images.shape)
15
+ images_ids = pipeline._prepare_latent_image_ids(
16
+ images.shape[0],
17
+ images.shape[2],
18
+ images.shape[3],
19
+ pipeline.device,
20
+ pipeline.dtype,
21
+ )
22
+ if images_tokens.shape[1] != images_ids.shape[0]:
23
+ images_ids = pipeline._prepare_latent_image_ids(
24
+ images.shape[0],
25
+ images.shape[2] // 2,
26
+ images.shape[3] // 2,
27
+ pipeline.device,
28
+ pipeline.dtype,
29
+ )
30
+ return images_tokens, images_ids
31
+
32
+
33
+ def prepare_text_input(pipeline: FluxPipeline, prompts, max_sequence_length=512):
34
+ # Turn off warnings (CLIP overflow)
35
+ logger.setLevel(logging.ERROR)
36
+ (
37
+ prompt_embeds,
38
+ pooled_prompt_embeds,
39
+ text_ids,
40
+ ) = pipeline.encode_prompt(
41
+ prompt=prompts,
42
+ prompt_2=None,
43
+ prompt_embeds=None,
44
+ pooled_prompt_embeds=None,
45
+ device=pipeline.device,
46
+ num_images_per_prompt=1,
47
+ max_sequence_length=max_sequence_length,
48
+ lora_scale=None,
49
+ )
50
+ # Turn on warnings
51
+ logger.setLevel(logging.WARNING)
52
+ return prompt_embeds, pooled_prompt_embeds, text_ids
src/flux/transformer.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from diffusers.pipelines import FluxPipeline
3
+ from typing import List, Union, Optional, Dict, Any, Callable
4
+ from .block import block_forward, single_block_forward
5
+ from .lora_controller import enable_lora
6
+ from accelerate.utils import is_torch_version
7
+ from diffusers.models.transformers.transformer_flux import (
8
+ FluxTransformer2DModel,
9
+ Transformer2DModelOutput,
10
+ USE_PEFT_BACKEND,
11
+ scale_lora_layers,
12
+ unscale_lora_layers,
13
+ logger,
14
+ )
15
+ import numpy as np
16
+
17
+
18
+ def prepare_params(
19
+ hidden_states: torch.Tensor,
20
+ encoder_hidden_states: torch.Tensor = None,
21
+ pooled_projections: torch.Tensor = None,
22
+ timestep: torch.LongTensor = None,
23
+ img_ids: torch.Tensor = None,
24
+ txt_ids: torch.Tensor = None,
25
+ guidance: torch.Tensor = None,
26
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
27
+ controlnet_block_samples=None,
28
+ controlnet_single_block_samples=None,
29
+ return_dict: bool = True,
30
+ **kwargs: dict,
31
+ ):
32
+ return (
33
+ hidden_states,
34
+ encoder_hidden_states,
35
+ pooled_projections,
36
+ timestep,
37
+ img_ids,
38
+ txt_ids,
39
+ guidance,
40
+ joint_attention_kwargs,
41
+ controlnet_block_samples,
42
+ controlnet_single_block_samples,
43
+ return_dict,
44
+ )
45
+
46
+
47
+ def tranformer_forward(
48
+ transformer: FluxTransformer2DModel,
49
+ condition_latents: torch.Tensor,
50
+ condition_ids: torch.Tensor,
51
+ condition_type_ids: torch.Tensor,
52
+ model_config: Optional[Dict[str, Any]] = {},
53
+ c_t=0,
54
+ **params: dict,
55
+ ):
56
+ self = transformer
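+ # Mirrors FluxTransformer2DModel.forward with two additions: a condition branch
+ # (condition_latents get their own rotary embeddings from condition_ids and a
+ # fixed-timestep embedding cond_temb built at t = c_t) and LoRA toggling of the
+ # x_embedder via the enable_lora context manager.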
57
+ use_condition = condition_latents is not None
58
+
59
+ (
60
+ hidden_states,
61
+ encoder_hidden_states,
62
+ pooled_projections,
63
+ timestep,
64
+ img_ids,
65
+ txt_ids,
66
+ guidance,
67
+ joint_attention_kwargs,
68
+ controlnet_block_samples,
69
+ controlnet_single_block_samples,
70
+ return_dict,
71
+ ) = prepare_params(**params)
72
+
73
+ if joint_attention_kwargs is not None:
74
+ joint_attention_kwargs = joint_attention_kwargs.copy()
75
+ lora_scale = joint_attention_kwargs.pop("scale", 1.0)
76
+ else:
77
+ lora_scale = 1.0
78
+
79
+ if USE_PEFT_BACKEND:
80
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
81
+ scale_lora_layers(self, lora_scale)
82
+ else:
83
+ if (
84
+ joint_attention_kwargs is not None
85
+ and joint_attention_kwargs.get("scale", None) is not None
86
+ ):
87
+ logger.warning(
88
+ "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
89
+ )
90
+
91
+ with enable_lora((self.x_embedder,), model_config.get("latent_lora", False)):
92
+ hidden_states = self.x_embedder(hidden_states)
93
+ condition_latents = self.x_embedder(condition_latents) if use_condition else None
94
+
95
+ timestep = timestep.to(hidden_states.dtype) * 1000
96
+
97
+ if guidance is not None:
98
+ guidance = guidance.to(hidden_states.dtype) * 1000
99
+ else:
100
+ guidance = None
101
+
102
+ temb = (
103
+ self.time_text_embed(timestep, pooled_projections)
104
+ if guidance is None
105
+ else self.time_text_embed(timestep, guidance, pooled_projections)
106
+ )
107
+
108
+ cond_temb = (
109
+ self.time_text_embed(torch.ones_like(timestep) * c_t * 1000, pooled_projections)
110
+ if guidance is None
111
+ else self.time_text_embed(
112
+ torch.ones_like(timestep) * c_t * 1000, torch.ones_like(guidance) * 1000, pooled_projections
113
+ )
114
+ )
115
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states)
116
+
117
+ if txt_ids.ndim == 3:
118
+ logger.warning(
119
+ "Passing `txt_ids` 3d torch.Tensor is deprecated."
120
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
121
+ )
122
+ txt_ids = txt_ids[0]
123
+ if img_ids.ndim == 3:
124
+ logger.warning(
125
+ "Passing `img_ids` 3d torch.Tensor is deprecated."
126
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
127
+ )
128
+ img_ids = img_ids[0]
129
+
130
+ ids = torch.cat((txt_ids, img_ids), dim=0)
131
+ image_rotary_emb = self.pos_embed(ids)
132
+ if use_condition:
133
+ # condition_ids[:, :1] = condition_type_ids
134
+ cond_rotary_emb = self.pos_embed(condition_ids)
135
+
136
+ # hidden_states = torch.cat([hidden_states, condition_latents], dim=1)
137
+
138
+ for index_block, block in enumerate(self.transformer_blocks):
139
+ if self.training and self.gradient_checkpointing:
140
+ ckpt_kwargs: Dict[str, Any] = (
141
+ {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
142
+ )
143
+ encoder_hidden_states, hidden_states, condition_latents = (
144
+ torch.utils.checkpoint.checkpoint(
145
+ block_forward,
146
+ self=block,
147
+ model_config=model_config,
148
+ hidden_states=hidden_states,
149
+ encoder_hidden_states=encoder_hidden_states,
150
+ condition_latents=condition_latents if use_condition else None,
151
+ temb=temb,
152
+ cond_temb=cond_temb if use_condition else None,
153
+ cond_rotary_emb=cond_rotary_emb if use_condition else None,
154
+ image_rotary_emb=image_rotary_emb,
155
+ **ckpt_kwargs,
156
+ )
157
+ )
158
+
159
+ else:
160
+ encoder_hidden_states, hidden_states, condition_latents = block_forward(
161
+ block,
162
+ model_config=model_config,
163
+ hidden_states=hidden_states,
164
+ encoder_hidden_states=encoder_hidden_states,
165
+ condition_latents=condition_latents if use_condition else None,
166
+ temb=temb,
167
+ cond_temb=cond_temb if use_condition else None,
168
+ cond_rotary_emb=cond_rotary_emb if use_condition else None,
169
+ image_rotary_emb=image_rotary_emb,
170
+ )
171
+
172
+ # controlnet residual
173
+ if controlnet_block_samples is not None:
174
+ interval_control = len(self.transformer_blocks) / len(
175
+ controlnet_block_samples
176
+ )
177
+ interval_control = int(np.ceil(interval_control))
178
+ hidden_states = (
179
+ hidden_states
180
+ + controlnet_block_samples[index_block // interval_control]
181
+ )
182
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
183
+
184
+ for index_block, block in enumerate(self.single_transformer_blocks):
185
+ if self.training and self.gradient_checkpointing:
186
+ ckpt_kwargs: Dict[str, Any] = (
187
+ {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
188
+ )
189
+ result = torch.utils.checkpoint.checkpoint(
190
+ single_block_forward,
191
+ self=block,
192
+ model_config=model_config,
193
+ hidden_states=hidden_states,
194
+ temb=temb,
195
+ image_rotary_emb=image_rotary_emb,
196
+ **(
197
+ {
198
+ "condition_latents": condition_latents,
199
+ "cond_temb": cond_temb,
200
+ "cond_rotary_emb": cond_rotary_emb,
201
+ }
202
+ if use_condition
203
+ else {}
204
+ ),
205
+ **ckpt_kwargs,
206
+ )
207
+
208
+ else:
209
+ result = single_block_forward(
210
+ block,
211
+ model_config=model_config,
212
+ hidden_states=hidden_states,
213
+ temb=temb,
214
+ image_rotary_emb=image_rotary_emb,
215
+ **(
216
+ {
217
+ "condition_latents": condition_latents,
218
+ "cond_temb": cond_temb,
219
+ "cond_rotary_emb": cond_rotary_emb,
220
+ }
221
+ if use_condition
222
+ else {}
223
+ ),
224
+ )
225
+ if use_condition:
226
+ hidden_states, condition_latents = result
227
+ else:
228
+ hidden_states = result
229
+
230
+ # controlnet residual
231
+ if controlnet_single_block_samples is not None:
232
+ interval_control = len(self.single_transformer_blocks) / len(
233
+ controlnet_single_block_samples
234
+ )
235
+ interval_control = int(np.ceil(interval_control))
236
+ hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
237
+ hidden_states[:, encoder_hidden_states.shape[1] :, ...]
238
+ + controlnet_single_block_samples[index_block // interval_control]
239
+ )
240
+
241
+ hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
242
+
243
+ hidden_states = self.norm_out(hidden_states, temb)
244
+ output = self.proj_out(hidden_states)
245
+
246
+ if USE_PEFT_BACKEND:
247
+ # remove `lora_scale` from each PEFT layer
248
+ unscale_lora_layers(self, lora_scale)
249
+
250
+ if not return_dict:
251
+ return (output,)
252
+ return Transformer2DModelOutput(sample=output)
src/gradio/gradio_app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from PIL import Image, ImageDraw, ImageFont
4
+ from diffusers.pipelines import FluxPipeline
5
+ from diffusers import FluxTransformer2DModel
6
+ import numpy as np
7
+
8
+ from ..flux.condition import Condition
9
+ from ..flux.generate import seed_everything, generate
10
+
11
+ pipe = None
12
+ use_int8 = False
13
+
14
+
15
+ def get_gpu_memory():
16
+ return torch.cuda.get_device_properties(0).total_memory / 1024**3
17
+
18
+
19
+ def init_pipeline():
20
+ global pipe
21
+ if use_int8 or get_gpu_memory() < 33:
22
+ transformer_model = FluxTransformer2DModel.from_pretrained(
23
+ "sayakpaul/flux.1-schell-int8wo-improved",
24
+ torch_dtype=torch.bfloat16,
25
+ use_safetensors=False,
26
+ )
27
+ pipe = FluxPipeline.from_pretrained(
28
+ "black-forest-labs/FLUX.1-schnell",
29
+ transformer=transformer_model,
30
+ torch_dtype=torch.bfloat16,
31
+ )
32
+ else:
33
+ pipe = FluxPipeline.from_pretrained(
34
+ "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
35
+ )
36
+ pipe = pipe.to("cuda")
37
+ pipe.load_lora_weights(
38
+ "Yuanshi/OminiControl",
39
+ weight_name="omini/subject_512.safetensors",
40
+ adapter_name="subject",
41
+ )
42
+
43
+ # Optional: Load additional LoRA weights
44
+ #pipe.load_lora_weights("XLabs-AI/flux-RealismLora", adapter_name="realism")
45
+
46
+
47
+ def process_image_and_text(image, text):
48
+ # center crop image
49
+ w, h, min_size = image.size[0], image.size[1], min(image.size)
50
+ image = image.crop(
51
+ (
52
+ (w - min_size) // 2,
53
+ (h - min_size) // 2,
54
+ (w + min_size) // 2,
55
+ (h + min_size) // 2,
56
+ )
57
+ )
58
+ image = image.resize((512, 512))
59
+
60
+ condition = Condition("subject", image, position_delta=(0, 32))
61
+
62
+ if pipe is None:
63
+ init_pipeline()
64
+
65
+ result_img = generate(
66
+ pipe,
67
+ prompt=text.strip(),
68
+ conditions=[condition],
69
+ num_inference_steps=8,
70
+ height=512,
71
+ width=512,
72
+ ).images[0]
73
+
74
+ return result_img
75
+
76
+
77
+ def get_samples():
78
+ sample_list = [
79
+ {
80
+ "image": "assets/oranges.jpg",
81
+ "text": "A very close up view of this item. It is placed on a wooden table. The background is a dark room, the TV is on, and the screen is showing a cooking show. With text on the screen that reads 'Omini Control!'",
82
+ },
83
+ {
84
+ "image": "assets/penguin.jpg",
85
+ "text": "On Christmas evening, on a crowded sidewalk, this item sits on the road, covered in snow and wearing a Christmas hat, holding a sign that reads 'Omini Control!'",
86
+ },
87
+ {
88
+ "image": "assets/rc_car.jpg",
89
+ "text": "A film style shot. On the moon, this item drives across the moon surface. The background is that Earth looms large in the foreground.",
90
+ },
91
+ {
92
+ "image": "assets/clock.jpg",
93
+ "text": "In a Bauhaus style room, this item is placed on a shiny glass table, with a vase of flowers next to it. In the afternoon sun, the shadows of the blinds are cast on the wall.",
94
+ },
95
+ {
96
+ "image": "assets/tshirt.jpg",
97
+ "text": "On the beach, a lady sits under a beach umbrella with 'Omini' written on it. She's wearing this shirt and has a big smile on her face, with her surfboard hehind her.",
98
+ },
99
+ ]
100
+ return [[Image.open(sample["image"]), sample["text"]] for sample in sample_list]
101
+
102
+
103
+ demo = gr.Interface(
104
+ fn=process_image_and_text,
105
+ inputs=[
106
+ gr.Image(type="pil"),
107
+ gr.Textbox(lines=2),
108
+ ],
109
+ outputs=gr.Image(type="pil"),
110
+ title="OminiControl / Subject driven generation",
111
+ examples=get_samples(),
112
+ )
113
+
114
+ if __name__ == "__main__":
115
+ init_pipeline()
116
+ demo.launch(
117
+ debug=True,
118
+ )
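+
+ # Note: the relative imports above assume this file is run as a module from the
+ # repository root, e.g. `python -m src.gradio.gradio_app` (assuming `src` is an
+ # importable package).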
src/train/callbacks.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import lightning as L
2
+ from PIL import Image, ImageFilter, ImageDraw
3
+ import numpy as np
4
+ from transformers import pipeline
5
+ import cv2
6
+ import torch
7
+ import os
8
+
9
+ try:
10
+ import wandb
11
+ except ImportError:
12
+ wandb = None
13
+
14
+ from ..flux.condition import Condition
15
+ from ..flux.generate import generate
16
+
17
+
18
+ class TrainingCallback(L.Callback):
19
+ def __init__(self, run_name, training_config: dict = {}):
20
+ self.run_name, self.training_config = run_name, training_config
21
+
22
+ self.print_every_n_steps = training_config.get("print_every_n_steps", 10)
23
+ self.save_interval = training_config.get("save_interval", 1000)
24
+ self.sample_interval = training_config.get("sample_interval", 1000)
25
+ self.save_path = training_config.get("save_path", "./output")
26
+
27
+ self.wandb_config = training_config.get("wandb", None)
28
+ self.use_wandb = (
29
+ wandb is not None and os.environ.get("WANDB_API_KEY") is not None
30
+ )
31
+
32
+ self.total_steps = 0
33
+
34
+ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
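+ # Per-step bookkeeping: track the mean and max gradient L2 norms, log to WandB when
+ # enabled, print a smoothed loss every `print_every_n_steps`, save LoRA weights every
+ # `save_interval` steps, and render a validation sample every `sample_interval` steps.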
35
+ gradient_size = 0
36
+ max_gradient_size = 0
37
+ count = 0
38
+ for _, param in pl_module.named_parameters():
39
+ if param.grad is not None:
40
+ gradient_size += param.grad.norm(2).item()
41
+ max_gradient_size = max(max_gradient_size, param.grad.norm(2).item())
42
+ count += 1
43
+ if count > 0:
44
+ gradient_size /= count
45
+
46
+ self.total_steps += 1
47
+
48
+ # Print training progress every n steps
49
+ if self.use_wandb:
50
+ report_dict = {
51
+ "steps": batch_idx,
52
+ "steps": self.total_steps,
53
+ "epoch": trainer.current_epoch,
54
+ "gradient_size": gradient_size,
55
+ }
56
+ loss_value = outputs["loss"].item() * trainer.accumulate_grad_batches
57
+ report_dict["loss"] = loss_value
58
+ report_dict["t"] = pl_module.last_t
59
+ wandb.log(report_dict)
60
+
61
+ if self.total_steps % self.print_every_n_steps == 0:
62
+ print(
63
+ f"Epoch: {trainer.current_epoch}, Steps: {self.total_steps}, Batch: {batch_idx}, Loss: {pl_module.log_loss:.4f}, Gradient size: {gradient_size:.4f}, Max gradient size: {max_gradient_size:.4f}"
64
+ )
65
+
66
+ # Save LoRA weights at specified intervals
67
+ if self.total_steps % self.save_interval == 0:
68
+ print(
69
+ f"Epoch: {trainer.current_epoch}, Steps: {self.total_steps} - Saving LoRA weights"
70
+ )
71
+ pl_module.save_lora(
72
+ f"{self.save_path}/{self.run_name}/ckpt/{self.total_steps}"
73
+ )
74
+
75
+ # Generate and save a sample image at specified intervals
76
+ if self.total_steps % self.sample_interval == 0:
77
+ print(
78
+ f"Epoch: {trainer.current_epoch}, Steps: {self.total_steps} - Generating a sample"
79
+ )
80
+ self.generate_a_sample(
81
+ trainer,
82
+ pl_module,
83
+ f"{self.save_path}/{self.run_name}/output",
84
+ f"lora_{self.total_steps}",
85
+ batch["condition_type"][
86
+ 0
87
+ ], # Use the condition type from the current batch
88
+ )
89
+
90
+ @torch.no_grad()
91
+ def generate_a_sample(
92
+ self,
93
+ trainer,
94
+ pl_module,
95
+ save_path,
96
+ file_name,
97
+ condition_type="super_resolution",
98
+ ):
99
+ # TODO: change this two variables to parameters
100
+ condition_size = trainer.training_config["dataset"]["condition_size"]
101
+ target_size = trainer.training_config["dataset"]["target_size"]
102
+ position_scale = trainer.training_config["dataset"].get("position_scale", 1.0)
103
+
104
+ generator = torch.Generator(device=pl_module.device)
105
+ generator.manual_seed(42)
106
+
107
+ test_list = []
108
+
109
+ if condition_type == "subject":
110
+ test_list.extend(
111
+ [
112
+ (
113
+ Image.open("assets/test_in.jpg"),
114
+ [0, -32],
115
+ "Resting on the picnic table at a lakeside campsite, it's caught in the golden glow of early morning, with mist rising from the water and tall pines casting long shadows behind the scene.",
116
+ ),
117
+ (
118
+ Image.open("assets/test_out.jpg"),
119
+ [0, -32],
120
+ "In a bright room. It is placed on a table.",
121
+ ),
122
+ ]
123
+ )
124
+ elif condition_type == "scene":
125
+ test_list.extend(
126
+ [
127
+ (
128
+ Image.open("assets/a2759.jpg"),
129
+ [0, -32],
130
+ "change the color of the plane to red",
131
+ ),
132
+ (
133
+ Image.open("assets/clock.jpg"),
134
+ [0, -32],
135
+ "turn the color of the clock to blue",
136
+ ),
137
+ ]
138
+ )
139
+ elif condition_type == "canny":
140
+ condition_img = Image.open("assets/vase_hq.jpg").resize(
141
+ (condition_size, condition_size)
142
+ )
143
+ condition_img = np.array(condition_img)
144
+ condition_img = cv2.Canny(condition_img, 100, 200)
145
+ condition_img = Image.fromarray(condition_img).convert("RGB")
146
+ test_list.append(
147
+ (
148
+ condition_img,
149
+ [0, 0],
150
+ "A beautiful vase on a table.",
151
+ {"position_scale": position_scale} if position_scale != 1.0 else {},
152
+ )
153
+ )
154
+ elif condition_type == "coloring":
155
+ condition_img = (
156
+ Image.open("assets/vase_hq.jpg")
157
+ .resize((condition_size, condition_size))
158
+ .convert("L")
159
+ .convert("RGB")
160
+ )
161
+ test_list.append((condition_img, [0, 0], "A beautiful vase on a table."))
162
+ elif condition_type == "depth":
163
+ if not hasattr(self, "deepth_pipe"):
164
+ self.deepth_pipe = pipeline(
165
+ task="depth-estimation",
166
+ model="LiheYoung/depth-anything-small-hf",
167
+ device="cpu",
168
+ )
169
+ condition_img = (
170
+ Image.open("assets/vase_hq.jpg")
171
+ .resize((condition_size, condition_size))
172
+ .convert("RGB")
173
+ )
174
+ condition_img = self.deepth_pipe(condition_img)["depth"].convert("RGB")
175
+ test_list.append(
176
+ (
177
+ condition_img,
178
+ [0, 0],
179
+ "A beautiful vase on a table.",
180
+ {"position_scale": position_scale} if position_scale != 1.0 else {},
181
+ )
182
+ )
183
+ elif condition_type == "depth_pred":
184
+ condition_img = (
185
+ Image.open("assets/vase_hq.jpg")
186
+ .resize((condition_size, condition_size))
187
+ .convert("RGB")
188
+ )
189
+ test_list.append((condition_img, [0, 0], "A beautiful vase on a table."))
190
+ elif condition_type == "deblurring":
191
+ blur_radius = 5
192
+ image = Image.open("./assets/vase_hq.jpg")
193
+ condition_img = (
194
+ image.convert("RGB")
195
+ .resize((condition_size, condition_size))
196
+ .filter(ImageFilter.GaussianBlur(blur_radius))
197
+ .convert("RGB")
198
+ )
199
+ test_list.append(
200
+ (
201
+ condition_img,
202
+ [0, 0],
203
+ "A beautiful vase on a table.",
204
+ {"position_scale": position_scale} if position_scale != 1.0 else {},
205
+ )
206
+ )
207
+ elif condition_type == "fill":
208
+ condition_img = (
209
+ Image.open("./assets/vase_hq.jpg")
210
+ .resize((condition_size, condition_size))
211
+ .convert("RGB")
212
+ )
213
+ mask = Image.new("L", condition_img.size, 0)
214
+ draw = ImageDraw.Draw(mask)
215
+ a = condition_img.size[0] // 4
216
+ b = a * 3
217
+ draw.rectangle([a, a, b, b], fill=255)
218
+ condition_img = Image.composite(
219
+ condition_img, Image.new("RGB", condition_img.size, (0, 0, 0)), mask
220
+ )
221
+ test_list.append((condition_img, [0, 0], "A beautiful vase on a table."))
222
+ elif condition_type == "sr":
223
+ condition_img = (
224
+ Image.open("assets/vase_hq.jpg")
225
+ .resize((condition_size, condition_size))
226
+ .convert("RGB")
227
+ )
228
+ test_list.append((condition_img, [0, -16], "A beautiful vase on a table."))
229
+ elif condition_type == "cartoon":
230
+ condition_img = (
231
+ Image.open("assets/cartoon_boy.png")
232
+ .resize((condition_size, condition_size))
233
+ .convert("RGB")
234
+ )
235
+ test_list.append(
236
+ (
237
+ condition_img,
238
+ [0, -16],
239
+ "A cartoon character in a white background. He is looking right, and running.",
240
+ )
241
+ )
242
+ else:
243
+ raise NotImplementedError
244
+
245
+ if not os.path.exists(save_path):
246
+ os.makedirs(save_path)
247
+ for i, (condition_img, position_delta, prompt, *others) in enumerate(test_list):
248
+ condition = Condition(
249
+ condition_type=condition_type,
250
+ condition=condition_img.resize(
251
+ (condition_size, condition_size)
252
+ ).convert("RGB"),
253
+ position_delta=position_delta,
254
+ **(others[0] if others else {}),
255
+ )
256
+ res = generate(
257
+ pl_module.flux_pipe,
258
+ prompt=prompt,
259
+ conditions=[condition],
260
+ height=target_size,
261
+ width=target_size,
262
+ generator=generator,
263
+ model_config=pl_module.model_config,
264
+ default_lora=True,
265
+ )
266
+ res.images[0].save(
267
+ os.path.join(save_path, f"{file_name}_{condition_type}_{i}.jpg")
268
+ )
src/train/data.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image, ImageFilter, ImageDraw
2
+ import cv2
3
+ import numpy as np
4
+ from torch.utils.data import Dataset
5
+ import torchvision.transforms as T
6
+ import random
7
+
8
+
9
+ class Subject200KDataset(Dataset):
10
+ def __init__(
11
+ self,
12
+ base_dataset,
13
+ condition_size: int = 512,
14
+ target_size: int = 512,
15
+ image_size: int = 512,
16
+ padding: int = 0,
17
+ condition_type: str = "subject",
18
+ drop_text_prob: float = 0.1,
19
+ drop_image_prob: float = 0.1,
20
+ return_pil_image: bool = False,
21
+ ):
22
+ self.base_dataset = base_dataset
23
+ self.condition_size = condition_size
24
+ self.target_size = target_size
25
+ self.image_size = image_size
26
+ self.padding = padding
27
+ self.condition_type = condition_type
28
+ self.drop_text_prob = drop_text_prob
29
+ self.drop_image_prob = drop_image_prob
30
+ self.return_pil_image = return_pil_image
31
+
32
+ self.to_tensor = T.ToTensor()
33
+
34
+ def __len__(self):
35
+ return len(self.base_dataset) * 2
36
+
37
+ def __getitem__(self, idx):
38
+ # If target is 0, left image is target, right image is condition
39
+ target = idx % 2
40
+ item = self.base_dataset[idx // 2]
41
+
42
+ # Crop the image to target and condition
43
+ image = item["image"]
44
+ left_img = image.crop(
45
+ (
46
+ self.padding,
47
+ self.padding,
48
+ self.image_size + self.padding,
49
+ self.image_size + self.padding,
50
+ )
51
+ )
52
+ right_img = image.crop(
53
+ (
54
+ self.image_size + self.padding * 2,
55
+ self.padding,
56
+ self.image_size * 2 + self.padding * 2,
57
+ self.image_size + self.padding,
58
+ )
59
+ )
60
+
61
+ # Get the target and condition image
62
+ target_image, condition_img = (
63
+ (left_img, right_img) if target == 0 else (right_img, left_img)
64
+ )
65
+
66
+ # Resize the image
67
+ condition_img = condition_img.resize(
68
+ (self.condition_size, self.condition_size)
69
+ ).convert("RGB")
70
+ target_image = target_image.resize(
71
+ (self.target_size, self.target_size)
72
+ ).convert("RGB")
73
+
74
+ # Get the description
75
+ description = item["description"][
76
+ "description_0" if target == 0 else "description_1"
77
+ ]
78
+
79
+ # Randomly drop text or image
80
+ drop_text = random.random() < self.drop_text_prob
81
+ drop_image = random.random() < self.drop_image_prob
82
+ if drop_text:
83
+ description = ""
84
+ if drop_image:
85
+ condition_img = Image.new(
86
+ "RGB", (self.condition_size, self.condition_size), (0, 0, 0)
87
+ )
88
+
89
+ return {
90
+ "image": self.to_tensor(target_image),
91
+ "condition": self.to_tensor(condition_img),
92
+ "condition_type": self.condition_type,
93
+ "description": description,
94
+ # 16 is the downscale factor of the image
95
+ "position_delta": np.array([0, -self.condition_size // 16]),
96
+ **({"pil_image": image} if self.return_pil_image else {}),
97
+ }
98
+
99
+ class SceneDataset(Dataset):
100
+ def __init__(
101
+ self,
102
+ base_dataset,
103
+ condition_size: int = 512,
104
+ target_size: int = 512,
105
+ image_size: int = 512,
106
+ padding: int = 0,
107
+ condition_type: str = "scene",
108
+ drop_text_prob: float = 0.1,
109
+ drop_image_prob: float = 0.1,
110
+ return_pil_image: bool = False,
111
+ ):
112
+ self.base_dataset = base_dataset
113
+ self.condition_size = condition_size
114
+ self.target_size = target_size
115
+ self.image_size = image_size
116
+ self.padding = padding
117
+ self.condition_type = condition_type
118
+ self.drop_text_prob = drop_text_prob
119
+ self.drop_image_prob = drop_image_prob
120
+ self.return_pil_image = return_pil_image
121
+
122
+ self.to_tensor = T.ToTensor()
123
+
124
+ def __len__(self):
125
+ return len(self.base_dataset)
126
+
127
+ def __getitem__(self, idx):
128
+ # If target is 0, left image is target, right image is condition
129
+ # target = idx % 2
130
+ target = 1
131
+ item = self.base_dataset[idx]  # __len__ is len(base_dataset), so index directly (no left/right pairing as in Subject200K)
132
+
133
+ # Crop the image to target and condition
134
+ imageA = item["imageA"]
135
+ imageB = item["imageB"]
136
+
137
+ left_img = imageA
138
+ right_img = imageB
139
+
140
+ # Get the target and condition image
141
+ target_image, condition_img = (
142
+ (left_img, right_img) if target == 0 else (right_img, left_img)
143
+ )
144
+
145
+ # Resize the image
146
+ condition_img = condition_img.resize(
147
+ (self.condition_size, self.condition_size)
148
+ ).convert("RGB")
149
+ target_image = target_image.resize(
150
+ (self.target_size, self.target_size)
151
+ ).convert("RGB")
152
+
153
+ # Get the description
154
+ description = item["prompt"]
155
+
156
+ # Randomly drop text or image
157
+ drop_text = random.random() < self.drop_text_prob
158
+ drop_image = random.random() < self.drop_image_prob
159
+ if drop_text:
160
+ description = ""
161
+ if drop_image:
162
+ condition_img = Image.new(
163
+ "RGB", (self.condition_size, self.condition_size), (0, 0, 0)
164
+ )
165
+
166
+ return {
167
+ "image": self.to_tensor(target_image),
168
+ "condition": self.to_tensor(condition_img),
169
+ "condition_type": self.condition_type,
170
+ "description": description,
171
+ "position_delta": np.array([0, -self.condition_size // 16]),
172
+ **({"pil_image": [target_image, condition_img]} if self.return_pil_image else {}),
173
+ }
174
+
175
+
176
+
177
+
178
+ class ImageConditionDataset(Dataset):
179
+ def __init__(
180
+ self,
181
+ base_dataset,
182
+ condition_size: int = 512,
183
+ target_size: int = 512,
184
+ condition_type: str = "canny",
185
+ drop_text_prob: float = 0.1,
186
+ drop_image_prob: float = 0.1,
187
+ return_pil_image: bool = False,
188
+ position_scale=1.0,
189
+ ):
190
+ self.base_dataset = base_dataset
191
+ self.condition_size = condition_size
192
+ self.target_size = target_size
193
+ self.condition_type = condition_type
194
+ self.drop_text_prob = drop_text_prob
195
+ self.drop_image_prob = drop_image_prob
196
+ self.return_pil_image = return_pil_image
197
+ self.position_scale = position_scale
198
+
199
+ self.to_tensor = T.ToTensor()
200
+
201
+ def __len__(self):
202
+ return len(self.base_dataset)
203
+
204
+ @property
205
+ def depth_pipe(self):
206
+ if not hasattr(self, "_depth_pipe"):
207
+ from transformers import pipeline
208
+
209
+ self._depth_pipe = pipeline(
210
+ task="depth-estimation",
211
+ model="LiheYoung/depth-anything-small-hf",
212
+ device="cpu",
213
+ )
214
+ return self._depth_pipe
215
+
216
+ def _get_canny_edge(self, img):
217
+ resize_ratio = self.condition_size / max(img.size)
218
+ img = img.resize(
219
+ (int(img.size[0] * resize_ratio), int(img.size[1] * resize_ratio))
220
+ )
221
+ img_np = np.array(img)
222
+ img_gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
223
+ edges = cv2.Canny(img_gray, 100, 200)
224
+ return Image.fromarray(edges).convert("RGB")
225
+
226
+ def __getitem__(self, idx):
227
+ image = self.base_dataset[idx]["jpg"]
228
+ image = image.resize((self.target_size, self.target_size)).convert("RGB")
229
+ description = self.base_dataset[idx]["json"]["prompt"]
230
+
231
+ enable_scale = random.random() < 1
232
+ if not enable_scale:
233
+ condition_size = int(self.condition_size * self.position_scale)
234
+ position_scale = 1.0
235
+ else:
236
+ condition_size = self.condition_size
237
+ position_scale = self.position_scale
238
+
239
+ # Get the condition image
240
+ position_delta = np.array([0, 0])
241
+ if self.condition_type == "canny":
242
+ condition_img = self._get_canny_edge(image)
243
+ elif self.condition_type == "coloring":
244
+ condition_img = (
245
+ image.resize((condition_size, condition_size))
246
+ .convert("L")
247
+ .convert("RGB")
248
+ )
249
+ elif self.condition_type == "deblurring":
250
+ blur_radius = random.randint(1, 10)
251
+ condition_img = (
252
+ image.convert("RGB")
253
+ .filter(ImageFilter.GaussianBlur(blur_radius))
254
+ .resize((condition_size, condition_size))
255
+ .convert("RGB")
256
+ )
257
+ elif self.condition_type == "depth":
258
+ condition_img = self.depth_pipe(image)["depth"].convert("RGB")
259
+ condition_img = condition_img.resize((condition_size, condition_size))
260
+ elif self.condition_type == "depth_pred":
261
+ condition_img = image
262
+ image = self.depth_pipe(condition_img)["depth"].convert("RGB")
263
+ description = f"[depth] {description}"
264
+ elif self.condition_type == "fill":
265
+ condition_img = image.resize((condition_size, condition_size)).convert(
266
+ "RGB"
267
+ )
268
+ w, h = image.size
269
+ x1, x2 = sorted([random.randint(0, w), random.randint(0, w)])
270
+ y1, y2 = sorted([random.randint(0, h), random.randint(0, h)])
271
+ mask = Image.new("L", image.size, 0)
272
+ draw = ImageDraw.Draw(mask)
273
+ draw.rectangle([x1, y1, x2, y2], fill=255)
274
+ if random.random() > 0.5:
275
+ mask = Image.eval(mask, lambda a: 255 - a)
276
+ condition_img = Image.composite(
277
+ image, Image.new("RGB", image.size, (0, 0, 0)), mask
278
+ )
279
+ elif self.condition_type == "sr":
280
+ condition_img = image.resize((condition_size, condition_size)).convert(
281
+ "RGB"
282
+ )
283
+ position_delta = np.array([0, -condition_size // 16])
284
+
285
+ else:
286
+ raise ValueError(f"Condition type {self.condition_type} not implemented")
287
+
288
+ # Randomly drop text or image
289
+ drop_text = random.random() < self.drop_text_prob
290
+ drop_image = random.random() < self.drop_image_prob
291
+ if drop_text:
292
+ description = ""
293
+ if drop_image:
294
+ condition_img = Image.new(
295
+ "RGB", (condition_size, condition_size), (0, 0, 0)
296
+ )
297
+
298
+ return {
299
+ "image": self.to_tensor(image),
300
+ "condition": self.to_tensor(condition_img),
301
+ "condition_type": self.condition_type,
302
+ "description": description,
303
+ "position_delta": position_delta,
304
+ **({"pil_image": [image, condition_img]} if self.return_pil_image else {}),
305
+ **({"position_scale": position_scale} if position_scale != 1.0 else {}),
306
+ }
307
+
308
+
309
+ class CartoonDataset(Dataset):
310
+ def __init__(
311
+ self,
312
+ base_dataset,
313
+ condition_size: int = 1024,
314
+ target_size: int = 1024,
315
+ image_size: int = 1024,
316
+ padding: int = 0,
317
+ condition_type: str = "cartoon",
318
+ drop_text_prob: float = 0.1,
319
+ drop_image_prob: float = 0.1,
320
+ return_pil_image: bool = False,
321
+ ):
322
+ self.base_dataset = base_dataset
323
+ self.condition_size = condition_size
324
+ self.target_size = target_size
325
+ self.image_size = image_size
326
+ self.padding = padding
327
+ self.condition_type = condition_type
328
+ self.drop_text_prob = drop_text_prob
329
+ self.drop_image_prob = drop_image_prob
330
+ self.return_pil_image = return_pil_image
331
+
332
+ self.to_tensor = T.ToTensor()
333
+
334
+ def __len__(self):
335
+ return len(self.base_dataset)
336
+
337
+ def __getitem__(self, idx):
338
+ data = self.base_dataset[idx]
339
+ condition_img = data["condition"]
340
+ target_image = data["target"]
341
+
342
+ # Tag
343
+ tag = data["tags"][0]
344
+
345
+ target_description = data["target_description"]
346
+
347
+ description = {
348
+ "lion": "lion like animal",
349
+ "bear": "bear like animal",
350
+ "gorilla": "gorilla like animal",
351
+ "dog": "dog like animal",
352
+ "elephant": "elephant like animal",
353
+ "eagle": "eagle like bird",
354
+ "tiger": "tiger like animal",
355
+ "owl": "owl like bird",
356
+ "woman": "woman",
357
+ "parrot": "parrot like bird",
358
+ "mouse": "mouse like animal",
359
+ "man": "man",
360
+ "pigeon": "pigeon like bird",
361
+ "girl": "girl",
362
+ "panda": "panda like animal",
363
+ "crocodile": "crocodile like animal",
364
+ "rabbit": "rabbit like animal",
365
+ "boy": "boy",
366
+ "monkey": "monkey like animal",
367
+ "cat": "cat like animal",
368
+ }
369
+
370
+ # Resize the image
371
+ condition_img = condition_img.resize(
372
+ (self.condition_size, self.condition_size)
373
+ ).convert("RGB")
374
+ target_image = target_image.resize(
375
+ (self.target_size, self.target_size)
376
+ ).convert("RGB")
377
+
378
+ # Process datum to create description
379
+ description = data.get(
380
+ "description",
381
+ f"Photo of a {description[tag]} cartoon character in a white background. Character is facing {target_description['facing_direction']}. Character pose is {target_description['pose']}.",
382
+ )
383
+
384
+ # Randomly drop text or image
385
+ drop_text = random.random() < self.drop_text_prob
386
+ drop_image = random.random() < self.drop_image_prob
387
+ if drop_text:
388
+ description = ""
389
+ if drop_image:
390
+ condition_img = Image.new(
391
+ "RGB", (self.condition_size, self.condition_size), (0, 0, 0)
392
+ )
393
+
394
+ return {
395
+ "image": self.to_tensor(target_image),
396
+ "condition": self.to_tensor(condition_img),
397
+ "condition_type": self.condition_type,
398
+ "description": description,
399
+ # 16 is the downscale factor of the image
400
+ "position_delta": np.array([0, -16]),
401
+ }
src/train/model.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import lightning as L
2
+ from diffusers.pipelines import FluxPipeline
3
+ import torch
4
+ from peft import LoraConfig, get_peft_model_state_dict
5
+
6
+ import prodigyopt
7
+
8
+ from ..flux.transformer import tranformer_forward
9
+ from ..flux.condition import Condition
10
+ from ..flux.pipeline_tools import encode_images, prepare_text_input
11
+
12
+
13
+ class OminiModel(L.LightningModule):
14
+ def __init__(
15
+ self,
16
+ flux_pipe_id: str,
17
+ lora_path: str = None,
18
+ lora_config: dict = None,
19
+ device: str = "cuda",
20
+ dtype: torch.dtype = torch.bfloat16,
21
+ model_config: dict = {},
22
+ optimizer_config: dict = None,
23
+ gradient_checkpointing: bool = False,
24
+ ):
25
+ # Initialize the LightningModule
26
+ super().__init__()
27
+ self.model_config = model_config
28
+ self.optimizer_config = optimizer_config
29
+
30
+ # Load the Flux pipeline
31
+ self.flux_pipe: FluxPipeline = (
32
+ FluxPipeline.from_pretrained(flux_pipe_id).to(dtype=dtype).to(device)
33
+ )
34
+ self.transformer = self.flux_pipe.transformer
35
+ self.transformer.gradient_checkpointing = gradient_checkpointing
36
+ self.transformer.train()
37
+
38
+ # Freeze the Flux pipeline
39
+ self.flux_pipe.text_encoder.requires_grad_(False).eval()
40
+ self.flux_pipe.text_encoder_2.requires_grad_(False).eval()
41
+ self.flux_pipe.vae.requires_grad_(False).eval()
42
+
43
+ # Initialize LoRA layers
44
+ self.lora_layers = self.init_lora(lora_path, lora_config)
45
+
46
+ self.to(device).to(dtype)
47
+
48
+ def init_lora(self, lora_path: str, lora_config: dict):
49
+ assert lora_path or lora_config
50
+ if lora_path:
51
+ # TODO: Implement this
52
+ raise NotImplementedError
53
+ else:
54
+ self.transformer.add_adapter(LoraConfig(**lora_config))
55
+ # TODO: Check if this is correct (p.requires_grad)
56
+ lora_layers = filter(
57
+ lambda p: p.requires_grad, self.transformer.parameters()
58
+ )
59
+ return list(lora_layers)
60
+
61
+ def save_lora(self, path: str):
62
+ FluxPipeline.save_lora_weights(
63
+ save_directory=path,
64
+ transformer_lora_layers=get_peft_model_state_dict(self.transformer),
65
+ safe_serialization=True,
66
+ )
67
+
68
+ def configure_optimizers(self):
69
+ # Freeze the transformer
70
+ self.transformer.requires_grad_(False)
71
+ opt_config = self.optimizer_config
72
+
73
+ # Set the trainable parameters
74
+ self.trainable_params = self.lora_layers
75
+
76
+ # Unfreeze trainable parameters
77
+ for p in self.trainable_params:
78
+ p.requires_grad_(True)
79
+
80
+ # Initialize the optimizer
81
+ if opt_config["type"] == "AdamW":
82
+ optimizer = torch.optim.AdamW(self.trainable_params, **opt_config["params"])
83
+ elif opt_config["type"] == "Prodigy":
84
+ optimizer = prodigyopt.Prodigy(
85
+ self.trainable_params,
86
+ **opt_config["params"],
87
+ )
88
+ elif opt_config["type"] == "SGD":
89
+ optimizer = torch.optim.SGD(self.trainable_params, **opt_config["params"])
90
+ else:
91
+ raise NotImplementedError
92
+
93
+ return optimizer
94
+
95
+ def training_step(self, batch, batch_idx):
96
+ step_loss = self.step(batch)
97
+ self.log_loss = (
98
+ step_loss.item()
99
+ if not hasattr(self, "log_loss")
100
+ else self.log_loss * 0.95 + step_loss.item() * 0.05
101
+ )
102
+ return step_loss
103
+
104
+ def step(self, batch):
105
+ imgs = batch["image"]
106
+ conditions = batch["condition"]
107
+ condition_types = batch["condition_type"]
108
+ prompts = batch["description"]
109
+ position_delta = batch["position_delta"][0]
110
+ position_scale = float(batch.get("position_scale", [1.0])[0])
111
+
112
+ # Prepare inputs
113
+ with torch.no_grad():
114
+ # Prepare image input
115
+ x_0, img_ids = encode_images(self.flux_pipe, imgs)
116
+
117
+ # Prepare text input
118
+ prompt_embeds, pooled_prompt_embeds, text_ids = prepare_text_input(
119
+ self.flux_pipe, prompts
120
+ )
121
+
122
+ # Prepare t and x_t
123
+ t = torch.sigmoid(torch.randn((imgs.shape[0],), device=self.device))
124
+ x_1 = torch.randn_like(x_0).to(self.device)
125
+ t_ = t.unsqueeze(1).unsqueeze(1)
126
+ x_t = ((1 - t_) * x_0 + t_ * x_1).to(self.dtype)
127
+
128
+ # Prepare conditions
129
+ condition_latents, condition_ids = encode_images(self.flux_pipe, conditions)
130
+
131
+ # Add position delta
132
+ condition_ids[:, 1] += position_delta[0]
133
+ condition_ids[:, 2] += position_delta[1]
134
+
135
+ if position_scale != 1.0:
136
+ scale_bias = (position_scale - 1.0) / 2
137
+ condition_ids[:, 1] *= position_scale
138
+ condition_ids[:, 2] *= position_scale
139
+ condition_ids[:, 1] += scale_bias
140
+ condition_ids[:, 2] += scale_bias
141
+
142
+ # Prepare condition type
143
+ condition_type_ids = torch.tensor(
144
+ [
145
+ Condition.get_type_id(condition_type)
146
+ for condition_type in condition_types
147
+ ]
148
+ ).to(self.device)
149
+ condition_type_ids = (
150
+ torch.ones_like(condition_ids[:, 0]) * condition_type_ids[0]
151
+ ).unsqueeze(1)
152
+
153
+ # Prepare guidance
154
+ guidance = (
155
+ torch.ones_like(t).to(self.device)
156
+ if self.transformer.config.guidance_embeds
157
+ else None
158
+ )
159
+
160
+ # Forward pass
161
+ transformer_out = tranformer_forward(
162
+ self.transformer,
163
+ # Model config
164
+ model_config=self.model_config,
165
+ # Inputs of the condition (new feature)
166
+ condition_latents=condition_latents,
167
+ condition_ids=condition_ids,
168
+ condition_type_ids=condition_type_ids,
169
+ # Inputs to the original transformer
170
+ hidden_states=x_t,
171
+ timestep=t,
172
+ guidance=guidance,
173
+ pooled_projections=pooled_prompt_embeds,
174
+ encoder_hidden_states=prompt_embeds,
175
+ txt_ids=text_ids,
176
+ img_ids=img_ids,
177
+ joint_attention_kwargs=None,
178
+ return_dict=False,
179
+ )
180
+ pred = transformer_out[0]
181
+
182
+ # Compute loss
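+ # Flow-matching objective: x_t = (1 - t) * x_0 + t * x_1 interpolates the image
+ # latents x_0 toward noise x_1, so the regression target is the constant velocity
+ # (x_1 - x_0) along that straight path.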
183
+ loss = torch.nn.functional.mse_loss(pred, (x_1 - x_0), reduction="mean")
184
+ self.last_t = t.mean().item()
185
+ return loss
src/train/train.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import DataLoader
2
+ import torch
3
+ import lightning as L
4
+ import yaml
5
+ import os
6
+ import time
7
+ import re
8
+
9
+ from datasets import load_dataset
10
+
11
+ from .data import ImageConditionDataset, Subject200KDataset, CartoonDataset, SceneDataset
12
+ from .model import OminiModel
13
+ from .callbacks import TrainingCallback
14
+ import safetensors.torch
15
+ from peft import PeftModel
16
+
17
+ import os
18
+ from PIL import Image
19
+ import pandas as pd
20
+ from torch.utils.data import Dataset
21
+
22
+ from torchvision import transforms
23
+ from torch.utils.data import DataLoader
24
+
25
+ class LocalSubjectsDataset(Dataset):
26
+ def __init__(self, csv_file, image_dir, transform=None):
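+ # The CSV is expected to provide `imageA`, `prompt`, and `imageB` columns, where the
+ # image columns hold file names resolved relative to `image_dir`.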
27
+ self.data = pd.read_csv(csv_file)
28
+ self.image_dir = image_dir
29
+ self.transform = transform
30
+ self.features = {
31
+ 'imageA': 'PIL.Image',
32
+ 'prompt': 'str',
33
+ 'imageB': 'PIL.Image'
34
+ }
35
+
36
+ def __len__(self):
37
+ return len(self.data)
38
+
39
+ def __getitem__(self, idx):
40
+ # Get the file names for image A and image B, and the prompt text
41
+ imgA_value = self.data.iloc[idx]['imageA']
42
+ if isinstance(imgA_value, pd.Series):
43
+ imgA_value = imgA_value.values[0]
44
+ imgA_name = os.path.join(self.image_dir, str(imgA_value))
45
+
46
+ prompt = self.data.iloc[idx]['prompt']
47
+ imgB_value = self.data.iloc[idx]['imageB']
48
+ if isinstance(imgB_value, pd.Series):
49
+ imgB_value = imgB_value.values[0]
50
+ imgB_name = os.path.join(self.image_dir, str(imgB_value))
51
+
52
+ imageA = Image.open(imgA_name).convert("RGB")
53
+ imageB = Image.open(imgB_name).convert("RGB")
54
+
55
+ if self.transform:
56
+ imageA = self.transform(imageA)
57
+ imageB = self.transform(imageB)
58
+
59
+ sample = {'imageA': imageA, 'prompt': prompt, 'imageB': imageB}
60
+ return sample
61
+
62
+ transform = transforms.Compose([
63
+ transforms.Resize((600, 600)),
64
+ # transforms.ToTensor(),
65
+ ])
66
+
67
+
68
+ def get_rank():
69
+ try:
70
+ rank = int(os.environ.get("LOCAL_RANK"))
71
+ except:
72
+ rank = 0
73
+ return rank
74
+
75
+
76
+ def get_config():
77
+ config_path = os.environ.get("XFL_CONFIG")
78
+ assert config_path is not None, "Please set the XFL_CONFIG environment variable"
79
+ with open(config_path, "r") as f:
80
+ config = yaml.safe_load(f)
81
+ return config
82
+
83
+
84
+ def init_wandb(wandb_config, run_name):
85
+ import wandb
86
+ wandb.init(
87
+ project=wandb_config["project"],
88
+ name=run_name,
89
+ config={},
90
+ )
91
+
92
+
93
+ def main():
94
+ # Initialize
95
+ is_main_process, rank = get_rank() == 0, get_rank()
96
+ torch.cuda.set_device(rank)
97
+ config = get_config()
98
+ training_config = config["train"]
99
+ run_name = time.strftime("%Y%m%d-%H%M%S")
100
+
101
+ # Initialize WandB
102
+ wandb_config = training_config.get("wandb", None)
103
+ if wandb_config is not None and is_main_process:
104
+ init_wandb(wandb_config, run_name)
105
+
106
+ print("Rank:", rank)
107
+ if is_main_process:
108
+ print("Config:", config)
109
+
110
+ # Initialize dataset and dataloader
111
+ if training_config["dataset"]["type"] == "scene":
112
+ dataset = LocalSubjectsDataset(csv_file='csv_path', image_dir='images_path', transform=transform)
113
+ data_valid = dataset
114
+ print(data_valid.features)
115
+ print(len(data_valid))
116
+ print(training_config["dataset"])
117
+ dataset = SceneDataset(
118
+ data_valid,
119
+ condition_size=training_config["dataset"]["condition_size"],
120
+ target_size=training_config["dataset"]["target_size"],
121
+ image_size=training_config["dataset"]["image_size"],
122
+ padding=training_config["dataset"]["padding"],
123
+ condition_type=training_config["condition_type"],
124
+ drop_text_prob=training_config["dataset"]["drop_text_prob"],
125
+ drop_image_prob=training_config["dataset"]["drop_image_prob"],
126
+ )
127
+ elif training_config["dataset"]["type"] == "img":
128
+ # Load dataset text-to-image-2M
129
+ dataset = load_dataset(
130
+ "webdataset",
131
+ data_files={"train": training_config["dataset"]["urls"]},
132
+ split="train",
133
+ cache_dir="cache/t2i2m",
134
+ num_proc=32,
135
+ )
136
+ dataset = ImageConditionDataset(
137
+ dataset,
138
+ condition_size=training_config["dataset"]["condition_size"],
139
+ target_size=training_config["dataset"]["target_size"],
140
+ condition_type=training_config["condition_type"],
141
+ drop_text_prob=training_config["dataset"]["drop_text_prob"],
142
+ drop_image_prob=training_config["dataset"]["drop_image_prob"],
143
+ position_scale=training_config["dataset"].get("position_scale", 1.0),
144
+ )
145
+ elif training_config["dataset"]["type"] == "cartoon":
146
+ dataset = load_dataset("saquiboye/oye-cartoon", split="train")
147
+ dataset = CartoonDataset(
148
+ dataset,
149
+ condition_size=training_config["dataset"]["condition_size"],
150
+ target_size=training_config["dataset"]["target_size"],
151
+ image_size=training_config["dataset"]["image_size"],
152
+ padding=training_config["dataset"]["padding"],
153
+ condition_type=training_config["condition_type"],
154
+ drop_text_prob=training_config["dataset"]["drop_text_prob"],
155
+ drop_image_prob=training_config["dataset"]["drop_image_prob"],
156
+ )
157
+ elif training_config["dataset"]["type"] == "scene":
158
+ dataset = dataset
159
+ else:
160
+ raise NotImplementedError
161
+
162
+ print("Dataset length:", len(dataset))
163
+ train_loader = DataLoader(
164
+ dataset,
165
+ batch_size=training_config["batch_size"],
166
+ shuffle=True,
167
+ num_workers=training_config["dataloader_workers"],
168
+ )
169
+ print("Trainloader generated.")
170
+
171
+ # Initialize model
172
+ trainable_model = OminiModel(
173
+ flux_pipe_id=config["flux_path"],
174
+ lora_config=training_config["lora_config"],
175
+ device=f"cuda",
176
+ dtype=getattr(torch, config["dtype"]),
177
+ optimizer_config=training_config["optimizer"],
178
+ model_config=config.get("model", {}),
179
+ gradient_checkpointing=training_config.get("gradient_checkpointing", False),
180
+ )
181
+
182
+ training_callbacks = (
183
+ [TrainingCallback(run_name, training_config=training_config)]
184
+ if is_main_process
185
+ else []
186
+ )
187
+
188
+ # Initialize trainer
189
+ trainer = L.Trainer(
190
+ accumulate_grad_batches=training_config["accumulate_grad_batches"],
191
+ callbacks=training_callbacks,
192
+ enable_checkpointing=False,
193
+ enable_progress_bar=False,
194
+ logger=False,
195
+ max_steps=training_config.get("max_steps", -1),
196
+ max_epochs=training_config.get("max_epochs", -1),
197
+ gradient_clip_val=training_config.get("gradient_clip_val", 0.5),
198
+ )
199
+
200
+ setattr(trainer, "training_config", training_config)
201
+
202
+ # Save config
203
+ save_path = training_config.get("save_path", "./output")
204
+ if is_main_process:
205
+ os.makedirs(f"{save_path}/{run_name}")
206
+ with open(f"{save_path}/{run_name}/config.yaml", "w") as f:
207
+ yaml.dump(config, f)
208
+
209
+ # Start training
210
+ trainer.fit(trainable_model, train_loader)
211
+
212
+
213
+ if __name__ == "__main__":
214
+ main()
train/README.md ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OminiControl Training 🛠️
2
+
3
+ ## Preparation
4
+
5
+ ### Setup
6
+ 1. **Environment**
7
+ ```bash
8
+ conda create -n omini python=3.10
9
+ conda activate omini
10
+ ```
11
+ 2. **Requirements**
12
+ ```bash
13
+ pip install -r train/requirements.txt
14
+ ```
15
+
16
+ ### Dataset
17
+ 1. Download dataset [Subject200K](https://huggingface.co/datasets/Yuanshi/Subjects200K). (**subject-driven generation**)
18
+ ```
19
+ bash train/script/data_download/data_download1.sh
20
+ ```
21
+ 2. Download dataset [text-to-image-2M](https://huggingface.co/datasets/jackyhate/text-to-image-2M). (**spatial control task**)
22
+ ```
23
+ bash train/script/data_download/data_download2.sh
24
+ ```
25
+ **Note:** By default, only a few files are downloaded. You can modify `data_download2.sh` to download additional datasets. Remember to update the config file to specify the training data accordingly.
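+ For example, after downloading extra shards, extend the `urls` list under `dataset` in your config (a minimal sketch based on `train/config/canny_512.yaml`):
+ ```yaml
+ dataset:
+   type: "img"
+   urls:
+     - "https://huggingface.co/datasets/jackyhate/text-to-image-2M/resolve/main/data_512_2M/data_000046.tar"
+     - "https://huggingface.co/datasets/jackyhate/text-to-image-2M/resolve/main/data_512_2M/data_000045.tar"
+     # append any additional shards you downloaded here
+ ```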
26
+
27
+ ## Training
28
+
29
+ ### Start training
30
+ **Config file path**: `./train/config`
31
+
32
+ **Scripts path**: `./train/script`
33
+
34
+ 1. Subject-driven generation
35
+ ```bash
36
+ bash train/script/train_subject.sh
37
+ ```
38
+ 2. Spatial control task
39
+ ```bash
40
+ bash train/script/train_canny.sh
41
+ ```
42
+
43
+ **Note**: Detailed WandB and GPU settings can be found in the script files and the config files.
44
+
45
+ ### Other spatial control tasks
46
+ This repository supports the following spatial control tasks:
47
+ 1. Canny edge to image (`canny`)
48
+ 2. Image colorization (`coloring`)
49
+ 3. Image deblurring (`deblurring`)
50
+ 4. Depth map to image (`depth`)
51
+ 5. Image to depth map (`depth_pred`)
52
+ 6. Image inpainting (`fill`)
53
+ 7. Super resolution (`sr`)
54
+
55
+ You can modify the `condition_type` parameter in config file `config/canny_512.yaml` to switch between different tasks.
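+ For example, switching this config to depth-to-image training is a one-line change (a minimal sketch; every other value stays as in `canny_512.yaml`):
+ ```yaml
+ # Options: ["canny", "coloring", "deblurring", "depth", "depth_pred", "fill", "sr"]
+ condition_type: "depth"
+ ```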
56
+
57
+ ### Customize your own task
58
+ You can customize your own task by constructing a new dataset and modifying the training code.
59
+
60
+ <details>
61
+ <summary>Instructions</summary>
62
+
63
+ 1. **Dataset** :
64
+
65
+ Construct a new dataset with the following format: (`src/train/data.py`)
66
+ ```python
67
+ class MyDataset(Dataset):
68
+ def __init__(self, ...):
69
+ ...
70
+ def __len__(self):
71
+ ...
72
+ def __getitem__(self, idx):
73
+ ...
74
+ return {
75
+ "image": image,
76
+ "condition": condition_img,
77
+ "condition_type": "your_condition_type",
78
+ "description": description,
79
+ "position_delta": position_delta
80
+ }
81
+ ```
82
+ **Note:** For spatial control tasks, set `position_delta` to `[0, 0]`. For non-spatial control tasks (e.g. subject-driven generation), set it to `[0, -condition_width // 16]` (see the sketch at the end of these instructions).
83
+ 2. **Condition**:
84
+
85
+ Add a new condition type in the `Condition` class. (`src/flux/condition.py`)
86
+ ```python
87
+ condition_dict = {
88
+ ...
89
+ "your_condition_type": your_condition_id_number, # Add your condition type here
90
+ }
91
+ ...
92
+ if condition_type in [
93
+ ...
94
+ "your_condition_type", # Add your condition type here
95
+ ]:
96
+ ...
97
+ ```
98
+ 3. **Test**:
99
+
100
+ Add a new test function for your task. (`src/train/callbacks.py`)
101
+ ```python
102
+ if self.condition_type == "your_condition_type":
103
+ condition_img = (
104
+ Image.open("images/vase.jpg")
105
+ .resize((condition_size, condition_size))
106
+ .convert("RGB")
107
+ )
108
+ ...
109
+ test_list.append((condition_img, [0, 0], "A beautiful vase on a table."))
110
+ ```
111
+
112
+ 4. **Import the relevant dataset in the training script**
113
+ Update the imports and the dataset/dataloader initialization in `src/train/train.py`, as in the following snippet (see also the sketch below).
114
+ ```python
115
+ from .data import (
116
+ ImageConditionDataset,
117
+ Subject200KDateset,
118
+ MyDataset
119
+ )
120
+ ...
121
+
122
+ # Initialize dataset and dataloader
123
+ if training_config["dataset"]["type"] == "your_condition_type":
124
+ ...
125
+ ```
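+
+ A minimal sketch of what this branch might look like is shown below. It is illustrative only: it assumes `MyDataset` accepts the fields of the config's `dataset` section as keyword arguments and that the dataloader mirrors the existing ones; adapt the names to your implementation.
+ ```python
+ from torch.utils.data import DataLoader
+
+ # `training_config` is the parsed `train` section of the YAML config.
+ if training_config["dataset"]["type"] == "your_condition_type":
+     dataset = MyDataset(
+         condition_size=training_config["dataset"]["condition_size"],
+         target_size=training_config["dataset"]["target_size"],
+         condition_type=training_config["condition_type"],
+         drop_text_prob=training_config["dataset"]["drop_text_prob"],
+         drop_image_prob=training_config["dataset"]["drop_image_prob"],
+     )
+     # For a non-spatial task with a 512px condition image, __getitem__ should
+     # return position_delta = [0, -512 // 16] = [0, -32]; spatial tasks use [0, 0].
+     train_loader = DataLoader(
+         dataset,
+         batch_size=training_config["batch_size"],
+         num_workers=training_config["dataloader_workers"],
+         shuffle=True,
+     )
+ ```
+ The existing dataset classes imported above (`src/train/data.py`) are good references for the expected sample format.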
126
+
127
+ </details>
128
+
129
+ ## Hardware requirement
130
+ **Note**: Memory optimization (like dynamic T5 model loading) is pending implementation.
131
+
132
+ **Recommended**
133
+ - Hardware: 2x NVIDIA H100 GPUs
134
+ - Memory: ~80GB GPU memory
135
+
136
+ **Minimal**
137
+ - Hardware: 1x NVIDIA L20 GPU
138
+ - Memory: ~48GB GPU memory
train/config/canny_512.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flux_path: "black-forest-labs/FLUX.1-dev"
2
+ dtype: "bfloat16"
3
+
4
+ model:
5
+ union_cond_attn: true
6
+ add_cond_attn: false
7
+ latent_lora: false
8
+
9
+ train:
10
+ batch_size: 1
11
+ accumulate_grad_batches: 1
12
+ dataloader_workers: 5
13
+ save_interval: 1000
14
+ sample_interval: 100
15
+ max_steps: -1
16
+ gradient_checkpointing: true
17
+ save_path: "runs"
18
+
19
+ # Specify the type of condition to use.
20
+ # Options: ["canny", "coloring", "deblurring", "depth", "depth_pred", "fill"]
21
+ condition_type: "canny"
22
+ dataset:
23
+ type: "img"
24
+ urls:
25
+ - "https://huggingface.co/datasets/jackyhate/text-to-image-2M/resolve/main/data_512_2M/data_000046.tar"
26
+ - "https://huggingface.co/datasets/jackyhate/text-to-image-2M/resolve/main/data_512_2M/data_000045.tar"
27
+ cache_name: "data_512_2M"
28
+ condition_size: 512
29
+ target_size: 512
30
+ drop_text_prob: 0.1
31
+ drop_image_prob: 0.1
32
+
33
+ wandb:
34
+ project: "OminiControl"
35
+
36
+ lora_config:
37
+ r: 4
38
+ lora_alpha: 4
39
+ init_lora_weights: "gaussian"
40
+ target_modules: "(.*x_embedder|.*(?<!single_)transformer_blocks\\.[0-9]+\\.norm1\\.linear|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_k|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_q|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_v|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_out\\.0|.*(?<!single_)transformer_blocks\\.[0-9]+\\.ff\\.net\\.2|.*single_transformer_blocks\\.[0-9]+\\.norm\\.linear|.*single_transformer_blocks\\.[0-9]+\\.proj_mlp|.*single_transformer_blocks\\.[0-9]+\\.proj_out|.*single_transformer_blocks\\.[0-9]+\\.attn.to_k|.*single_transformer_blocks\\.[0-9]+\\.attn.to_q|.*single_transformer_blocks\\.[0-9]+\\.attn.to_v|.*single_transformer_blocks\\.[0-9]+\\.attn.to_out)"
41
+
42
+ optimizer:
43
+ type: "Prodigy"
44
+ params:
45
+ lr: 1
46
+ use_bias_correction: true
47
+ safeguard_warmup: true
48
+ weight_decay: 0.01
train/config/cartoon_512.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flux_path: "black-forest-labs/FLUX.1-dev"
2
+ dtype: "bfloat16"
3
+
4
+ model:
5
+ union_cond_attn: true
6
+ add_cond_attn: false
7
+ latent_lora: false
8
+
9
+ train:
10
+ batch_size: 1
11
+ accumulate_grad_batches: 1
12
+ dataloader_workers: 8
13
+ save_interval: 1000
14
+ sample_interval: 100
15
+ max_steps: 15000
16
+ gradient_checkpointing: false
17
+ save_path: "runs"
18
+
19
+ condition_type: "cartoon"
20
+ dataset:
21
+ type: "cartoon"
22
+ condition_size: 512
23
+ target_size: 512
24
+ image_size: 512
25
+ padding: 0
26
+ drop_text_prob: 0.1
27
+ drop_image_prob: 0.0
28
+
29
+ wandb:
30
+ project: "OminiControl"
31
+
32
+ lora_config:
33
+ r: 4
34
+ lora_alpha: 4
35
+ init_lora_weights: "gaussian"
36
+ target_modules: "(.*x_embedder|.*(?<!single_)transformer_blocks\\.[0-9]+\\.norm1\\.linear|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_k|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_q|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_v|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_out\\.0|.*(?<!single_)transformer_blocks\\.[0-9]+\\.ff\\.net\\.2|.*single_transformer_blocks\\.[0-9]+\\.norm\\.linear|.*single_transformer_blocks\\.[0-9]+\\.proj_mlp|.*single_transformer_blocks\\.[0-9]+\\.proj_out|.*single_transformer_blocks\\.[0-9]+\\.attn.to_k|.*single_transformer_blocks\\.[0-9]+\\.attn.to_q|.*single_transformer_blocks\\.[0-9]+\\.attn.to_v|.*single_transformer_blocks\\.[0-9]+\\.attn.to_out)"
37
+
38
+ optimizer:
39
+ type: "Prodigy"
40
+ params:
41
+ lr: 2
42
+ use_bias_correction: true
43
+ safeguard_warmup: true
44
+ weight_decay: 0.01
train/config/fill_1024.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flux_path: "black-forest-labs/FLUX.1-dev"
2
+ dtype: "bfloat16"
3
+
4
+ model:
5
+ union_cond_attn: true
6
+ add_cond_attn: false
7
+ latent_lora: false
8
+
9
+ train:
10
+ batch_size: 1
11
+ accumulate_grad_batches: 1
12
+ dataloader_workers: 5
13
+ save_interval: 1000
14
+ sample_interval: 100
15
+ max_steps: -1
16
+ gradient_checkpointing: true
17
+ save_path: "runs"
18
+
19
+ # Specify the type of condition to use.
20
+ # Options: ["canny", "coloring", "deblurring", "depth", "depth_pred", "fill"]
21
+ condition_type: "fill"
22
+ dataset:
23
+ type: "img"
24
+ urls:
25
+ - "https://huggingface.co/datasets/jackyhate/text-to-image-2M/resolve/main/data_1024_10K/data_000000.tar"
26
+ cache_name: "data_1024_10K"
27
+ condition_size: 1024
28
+ target_size: 1024
29
+ drop_text_prob: 0.1
30
+ drop_image_prob: 0.1
31
+
32
+ wandb:
33
+ project: "OminiControl"
34
+
35
+ lora_config:
36
+ r: 4
37
+ lora_alpha: 4
38
+ init_lora_weights: "gaussian"
39
+ target_modules: "(.*x_embedder|.*(?<!single_)transformer_blocks\\.[0-9]+\\.norm1\\.linear|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_k|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_q|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_v|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_out\\.0|.*(?<!single_)transformer_blocks\\.[0-9]+\\.ff\\.net\\.2|.*single_transformer_blocks\\.[0-9]+\\.norm\\.linear|.*single_transformer_blocks\\.[0-9]+\\.proj_mlp|.*single_transformer_blocks\\.[0-9]+\\.proj_out|.*single_transformer_blocks\\.[0-9]+\\.attn.to_k|.*single_transformer_blocks\\.[0-9]+\\.attn.to_q|.*single_transformer_blocks\\.[0-9]+\\.attn.to_v|.*single_transformer_blocks\\.[0-9]+\\.attn.to_out)"
40
+
41
+ optimizer:
42
+ type: "Prodigy"
43
+ params:
44
+ lr: 1
45
+ use_bias_correction: true
46
+ safeguard_warmup: true
47
+ weight_decay: 0.01
train/config/scene_512.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flux_path: "black-forest-labs/FLUX.1-dev"
2
+ dtype: "bfloat16"
3
+
4
+ model:
5
+ union_cond_attn: true
6
+ add_cond_attn: false
7
+ latent_lora: true
8
+
9
+ train:
10
+ batch_size: 1
11
+ accumulate_grad_batches: 1
12
+ dataloader_workers: 5
13
+ save_interval: 2000
14
+ sample_interval: 100
15
+ max_steps: -1
16
+ gradient_checkpointing: false
17
+ save_path: "save_path"
18
+
19
+ condition_type: "scene"
20
+ dataset:
21
+ type: "scene"
22
+ condition_size: 512
23
+ target_size: 512
24
+ image_size: 512
25
+ padding: 8
26
+ drop_text_prob: 0.1
27
+ drop_image_prob: 0.1
28
+
29
+ wandb:
30
+ project: "OminiControl"
31
+
32
+ lora_config:
33
+ r: 128
34
+ lora_alpha: 128
35
+ init_lora_weights: "gaussian"
36
+ target_modules: "(.*x_embedder|.*(?<!single_)transformer_blocks\\.[0-9]+\\.norm1\\.linear|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_k|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_q|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_v|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_out\\.0|.*(?<!single_)transformer_blocks\\.[0-9]+\\.ff\\.net\\.2|.*single_transformer_blocks\\.[0-9]+\\.norm\\.linear|.*single_transformer_blocks\\.[0-9]+\\.proj_mlp|.*single_transformer_blocks\\.[0-9]+\\.proj_out|.*single_transformer_blocks\\.[0-9]+\\.attn.to_k|.*single_transformer_blocks\\.[0-9]+\\.attn.to_q|.*single_transformer_blocks\\.[0-9]+\\.attn.to_v|.*single_transformer_blocks\\.[0-9]+\\.attn.to_out)"
37
+
38
+
39
+ optimizer:
40
+ type: "Prodigy"
41
+ params:
42
+ lr: 1
43
+ use_bias_correction: true
44
+ safeguard_warmup: true
45
+ weight_decay: 0.01
train/config/sr_512.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flux_path: "black-forest-labs/FLUX.1-dev"
2
+ dtype: "bfloat16"
3
+
4
+ model:
5
+ union_cond_attn: true
6
+ add_cond_attn: false
7
+ latent_lora: false
8
+
9
+ train:
10
+ batch_size: 1
11
+ accumulate_grad_batches: 1
12
+ dataloader_workers: 5
13
+ save_interval: 1000
14
+ sample_interval: 100
15
+ max_steps: -1
16
+ gradient_checkpointing: true
17
+ save_path: "runs"
18
+
19
+ # Specify the type of condition to use.
20
+ # Options: ["canny", "coloring", "deblurring", "depth", "depth_pred", "fill", "sr"]
21
+ condition_type: "sr"
22
+ dataset:
23
+ type: "img"
24
+ urls:
25
+ - "https://huggingface.co/datasets/jackyhate/text-to-image-2M/resolve/main/data_512_2M/data_000046.tar"
26
+ - "https://huggingface.co/datasets/jackyhate/text-to-image-2M/resolve/main/data_512_2M/data_000045.tar"
27
+ cache_name: "data_512_2M"
28
+ condition_size: 256
29
+ target_size: 512
30
+ drop_text_prob: 0.1
31
+ drop_image_prob: 0.1
32
+
33
+ wandb:
34
+ project: "OminiControl"
35
+
36
+ lora_config:
37
+ r: 4
38
+ lora_alpha: 4
39
+ init_lora_weights: "gaussian"
40
+ target_modules: "(.*x_embedder|.*(?<!single_)transformer_blocks\\.[0-9]+\\.norm1\\.linear|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_k|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_q|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_v|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_out\\.0|.*(?<!single_)transformer_blocks\\.[0-9]+\\.ff\\.net\\.2|.*single_transformer_blocks\\.[0-9]+\\.norm\\.linear|.*single_transformer_blocks\\.[0-9]+\\.proj_mlp|.*single_transformer_blocks\\.[0-9]+\\.proj_out|.*single_transformer_blocks\\.[0-9]+\\.attn.to_k|.*single_transformer_blocks\\.[0-9]+\\.attn.to_q|.*single_transformer_blocks\\.[0-9]+\\.attn.to_v|.*single_transformer_blocks\\.[0-9]+\\.attn.to_out)"
41
+
42
+ optimizer:
43
+ type: "Prodigy"
44
+ params:
45
+ lr: 1
46
+ use_bias_correction: true
47
+ safeguard_warmup: true
48
+ weight_decay: 0.01
train/config/subject_512.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flux_path: "black-forest-labs/FLUX.1-dev"
2
+ dtype: "bfloat16"
3
+
4
+ model:
5
+ union_cond_attn: true
6
+ add_cond_attn: false
7
+ latent_lora: true
8
+
9
+ train:
10
+ batch_size: 1
11
+ accumulate_grad_batches: 1
12
+ dataloader_workers: 5
13
+ save_interval: 1000
14
+ sample_interval: 100
15
+ max_steps: -1
16
+ gradient_checkpointing: true
17
+ save_path: "runs"
18
+
19
+ condition_type: "subject"
20
+ dataset:
21
+ type: "subject"
22
+ condition_size: 512
23
+ target_size: 512
24
+ image_size: 512
25
+ padding: 8
26
+ drop_text_prob: 0.1
27
+ drop_image_prob: 0.1
28
+
29
+ wandb:
30
+ project: "OminiControl"
31
+
32
+ lora_config:
33
+ r: 4
34
+ lora_alpha: 4
35
+ init_lora_weights: "gaussian"
36
+ target_modules: "(.*x_embedder|.*(?<!single_)transformer_blocks\\.[0-9]+\\.norm1\\.linear|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_k|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_q|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_v|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_out\\.0|.*(?<!single_)transformer_blocks\\.[0-9]+\\.ff\\.net\\.2|.*single_transformer_blocks\\.[0-9]+\\.norm\\.linear|.*single_transformer_blocks\\.[0-9]+\\.proj_mlp|.*single_transformer_blocks\\.[0-9]+\\.proj_out|.*single_transformer_blocks\\.[0-9]+\\.attn.to_k|.*single_transformer_blocks\\.[0-9]+\\.attn.to_q|.*single_transformer_blocks\\.[0-9]+\\.attn.to_v|.*single_transformer_blocks\\.[0-9]+\\.attn.to_out)"
37
+
38
+ optimizer:
39
+ type: "Prodigy"
40
+ params:
41
+ lr: 1
42
+ use_bias_correction: true
43
+ safeguard_warmup: true
44
+ weight_decay: 0.01
train/requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ diffusers==0.31.0
2
+ transformers
3
+ peft
4
+ opencv-python
5
+ protobuf
6
+ sentencepiece
7
+ gradio
8
+ jupyter
9
+ torchao
10
+
11
+ lightning
12
+ datasets
13
+ torchvision
14
+ prodigyopt
15
+ wandb
train/script/data_download/data_download1.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ huggingface-cli download --repo-type dataset Yuanshi/Subjects200K
train/script/data_download/data_download2.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ huggingface-cli download --repo-type dataset jackyhate/text-to-image-2M data_512_2M/data_000045.tar
2
+ huggingface-cli download --repo-type dataset jackyhate/text-to-image-2M data_512_2M/data_000046.tar
3
+ huggingface-cli download --repo-type dataset jackyhate/text-to-image-2M data_1024_10K/data_000000.tar
train/script/train_canny.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Specify the config file path and the GPU devices to use
2
+ # export CUDA_VISIBLE_DEVICES=0,1
3
+
4
+ # Specify the config file path
5
+ export XFL_CONFIG=./train/config/canny_512.yaml
6
+
7
+ # Specify the WANDB API key
8
+ # export WANDB_API_KEY='YOUR_WANDB_API_KEY'
9
+
10
+ echo $XFL_CONFIG
11
+ export TOKENIZERS_PARALLELISM=true
12
+
13
+ accelerate launch --main_process_port 41353 -m src.train.train
train/script/train_cartoon.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Specify the config file path and the GPU devices to use
2
+ # export CUDA_VISIBLE_DEVICES=0,1
3
+
4
+ # Specify the config file path
5
+ export XFL_CONFIG=./train/config/cartoon_512.yaml
6
+
7
+ export HF_HUB_CACHE=./cache
8
+
9
+ # Specify the WANDB API key
10
+ # export WANDB_API_KEY='YOUR_WANDB_API_KEY'
11
+
12
+ echo $XFL_CONFIG
13
+ export TOKENIZERS_PARALLELISM=true
14
+
15
+ accelerate launch --main_process_port 41353 -m src.train.train
train/script/train_scene.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Specify the config file path and the GPU devices to use
2
+ # export CUDA_VISIBLE_DEVICES=0,1
3
+
4
+ # Specify the config file path
5
+ export XFL_CONFIG=./train/config/scene_512.yaml
6
+
7
+ # Specify the WANDB API key
8
+ # export WANDB_API_KEY='YOUR_WANDB_API_KEY'
9
+
10
+ echo $XFL_CONFIG
11
+ export TOKENIZERS_PARALLELISM=true
12
+
13
+ accelerate launch --main_process_port 41353 -m src.train.train
train/script/train_subject.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Specify the config file path and the GPU devices to use
2
+ # export CUDA_VISIBLE_DEVICES=0,1
3
+
4
+ # Specify the config file path
5
+ export XFL_CONFIG=./train/config/subject_512.yaml
6
+
7
+ # Specify the WANDB API key
8
+ # export WANDB_API_KEY='YOUR_WANDB_API_KEY'
9
+
10
+ echo $XFL_CONFIG
11
+ export TOKENIZERS_PARALLELISM=true
12
+
13
+ accelerate launch --main_process_port 41353 -m src.train.train
utils.py ADDED
@@ -0,0 +1,591 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from diffusers.pipelines import FluxPipeline
4
+ from src.flux.condition import Condition
5
+ from PIL import Image
6
+ import argparse
7
+ import os
8
+ import json
9
+ import base64
10
+ import io
11
+ import re
12
+ from PIL import Image, ImageFilter
13
+ from transformers import AutoModelForCausalLM, AutoTokenizer
14
+ from scipy.ndimage import binary_dilation
15
+ import cv2
16
+ import openai
17
+ from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
18
+
19
+
20
+ from src.flux.generate import generate, seed_everything
21
+
22
+ try:
23
+ from mmengine.visualization import Visualizer
24
+ except ImportError:
25
+ Visualizer = None
26
+ print("Warning: mmengine is not installed, visualization is disabled.")
27
+
28
+ import re
29
+
30
+ def encode_image_to_datauri(path, size=(512, 512)):
31
+ with Image.open(path).convert('RGB') as img:
32
+ img = img.resize(size, Image.LANCZOS)
33
+ buffer = io.BytesIO()
34
+ img.save(buffer, format='PNG')
35
+ b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
36
+ return b64
37
+ # return f"data:image/png;base64,{b64}"
38
+
39
+
40
+ @retry(
41
+ reraise=True,
42
+ wait=wait_exponential(min=1, max=60),
43
+ stop=stop_after_attempt(6),
44
+ retry=retry_if_exception_type((openai.error.RateLimitError, openai.error.APIError))
45
+ )
46
+ def cot_with_gpt(image_uri, instruction):
47
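+ # Ask GPT-4o to decompose the requested edit into atomic editing instructions;
+ # returns parallel lists (categories, instructions) parsed from the numbered-list reply.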
+ response = openai.ChatCompletion.create(
48
+ model="gpt-4o",
49
+ messages=[
50
+ {
51
+ "role": "user",
52
+ "content": [
53
+ {"type": "text", "text": f'''
54
+ Now you are an expert in image editing. Based on the given single image, what atomic image editing instructions should be if the user wants to {instruction}? Let's think step by step.
55
+ Atomic instructions include 13 categories as follows:
56
+ - Add: e.g.: add a car on the road
57
+ - Remove: e.g.: remove the sofa in the image
58
+ - Color Change: e.g.: change the color of the shoes to blue
59
+ - Material Change: e.g.: change the material of the sign like stone
60
+ - Action Change: e.g.: change the action of the boy to raising hands
61
+ - Expression Change: e.g.: change the expression to smile
62
+ - Replace: e.g.: replace the coffee with an apple
63
+ - Background Change: e.g.: change the background into forest
64
+ - Appearance Change: e.g.: make the cup have a floral pattern
65
+ - Move: e.g.: move the plane to the left
66
+ - Resize: e.g.: enlarge the clock
67
+ - Tone Transfer: e.g.: change the weather to foggy
68
+ - Style Change: e.g.: make the style of the image to cartoon
69
+ Respond *only* with a numbered list.
70
+ Each line must begin with the category in square brackets, then the instruction. Please strictly follow the atomic categories.
71
+ The operation (what) and the target (to what) are crystal clear.
72
+ Do not split replace to add and remove.
73
+ For example:
74
+ “1. [Add] add a car on the road\n
75
+ 2. [Color Change] change the color of the shoes to blue\n
76
+ 3. [Move] move the lamp to the left\n"
77
+ Do not include any extra text, explanations, JSON or markdown—just the list.
78
+ '''},
79
+ {
80
+ "type": "image_url",
81
+ "image_url": {
82
+ "url": f"data:image/jpeg;base64,{image_uri}"
83
+ }
84
+ },
85
+ ],
86
+ }
87
+ ],
88
+ max_tokens=300,
89
+ )
90
+ text = response.choices[0].message.content.strip()
91
+ print(text)
92
+
93
+ categories, instructions = extract_instructions(text)
94
+ return categories, instructions
95
+
96
+
97
+ def extract_instructions(text):
98
+ categories = []
99
+ instructions = []
100
+
101
+ pattern = r'^\s*\d+\.\s*\[(.*?)\]\s*(.*?)$'
102
+
103
+ for line in text.split('\n'):
104
+ line = line.strip()
105
+ if not line:
106
+ continue
107
+
108
+ match = re.match(pattern, line)
109
+ if match:
110
+ category = match.group(1).strip()
111
+ instruction = match.group(2).strip()
112
+
113
+ if category and instruction:
114
+ categories.append(category)
115
+ instructions.append(instruction)
116
+
117
+ return categories, instructions
118
+
119
+ def extract_last_bbox(result):
120
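+ # Matches entries like ('name', [x0, y0, x1, y1]), optionally wrapped in outer brackets;
+ # group 1 is the object name, groups 2-5 are the coordinates.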
+ pattern = r'\[?\(\'([^\']+)\',\s*\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]\)\]?'
121
+ matches = re.findall(pattern, result)
122
+
123
+ if not matches:
124
+ simple_pattern = r'\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]'
125
+ simple_matches = re.findall(simple_pattern, result)
126
+ if simple_matches:
127
+ x0, y0, x1, y1 = map(int, simple_matches[-1])
128
+ return [x0, y0, x1, y1]
129
+ else:
130
+ print(f"No bounding boxes found, please try again: {result}")
131
+ return None
132
+
133
+ last_match = matches[-1]
134
+ x0, y0, x1, y1 = map(int, last_match[1:])
135
+ return x0, y0, x1, y1
136
+
137
+
138
+ def infer_with_DiT(task, image, instruction, category):
139
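+ # Run the FLUX pipeline with a task-specific LoRA. Returns the edited image, except for
+ # RoI Editing with Action Change, where it returns (changed_instance, x0, y1, 1)
+ # describing the re-posed object and its placement.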
+ # seed_everything(3407)
140
+
141
+ if task == 'RoI Inpainting':
142
+ if category == 'Add' or category == 'Replace':
143
+ lora_path = "weights/add.safetensors"
144
+ added = extract_object_with_gpt(instruction)
145
+ instruction_dit = f"add {added} on the black region"
146
+ elif category == 'Remove' or category == 'Action Change':
147
+ lora_path = "weights/remove.safetensors"
148
+ instruction_dit = f"Fill the hole of the image"
149
+
150
+ condition = Condition("scene", image, position_delta=(0, 0))
151
+ elif task == 'RoI Editing':
152
+ image = Image.open(image).convert('RGB').resize((512, 512))
153
+ condition = Condition("scene", image, position_delta=(0, -32))
154
+ instruction_dit = instruction
155
+ if category == 'Action Change':
156
+ lora_path = "weights/action.safetensors"
157
+ elif category == 'Expression Change':
158
+ lora_path = "weights/expression.safetensors"
159
+ elif category == 'Add':
160
+ lora_path = "weights/addition.safetensors"
161
+ elif category == 'Material Change':
162
+ lora_path = "weights/material.safetensors"
163
+ elif category == 'Color Change':
164
+ lora_path = "weights/color.safetensors"
165
+ elif category == 'Background Change':
166
+ lora_path = "weights/bg.safetensors"
167
+ elif category == 'Appearance Change':
168
+ lora_path = "weights/appearance.safetensors"
169
+
170
+ elif task == 'RoI Compositioning':
171
+ lora_path = "weights/fusion.safetensors"
172
+ condition = Condition("scene", image, position_delta=(0, 0))
173
+ instruction_dit = "inpaint the black-bordered region so that the object's edges blend smoothly with the background"
174
+
175
+ elif task == 'Global Transformation':
176
+ image = Image.open(image).convert('RGB').resize((512, 512))
177
+ instruction_dit = instruction
178
+ lora_path = "weights/overall.safetensors"
179
+
180
+ condition = Condition("scene", image, position_delta=(0, -32))
181
+ else:
182
+ raise ValueError(f"Invalid task: '{task}'")
183
+ pipe = FluxPipeline.from_pretrained(
184
+ "black-forest-labs/FLUX.1-dev",
185
+ torch_dtype=torch.bfloat16
186
+ )
187
+
188
+ pipe = pipe.to("cuda")
189
+
190
+ pipe.load_lora_weights(
191
+ "Cicici1109/IEAP",
192
+ weight_name=lora_path,
193
+ adapter_name="scene",
194
+ )
195
+ result_img = generate(
196
+ pipe,
197
+ prompt=instruction_dit,
198
+ conditions=[condition],
199
+ config_path="train/config/scene_512.yaml",
200
+ num_inference_steps=28,
201
+ height=512,
202
+ width=512,
203
+ ).images[0]
204
+ # result_img
205
+ if task == 'RoI Editing' and category == 'Action Change':
206
+ text_roi = extract_object_with_gpt(instruction)
207
+ instruction_loc = f"<image>Please segment {text_roi}."
208
+ # (model, tokenizer, image_path, instruction, work_dir, dilate):
209
+ img = result_img
210
+ # print(f"Instruction: {instruction_loc}")
211
+
212
+ model, tokenizer = load_model("ByteDance/Sa2VA-8B")
213
+
214
+ result = model.predict_forward(
215
+ image=img,
216
+ text=instruction_loc,
217
+ tokenizer=tokenizer,
218
+ )
219
+
220
+ prediction = result['prediction']
221
+ # print(f"Model Output: {prediction}")
222
+
223
+ if '[SEG]' in prediction and 'prediction_masks' in result:
224
+ pred_mask = result['prediction_masks'][0]
225
+ pred_mask_np = np.squeeze(np.array(pred_mask))
226
+
227
+ ## obtain region bbox
228
+ rows = np.any(pred_mask_np, axis=1)
229
+ cols = np.any(pred_mask_np, axis=0)
230
+ if not np.any(rows) or not np.any(cols):
231
+ print("Warning: Mask is empty, cannot compute bounding box")
232
+ return img
233
+
234
+ y0, y1 = np.where(rows)[0][[0, -1]]
235
+ x0, x1 = np.where(cols)[0][[0, -1]]
236
+
237
+ changed_instance = crop_masked_region(result_img, pred_mask_np)
238
+
239
+ return changed_instance, x0, y1, 1
240
+
241
+
242
+ return result_img
243
+
244
+ def load_model(model_path):
245
+ model = AutoModelForCausalLM.from_pretrained(
246
+ model_path,
247
+ torch_dtype="auto",
248
+ device_map="auto",
249
+ trust_remote_code=True
250
+ ).eval()
251
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
252
+ return model, tokenizer
253
+
254
+ def extract_object_with_gpt(instruction):
255
+ system_prompt = (
256
+ "You are a helpful assistant that extracts the object or target being edited in an image editing instruction. "
257
+ "Only return a concise noun phrase describing the object. "
258
+ "Examples:\n"
259
+ "- Input: 'Remove the dog' → Output: 'the dog'\n"
260
+ "- Input: 'Add a hat on the dog' → Output: 'a hat'\n"
261
+ "- Input: 'Replace the biggest bear with a tiger' → Output: 'the biggest bear'\n"
262
+ "- Input: 'Change the action of the girl to riding' → Output: 'the girl'\n"
263
+ "- Input: 'Move the red car on the lake' → Output: 'the red car'\n"
264
+ "- Input: 'Minify the carrot on the rabbit's hand' → Output: 'the carrot on the rabbit's hand'\n"
265
+ "- Input: 'Swap the location of the dog and the cat' → Output: 'the dog and the cat'\n"
266
+ "Now extract the object for this instruction:"
267
+ )
268
+
269
+ try:
270
+ response = openai.ChatCompletion.create(
271
+ model="gpt-3.5-turbo",
272
+ messages=[
273
+ {"role": "system", "content": system_prompt},
274
+ {"role": "user", "content": instruction}
275
+ ],
276
+ temperature=0.2,
277
+ max_tokens=20,
278
+ )
279
+ object_phrase = response.choices[0].message['content'].strip().strip('"')
280
+ print(f"Identified object: {object_phrase}")
281
+ return object_phrase
282
+ except Exception as e:
283
+ print(f"GPT extraction failed: {e}")
284
+ return instruction
285
+
286
+ def extract_region_with_gpt(instruction):
287
+ system_prompt = (
288
+ "You are a helpful assistant that extracts target region being edited in an image editing instruction. "
289
+ "Only return a concise noun phrase describing the target region. "
290
+ "Examples:\n"
291
+ "- Input: 'Add a red hat to the man on the left' → Output: 'the man on the left'\n"
292
+ "- Input: 'Add a cat beside the dog' → Output: 'the dog'\n"
293
+ "Now extract the target region for this instruction:"
294
+ )
295
+
296
+ try:
297
+ response = openai.ChatCompletion.create(
298
+ model="gpt-3.5-turbo",
299
+ messages=[
300
+ {"role": "system", "content": system_prompt},
301
+ {"role": "user", "content": instruction}
302
+ ],
303
+ temperature=0.2,
304
+ max_tokens=20,
305
+ )
306
+ object_phrase = response.choices[0].message['content'].strip().strip('"')
307
+ # print(f"Identified object: {object_phrase}")
308
+ return object_phrase
309
+ except Exception as e:
310
+ print(f"GPT extraction failed: {e}")
311
+ return instruction
312
+
313
+ def get_masked(mask, image):
314
+ if mask.shape[:2] != image.size[::-1]:
315
+ raise ValueError(f"Mask size {mask.shape[:2]} does not match image size {image.size}")
316
+
317
+ image_array = np.array(image)
318
+ image_array[mask] = [0, 0, 0]
319
+
320
+ return Image.fromarray(image_array)
321
+
322
+ def bbox_to_mask(x0, y0, x1, y1, image_shape=(512, 512), fill_value=True):
323
+ height, width = image_shape
324
+
325
+ mask = np.zeros((height, width), dtype=bool)
326
+
327
+ x0 = max(0, int(x0))
328
+ y0 = max(0, int(y0))
329
+ x1 = min(width, int(x1))
330
+ y1 = min(height, int(y1))
331
+
332
+ if x0 >= x1 or y0 >= y1:
333
+ print("Warning: Invalid bounding box coordinates")
334
+ return mask
335
+
336
+ mask[y0:y1, x0:x1] = fill_value
337
+
338
+ return mask
339
+
340
+ def combine_bbox(text, x0, y0, x1, y1):
341
+ bbox = [x0, y0, x1, y1]
342
+ return [(text, bbox)]
343
+
344
+ def crop_masked_region(image, pred_mask_np):
345
+ if not isinstance(image, Image.Image):
346
+ raise ValueError("The input image is not a PIL Image object")
347
+ if not isinstance(pred_mask_np, np.ndarray) or pred_mask_np.dtype != bool:
348
+ raise ValueError("pred_mask_np must be a NumPy array of boolean type")
349
+ if pred_mask_np.shape[:2] != image.size[::-1]:
350
+ raise ValueError(f"Mask size {pred_mask_np.shape[:2]} does not match image size {image.size}")
351
+
352
+ image_rgba = image.convert("RGBA")
353
+ image_array = np.array(image_rgba)
354
+
355
+ rows = np.any(pred_mask_np, axis=1)
356
+ cols = np.any(pred_mask_np, axis=0)
357
+
358
+ if not np.any(rows) or not np.any(cols):
359
+ print("Warning: Mask is empty, cannot compute bounding box")
360
+ return image_rgba
361
+
362
+ y0, y1 = np.where(rows)[0][[0, -1]]
363
+ x0, x1 = np.where(cols)[0][[0, -1]]
364
+
365
+ cropped_image = image_array[y0:y1+1, x0:x1+1].copy()
366
+ cropped_mask = pred_mask_np[y0:y1+1, x0:x1+1]
367
+
368
+ alpha_channel = np.ones(cropped_mask.shape, dtype=np.uint8) * 255
369
+ alpha_channel[~cropped_mask] = 0
370
+
371
+ cropped_image[:, :, 3] = alpha_channel
372
+
373
+ return Image.fromarray(cropped_image, mode='RGBA')
374
+
375
+ def roi_localization(image, instruction, category): # add, remove, replace, action change, move, resize
376
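+ # Localize the region of interest with Sa2VA segmentation. Returns a masked image for
+ # inpainting-style edits, a (masked_img, changed_instance, x0_new, y1_new, scale) tuple
+ # for Move/Resize, or None if no mask is found.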
+ model, tokenizer = load_model("ByteDance/Sa2VA-8B")
377
+ if category == 'Add':
378
+ text_roi = extract_region_with_gpt(instruction)
379
+ else:
380
+ text_roi = extract_object_with_gpt(instruction)
381
+ instruction_loc = f"<image>Please segment {text_roi}."
382
+ img = Image.open(image).convert('RGB').resize((512, 512))
383
+ print(f"Processing image: {os.path.basename(image)}, Instruction: {instruction_loc}")
384
+
385
+ result = model.predict_forward(
386
+ image=img,
387
+ text=instruction_loc,
388
+ tokenizer=tokenizer,
389
+ )
390
+
391
+ prediction = result['prediction']
392
+ # print(f"Model Output: {prediction}")
393
+
394
+ if '[SEG]' in prediction and 'prediction_masks' in result:
395
+ pred_mask = result['prediction_masks'][0]
396
+ pred_mask_np = np.squeeze(np.array(pred_mask))
397
+ if category == 'Add':
398
+ ## obtain region bbox
399
+ rows = np.any(pred_mask_np, axis=1)
400
+ cols = np.any(pred_mask_np, axis=0)
401
+ if not np.any(rows) or not np.any(cols):
402
+ print("Warning: Mask is empty, cannot compute bounding box")
403
+ return img
404
+
405
+ y0, y1 = np.where(rows)[0][[0, -1]]
406
+ x0, x1 = np.where(cols)[0][[0, -1]]
407
+
408
+ ## obtain inpainting bbox
409
+ bbox = combine_bbox(text_roi, x0, y0, x1, y1) #? multiple?
410
+ # print(bbox)
411
+ x0, y0, x1, y1 = layout_add(bbox, instruction)
412
+ mask = bbox_to_mask(x0, y0, x1, y1)
413
+ ## make it black
414
+ masked_img = get_masked(mask, img)
415
+ elif category == 'Move' or category == 'Resize':
416
+ dilated_original_mask = binary_dilation(pred_mask_np, iterations=3)
417
+ masked_img = get_masked(dilated_original_mask, img)
418
+ ## obtain region bbox
419
+ rows = np.any(pred_mask_np, axis=1)
420
+ cols = np.any(pred_mask_np, axis=0)
421
+ if not np.any(rows) or not np.any(cols):
422
+ print("Warning: Mask is empty, cannot compute bounding box")
423
+ return img
424
+
425
+ y0, y1 = np.where(rows)[0][[0, -1]]
426
+ x0, x1 = np.where(cols)[0][[0, -1]]
427
+
428
+ ## obtain inpainting bbox
429
+ bbox = combine_bbox(text_roi, x0, y0, x1, y1) #? multiple?
430
+ # print(bbox)
431
+ x0_new, y0_new, x1_new, y1_new = layout_change(bbox, instruction)
432
+ scale = (y1_new - y0_new) / (y1 - y0)
433
+ # print(scale)
434
+ changed_instance = crop_masked_region(img, pred_mask_np)
435
+
436
+ return masked_img, changed_instance, x0_new, y1_new, scale
437
+ else:
438
+ dilated_original_mask = binary_dilation(pred_mask_np, iterations=3)
439
+ masked_img = get_masked(dilated_original_mask, img)
440
+
441
+ return masked_img
442
+
443
+ else:
444
+ print("No valid mask found in the prediction.")
445
+ return None
446
+
447
+ def fusion(background, foreground, x, y, scale):
448
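+ # Composite the foreground onto the background: scale the foreground, place it with its
+ # left edge at x and its bottom edge at y, then blend via process_edge and alpha compositing.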
+ background = background.convert("RGBA")
449
+ bg_width, bg_height = background.size
450
+
451
+ fg_width, fg_height = foreground.size
452
+ new_size = (int(fg_width * scale), int(fg_height * scale))
453
+ foreground_resized = foreground.resize(new_size, Image.Resampling.LANCZOS)
454
+
455
+ left = x
456
+ top = y - new_size[1]
457
+
458
+ canvas = Image.new('RGBA', (bg_width, bg_height), (0, 0, 0, 0))
459
+ canvas.paste(foreground_resized, (left, top), foreground_resized)
460
+ masked_foreground = process_edge(canvas, left, top, new_size)
461
+ result = Image.alpha_composite(background, masked_foreground)
462
+
463
+ return result
464
+
465
+ def process_edge(canvas, left, top, size):
466
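+ # Blacken a thin band around the pasted object (where the dilated alpha is opaque but the
+ # eroded alpha is not) so the seam can later be inpainted/blended.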
+ width, height = size
467
+
468
+ region = canvas.crop((left, top, left + width, top + height))
469
+ alpha = region.getchannel('A')
470
+
471
+ dilated_alpha = alpha.filter(ImageFilter.MaxFilter(5))
472
+ eroded_alpha = alpha.filter(ImageFilter.MinFilter(3))
473
+
474
+ edge_mask = Image.new('L', (width, height), 0)
475
+ edge_pixels = edge_mask.load()
476
+ dilated_pixels = dilated_alpha.load()
477
+ eroded_pixels = eroded_alpha.load()
478
+
479
+ for y in range(height):
480
+ for x in range(width):
481
+ if dilated_pixels[x, y] > 0 and eroded_pixels[x, y] == 0:
482
+ edge_pixels[x, y] = 255
483
+
484
+ black_edge = Image.new('RGBA', (width, height), (0, 0, 0, 0))
485
+ black_edge.putalpha(edge_mask)
486
+
487
+ canvas.paste(black_edge, (left, top), black_edge)
488
+
489
+ return canvas
490
+
491
+ def combine_text_and_bbox(text_roi, x0, y0, x1, y1):
492
+ return [(text_roi, [x0, y0, x1, y1])]
493
+
494
+ @retry(
495
+ reraise=True,
496
+ wait=wait_exponential(min=1, max=60),
497
+ stop=stop_after_attempt(6),
498
+ retry=retry_if_exception_type((openai.error.RateLimitError, openai.error.APIError))
499
+ )
500
+ def layout_add(bbox, instruction):
501
+ response = openai.ChatCompletion.create(
502
+ model="gpt-4o",
503
+ messages=[
504
+ {
505
+ "role": "user",
506
+ "content": [
507
+ {"type": "text", "text": f'''
508
+ You are an intelligent bounding box editor. I will provide you with the current bounding boxes and an add editing instruction.
509
+ Your task is to determine the new bounding box of the added object. Let's think step by step.
510
+ The images are of size 512x512. The top-left corner has coordinate [0, 0]. The bottom-right corner has coordinate [512, 512].
511
+ The bounding boxes should not go beyond the image boundaries. The new box must be large enough to reasonably encompass the added object in a visually appropriate way, allowing for partial overlap with existing objects when it comes to accessories like hats, necklaces, etc.
512
+ Each bounding box should be in the format of (object name,[top-left x coordinate, top-left y coordinate, bottom-right x coordinate, bottom-right y coordinate]).
513
+ Only return the bounding box of the newly added object. Do not include the existing bounding boxes.
514
+ Please consider the semantic information of the layout, preserve semantic relations.
515
+ If needed, you can make reasonable guesses. Please refer to the examples below:
516
+ Input bounding boxes: [('a green car', [21, 281, 232, 440])]
517
+ Editing instruction: Add a bird on the green car.
518
+ Output bounding boxes: [('a bird', [80, 150, 180, 281])]
519
+ Input bounding boxes: [('stool', [300, 350, 380, 450])]
520
+ Editing instruction: Add a cat to the left of the stool.
521
+ Output bounding boxes: [('a cat', [180, 250, 300, 450])]
522
+
523
+ Here are some examples to illustrate appropriate overlapping for better visual effects:
524
+ Input bounding boxes: [('the white cat', [200, 300, 320, 420])]
525
+ Editing instruction: Add a hat on the white cat.
526
+ Output bounding boxes: [('a hat', [200, 150, 320, 330])]
527
+ Now, the current bounding boxes is {bbox}, the instruction is {instruction}.
528
+ '''},
529
+ ],
530
+ }
531
+ ],
532
+ max_tokens=1000,
533
+ )
534
+
535
+ result = response.choices[0].message.content.strip()
536
+
537
+ bbox = extract_last_bbox(result)
538
+ return bbox
539
+
540
+ @retry(
541
+ reraise=True,
542
+ wait=wait_exponential(min=1, max=60),
543
+ stop=stop_after_attempt(6),
544
+ retry=retry_if_exception_type((openai.error.RateLimitError, openai.error.APIError))
545
+ )
546
+ def layout_change(bbox, instruction):
547
+ response = openai.ChatCompletion.create(
548
+ model="gpt-4o",
549
+ messages=[
550
+ {
551
+ "role": "user",
552
+ "content": [
553
+ {"type": "text", "text": f'''
554
+ You are an intelligent bounding box editor. I will provide you with the current bounding boxes and the editing instruction.
555
+ Your task is to generate the new bounding boxes after editing.
556
+ The images are of size 512x512. The top-left corner has coordinate [0, 0]. The bottom-right corner has coordinate [512, 512].
557
+ The bounding boxes should not overlap or go beyond the image boundaries.
558
+ Each bounding box should be in the format of (object name, [top-left x coordinate, top-left y coordinate, bottom-right x coordinate, bottom-right y coordinate]).
559
+ Do not add new objects or delete any object provided in the bounding boxes. Do not change the size or the shape of any object unless the instruction requires so.
560
+ Please consider the semantic information of the layout.
561
+ When resizing, keep the bottom-left corner fixed by default. When swapping locations, change according to the center point.
562
+ If needed, you can make reasonable guesses. Please refer to the examples below:
563
+
564
+ Input bounding boxes: [('a car', [21, 281, 232, 440])]
565
+ Editing instruction: Move the car to the right.
566
+ Output bounding boxes: [('a car', [121, 281, 332, 440])]
567
+
568
+ Input bounding boxes: [("bed", [50, 300, 450, 450]), ("pillow", [200, 200, 300, 230])]
569
+ Editing instruction: Move the pillow to the left side of the bed.
570
+ Output bounding boxes: [("bed", [50, 300, 450, 450]), ("pillow", [70, 270, 170, 300])]
571
+
572
+ Input bounding boxes: [("dog", [150, 250, 250, 300])]
573
+ Editing instruction: Enlarge the dog.
574
+ Output bounding boxes: [("dog", [150, 225, 300, 300])]
575
+
576
+ Input bounding boxes: [("chair", [100, 350, 200, 450]), ("lamp", [300, 200, 360, 300])]
577
+ Editing instruction: Swap the location of the chair and the lamp.
578
+ Output bounding boxes: [("chair", [280, 200, 380, 300]), ("lamp", [120, 350, 180, 450])]
579
+
580
+
581
+ Now, the current bounding boxes is {bbox}, the instruction is {instruction}. Let's think step by step, and output the edited layout.
582
+ '''},
583
+ ],
584
+ }
585
+ ],
586
+ max_tokens=1000,
587
+ )
588
+ result = response.choices[0].message.content.strip()
589
+
590
+ bbox = extract_last_bbox(result)
591
+ return bbox