lehduong committed on
Commit
038856e
1 Parent(s): 07d760c

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/cond_and_image.jpg filter=lfs diff=lfs merge=lfs -text
37
+ assets/onediffusion_appendix_faceid.jpg filter=lfs diff=lfs merge=lfs -text
38
+ assets/onediffusion_appendix_faceid_3.jpg filter=lfs diff=lfs merge=lfs -text
39
+ assets/onediffusion_appendix_multiview.jpg filter=lfs diff=lfs merge=lfs -text
40
+ assets/onediffusion_appendix_text2multiview.pdf filter=lfs diff=lfs merge=lfs -text
41
+ assets/onediffusion_zeroshot.jpg filter=lfs diff=lfs merge=lfs -text
42
+ assets/promptguide_complex.jpg filter=lfs diff=lfs merge=lfs -text
43
+ assets/teaser.png filter=lfs diff=lfs merge=lfs -text
44
+ assets/text2multiview.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,168 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
LICENSE ADDED
@@ -0,0 +1,407 @@
1
+ Attribution-NonCommercial 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. Copyright and Similar Rights means copyright and/or similar rights
88
+ closely related to copyright including, without limitation,
89
+ performance, broadcast, sound recording, and Sui Generis Database
90
+ Rights, without regard to how the rights are labeled or
91
+ categorized. For purposes of this Public License, the rights
92
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
93
+ Rights.
94
+ d. Effective Technological Measures means those measures that, in the
95
+ absence of proper authority, may not be circumvented under laws
96
+ fulfilling obligations under Article 11 of the WIPO Copyright
97
+ Treaty adopted on December 20, 1996, and/or similar international
98
+ agreements.
99
+
100
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
101
+ any other exception or limitation to Copyright and Similar Rights
102
+ that applies to Your use of the Licensed Material.
103
+
104
+ f. Licensed Material means the artistic or literary work, database,
105
+ or other material to which the Licensor applied this Public
106
+ License.
107
+
108
+ g. Licensed Rights means the rights granted to You subject to the
109
+ terms and conditions of this Public License, which are limited to
110
+ all Copyright and Similar Rights that apply to Your use of the
111
+ Licensed Material and that the Licensor has authority to license.
112
+
113
+ h. Licensor means the individual(s) or entity(ies) granting rights
114
+ under this Public License.
115
+
116
+ i. NonCommercial means not primarily intended for or directed towards
117
+ commercial advantage or monetary compensation. For purposes of
118
+ this Public License, the exchange of the Licensed Material for
119
+ other material subject to Copyright and Similar Rights by digital
120
+ file-sharing or similar means is NonCommercial provided there is
121
+ no payment of monetary compensation in connection with the
122
+ exchange.
123
+
124
+ j. Share means to provide material to the public by any means or
125
+ process that requires permission under the Licensed Rights, such
126
+ as reproduction, public display, public performance, distribution,
127
+ dissemination, communication, or importation, and to make material
128
+ available to the public including in ways that members of the
129
+ public may access the material from a place and at a time
130
+ individually chosen by them.
131
+
132
+ k. Sui Generis Database Rights means rights other than copyright
133
+ resulting from Directive 96/9/EC of the European Parliament and of
134
+ the Council of 11 March 1996 on the legal protection of databases,
135
+ as amended and/or succeeded, as well as other essentially
136
+ equivalent rights anywhere in the world.
137
+
138
+ l. You means the individual or entity exercising the Licensed Rights
139
+ under this Public License. Your has a corresponding meaning.
140
+
141
+
142
+ Section 2 -- Scope.
143
+
144
+ a. License grant.
145
+
146
+ 1. Subject to the terms and conditions of this Public License,
147
+ the Licensor hereby grants You a worldwide, royalty-free,
148
+ non-sublicensable, non-exclusive, irrevocable license to
149
+ exercise the Licensed Rights in the Licensed Material to:
150
+
151
+ a. reproduce and Share the Licensed Material, in whole or
152
+ in part, for NonCommercial purposes only; and
153
+
154
+ b. produce, reproduce, and Share Adapted Material for
155
+ NonCommercial purposes only.
156
+
157
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
158
+ Exceptions and Limitations apply to Your use, this Public
159
+ License does not apply, and You do not need to comply with
160
+ its terms and conditions.
161
+
162
+ 3. Term. The term of this Public License is specified in Section
163
+ 6(a).
164
+
165
+ 4. Media and formats; technical modifications allowed. The
166
+ Licensor authorizes You to exercise the Licensed Rights in
167
+ all media and formats whether now known or hereafter created,
168
+ and to make technical modifications necessary to do so. The
169
+ Licensor waives and/or agrees not to assert any right or
170
+ authority to forbid You from making technical modifications
171
+ necessary to exercise the Licensed Rights, including
172
+ technical modifications necessary to circumvent Effective
173
+ Technological Measures. For purposes of this Public License,
174
+ simply making modifications authorized by this Section 2(a)
175
+ (4) never produces Adapted Material.
176
+
177
+ 5. Downstream recipients.
178
+
179
+ a. Offer from the Licensor -- Licensed Material. Every
180
+ recipient of the Licensed Material automatically
181
+ receives an offer from the Licensor to exercise the
182
+ Licensed Rights under the terms and conditions of this
183
+ Public License.
184
+
185
+ b. No downstream restrictions. You may not offer or impose
186
+ any additional or different terms or conditions on, or
187
+ apply any Effective Technological Measures to, the
188
+ Licensed Material if doing so restricts exercise of the
189
+ Licensed Rights by any recipient of the Licensed
190
+ Material.
191
+
192
+ 6. No endorsement. Nothing in this Public License constitutes or
193
+ may be construed as permission to assert or imply that You
194
+ are, or that Your use of the Licensed Material is, connected
195
+ with, or sponsored, endorsed, or granted official status by,
196
+ the Licensor or others designated to receive attribution as
197
+ provided in Section 3(a)(1)(A)(i).
198
+
199
+ b. Other rights.
200
+
201
+ 1. Moral rights, such as the right of integrity, are not
202
+ licensed under this Public License, nor are publicity,
203
+ privacy, and/or other similar personality rights; however, to
204
+ the extent possible, the Licensor waives and/or agrees not to
205
+ assert any such rights held by the Licensor to the limited
206
+ extent necessary to allow You to exercise the Licensed
207
+ Rights, but not otherwise.
208
+
209
+ 2. Patent and trademark rights are not licensed under this
210
+ Public License.
211
+
212
+ 3. To the extent possible, the Licensor waives any right to
213
+ collect royalties from You for the exercise of the Licensed
214
+ Rights, whether directly or through a collecting society
215
+ under any voluntary or waivable statutory or compulsory
216
+ licensing scheme. In all other cases the Licensor expressly
217
+ reserves any right to collect such royalties, including when
218
+ the Licensed Material is used other than for NonCommercial
219
+ purposes.
220
+
221
+
222
+ Section 3 -- License Conditions.
223
+
224
+ Your exercise of the Licensed Rights is expressly made subject to the
225
+ following conditions.
226
+
227
+ a. Attribution.
228
+
229
+ 1. If You Share the Licensed Material (including in modified
230
+ form), You must:
231
+
232
+ a. retain the following if it is supplied by the Licensor
233
+ with the Licensed Material:
234
+
235
+ i. identification of the creator(s) of the Licensed
236
+ Material and any others designated to receive
237
+ attribution, in any reasonable manner requested by
238
+ the Licensor (including by pseudonym if
239
+ designated);
240
+
241
+ ii. a copyright notice;
242
+
243
+ iii. a notice that refers to this Public License;
244
+
245
+ iv. a notice that refers to the disclaimer of
246
+ warranties;
247
+
248
+ v. a URI or hyperlink to the Licensed Material to the
249
+ extent reasonably practicable;
250
+
251
+ b. indicate if You modified the Licensed Material and
252
+ retain an indication of any previous modifications; and
253
+
254
+ c. indicate the Licensed Material is licensed under this
255
+ Public License, and include the text of, or the URI or
256
+ hyperlink to, this Public License.
257
+
258
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
259
+ reasonable manner based on the medium, means, and context in
260
+ which You Share the Licensed Material. For example, it may be
261
+ reasonable to satisfy the conditions by providing a URI or
262
+ hyperlink to a resource that includes the required
263
+ information.
264
+
265
+ 3. If requested by the Licensor, You must remove any of the
266
+ information required by Section 3(a)(1)(A) to the extent
267
+ reasonably practicable.
268
+
269
+ 4. If You Share Adapted Material You produce, the Adapter's
270
+ License You apply must not prevent recipients of the Adapted
271
+ Material from complying with this Public License.
272
+
273
+
274
+ Section 4 -- Sui Generis Database Rights.
275
+
276
+ Where the Licensed Rights include Sui Generis Database Rights that
277
+ apply to Your use of the Licensed Material:
278
+
279
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
280
+ to extract, reuse, reproduce, and Share all or a substantial
281
+ portion of the contents of the database for NonCommercial purposes
282
+ only;
283
+
284
+ b. if You include all or a substantial portion of the database
285
+ contents in a database in which You have Sui Generis Database
286
+ Rights, then the database in which You have Sui Generis Database
287
+ Rights (but not its individual contents) is Adapted Material; and
288
+
289
+ c. You must comply with the conditions in Section 3(a) if You Share
290
+ all or a substantial portion of the contents of the database.
291
+
292
+ For the avoidance of doubt, this Section 4 supplements and does not
293
+ replace Your obligations under this Public License where the Licensed
294
+ Rights include other Copyright and Similar Rights.
295
+
296
+
297
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
298
+
299
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
300
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
301
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
302
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
303
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
304
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
305
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
306
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
307
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
308
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
309
+
310
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
311
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
312
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
313
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
314
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
315
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
316
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
317
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
318
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
319
+
320
+ c. The disclaimer of warranties and limitation of liability provided
321
+ above shall be interpreted in a manner that, to the extent
322
+ possible, most closely approximates an absolute disclaimer and
323
+ waiver of all liability.
324
+
325
+
326
+ Section 6 -- Term and Termination.
327
+
328
+ a. This Public License applies for the term of the Copyright and
329
+ Similar Rights licensed here. However, if You fail to comply with
330
+ this Public License, then Your rights under this Public License
331
+ terminate automatically.
332
+
333
+ b. Where Your right to use the Licensed Material has terminated under
334
+ Section 6(a), it reinstates:
335
+
336
+ 1. automatically as of the date the violation is cured, provided
337
+ it is cured within 30 days of Your discovery of the
338
+ violation; or
339
+
340
+ 2. upon express reinstatement by the Licensor.
341
+
342
+ For the avoidance of doubt, this Section 6(b) does not affect any
343
+ right the Licensor may have to seek remedies for Your violations
344
+ of this Public License.
345
+
346
+ c. For the avoidance of doubt, the Licensor may also offer the
347
+ Licensed Material under separate terms or conditions or stop
348
+ distributing the Licensed Material at any time; however, doing so
349
+ will not terminate this Public License.
350
+
351
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
352
+ License.
353
+
354
+
355
+ Section 7 -- Other Terms and Conditions.
356
+
357
+ a. The Licensor shall not be bound by any additional or different
358
+ terms or conditions communicated by You unless expressly agreed.
359
+
360
+ b. Any arrangements, understandings, or agreements regarding the
361
+ Licensed Material not stated herein are separate from and
362
+ independent of the terms and conditions of this Public License.
363
+
364
+
365
+ Section 8 -- Interpretation.
366
+
367
+ a. For the avoidance of doubt, this Public License does not, and
368
+ shall not be interpreted to, reduce, limit, restrict, or impose
369
+ conditions on any use of the Licensed Material that could lawfully
370
+ be made without permission under this Public License.
371
+
372
+ b. To the extent possible, if any provision of this Public License is
373
+ deemed unenforceable, it shall be automatically reformed to the
374
+ minimum extent necessary to make it enforceable. If the provision
375
+ cannot be reformed, it shall be severed from this Public License
376
+ without affecting the enforceability of the remaining terms and
377
+ conditions.
378
+
379
+ c. No term or condition of this Public License will be waived and no
380
+ failure to comply consented to unless expressly agreed to by the
381
+ Licensor.
382
+
383
+ d. Nothing in this Public License constitutes or may be interpreted
384
+ as a limitation upon, or waiver of, any privileges and immunities
385
+ that apply to the Licensor or You, including from the legal
386
+ processes of any jurisdiction or authority.
387
+
388
+ =======================================================================
389
+
390
+ Creative Commons is not a party to its public
391
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
392
+ its public licenses to material it publishes and in those instances
393
+ will be considered the “Licensor.” The text of the Creative Commons
394
+ public licenses is dedicated to the public domain under the CC0 Public
395
+ Domain Dedication. Except for the limited purpose of indicating that
396
+ material is shared under a Creative Commons public license or as
397
+ otherwise permitted by the Creative Commons policies published at
398
+ creativecommons.org/policies, Creative Commons does not authorize the
399
+ use of the trademark "Creative Commons" or any other trademark or logo
400
+ of Creative Commons without its prior written consent including,
401
+ without limitation, in connection with any unauthorized modifications
402
+ to any of its public licenses or any other arrangements,
403
+ understandings, or agreements concerning use of licensed material. For
404
+ the avoidance of doubt, this paragraph does not form part of the
405
+ public licenses.
406
+
407
+ Creative Commons may be contacted at creativecommons.org.
PROMPT_GUIDE.md ADDED
@@ -0,0 +1,91 @@
1
+ # Prompt Guide
2
+
3
+ All examples are generated with a CFG scale of $4.2$ and $50$ steps, and are not cherry-picked unless otherwise stated. The negative prompt is set to:
4
+ ```
5
+ monochrome, greyscale, low-res, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation
6
+ ```
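+
+ For reference, here is a minimal sketch (not part of the original guide) of how these defaults map onto the pipeline call shown in the repository README; the pipeline class and argument names are copied from the README's text-to-image example, and the prompt is only a placeholder:
+
+ ```
+ import torch
+ from onediffusion.diffusion.pipelines.onediffusion import OneDiffusionPipeline
+
+ # Abbreviated here; use the full negative prompt listed above.
+ NEGATIVE_PROMPT = "monochrome, greyscale, low-res, bad anatomy, ..."
+
+ pipeline = OneDiffusionPipeline.from_pretrained("lehduong/OneDiffusion").to(
+     device=torch.device("cuda:0"), dtype=torch.bfloat16
+ )
+ output = pipeline(
+     prompt="[[text2image]] insert/your/detailed/caption/here",  # placeholder prompt
+     negative_prompt=NEGATIVE_PROMPT,
+     num_inference_steps=50,  # steps used throughout this guide
+     guidance_scale=4.2,      # CFG scale used throughout this guide
+     height=1024,
+     width=1024,
+ )
+ output.images[0].save("output.jpg")
+ ```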
7
+
8
+ ## 1. Text-to-Image
9
+
10
+ ### 1.1 Long and detailed prompts give (much) better results.
11
+
12
+ Since our training data comprised long and detailed prompts, the model is more likely to generate better images when given detailed prompts.
13
+
14
+
15
+ The model shows good text adherence with long and complex prompts, as shown in the images below. We use the first $20$ prompts from [simoryu's examples](https://cloneofsimo.github.io/compare_aura_sd3/). For the detailed prompts and results from other models, refer to that link.
16
+
17
+ <p align="center">
18
+ <img src="assets/promptguide_complex.jpg" alt="Text-to-Image results" width="800">
19
+ </p>
20
+
21
+
22
+ ### 1.2 Resolution
23
+
24
+ For text-to-image, the model generally works well with heights and widths in the range $[768, 1280]$ (height and width must be divisible by 16). For other tasks, it performs best at resolutions around $512$.
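+
+ As a small, purely illustrative helper (the `snap16` function below is not part of the repository), you can snap a requested resolution to the nearest multiple of 16 before calling the pipeline:
+
+ ```
+ def snap16(x: int) -> int:
+     """Round x to the nearest multiple of 16, as required for height/width."""
+     return max(16, round(x / 16) * 16)
+
+ height, width = snap16(900), snap16(1200)  # -> 896, 1200
+ ```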
25
+
26
+ ## 2. ID Customization & Subject-driven generation
27
+
28
+ - The expected length of source captions is $30$ to $75$ words. Empirically, we find that longer prompts help preserve the ID better, but they might hinder text adherence to the target caption.
29
+
30
+ - We find it is better to add some descriptions (e.g., from the source caption) to the target caption to preserve the identity, especially for complex subjects with delicate details. An example of the expected prompt format is sketched below the figure.
31
+
32
+ <p align="center">
33
+ <img src="assets/promptguide_idtask.jpg" alt="ablation id task" width="800">
34
+ </p>
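+
+ For a concrete illustration, the prompt below follows the `[[faceid]]` format from Section 5.1; the captions are invented placeholders (in practice each source caption should be roughly $30$ to $75$ words):
+
+ ```
+ source_captions = [
+     "A close-up photo of a young woman with short dark hair ...",  # caption of source image 1
+     "A portrait of the same woman wearing a blue jacket ...",      # caption of source image 2
+ ]
+ target_caption = "A photo of the woman hiking on a mountain trail at sunrise"
+
+ # [[faceid]] [[img0]] <target caption> [[img1]] <source caption 1> [[img2]] <source caption 2> ...
+ prompt = "[[faceid]] [[img0]] " + target_caption + " " + " ".join(
+     f"[[img{i}]] {caption}" for i, caption in enumerate(source_captions, start=1)
+ )
+ ```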
35
+
36
+ ## 3. Multiview generation
37
+
38
+ To mitigate multi-faced/Janus problems, we recommend not using captions that describe facial features or gaze (e.g., "looking at the camera").
39
+
40
+ ## 4. Image editing
41
+
42
+ We find it is generally better to set the guidance scale to a lower value, e.g., $[3, 3.5]$, to avoid over-saturated results.
43
+
44
+ ## 5. Special tokens and available colors
45
+
46
+ ### 5.1 Task Tokens
47
+
48
+ | Task | Token | Additional Tokens |
49
+ |:---------------------|:---------------------------|:------------------|
50
+ | Text to Image | `[[text2image]]` | |
51
+ | Deblurring | `[[deblurring]]` | |
52
+ | Inpainting | `[[image_inpainting]]` | |
53
+ | Canny-edge and Image | `[[canny2image]]` | |
54
+ | Depth and Image | `[[depth2image]]` | |
55
+ | Hed and Image | `[[hed2img]]` | |
56
+ | Pose and Image | `[[pose2image]]` | |
57
+ | Image editing with Instruction | `[[image_editing]]` | |
58
+ | Semantic map and Image| `[[semanticmap2image]]` | `<#00FFFF cyan mask: object/to/segment>` |
59
+ | Boundingbox and Image | `[[boundingbox2image]]` | `<#00FFFF cyan boundingbox: object/to/detect>` |
60
+ | ID customization | `[[faceid]]` | `[[img0]] target/caption [[img1]] caption/of/source/image_1 [[img2]] caption/of/source/image_2 [[img3]] caption/of/source/image_3` |
61
+ | Multiview | `[[multiview]]` | |
62
+ | Subject-Driven | `[[subject_driven]]` | `<item: name/of/subject> [[img0]] target/caption/goes/here [[img1]] insert/source/caption` |
63
+
64
+
65
+ Note that you can replace the cyan color above with any color from the table below, and you can provide multiple additional tokens to detect/segment multiple classes.
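+
+ For example, a semantic-map prompt that segments two classes using two different colors from the table below could be assembled like this (the class names and caption are placeholders):
+
+ ```
+ caption = "a red sports car parked next to a golden retriever on a city street"
+ prompt = (
+     "[[semanticmap2image]] "
+     "<#00FFFF cyan mask: car> "
+     "<#FF0000 red mask: dog> "
+     + caption
+ )
+ ```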
66
+
67
+ ### 5.2 Available colors
68
+
69
+
70
+ | Hex Code | Color Name |
71
+ |:---------|:-----------|
72
+ | #FF0000 | <span style="color: #FF0000">red</span> |
73
+ | #00FF00 | <span style="color: #00FF00">lime</span> |
74
+ | #0000FF | <span style="color: #0000FF">blue</span> |
75
+ | #FFFF00 | <span style="color: #FFFF00">yellow</span> |
76
+ | #FF00FF | <span style="color: #FF00FF">magenta</span> |
77
+ | #00FFFF | <span style="color: #00FFFF">cyan</span> |
78
+ | #FFA500 | <span style="color: #FFA500">orange</span> |
79
+ | #800080 | <span style="color: #800080">purple</span> |
80
+ | #A52A2A | <span style="color: #A52A2A">brown</span> |
81
+ | #008000 | <span style="color: #008000">green</span> |
82
+ | #FFC0CB | <span style="color: #FFC0CB">pink</span> |
83
+ | #008080 | <span style="color: #008080">teal</span> |
84
+ | #FF8C00 | <span style="color: #FF8C00">darkorange</span> |
85
+ | #8A2BE2 | <span style="color: #8A2BE2">blueviolet</span> |
86
+ | #006400 | <span style="color: #006400">darkgreen</span> |
87
+ | #FF4500 | <span style="color: #FF4500">orangered</span> |
88
+ | #000080 | <span style="color: #000080">navy</span> |
89
+ | #FFD700 | <span style="color: #FFD700">gold</span> |
90
+ | #40E0D0 | <span style="color: #40E0D0">turquoise</span> |
91
+ | #DA70D6 | <span style="color: #DA70D6">orchid</span> |
README.md CHANGED
@@ -1,14 +1,169 @@
1
- ---
2
- title: OneDiffusion Space
3
- emoji: 🖼
4
- colorFrom: purple
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 5.0.1
8
- app_file: app.py
9
- pinned: false
10
- license: cc-by-nc-4.0
11
- short_description: demo for onediffusion
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # One Diffusion to Generate Them All
2
+
3
+ <p align="left">
4
+ <a href="https://lehduong.github.io/OneDiffusion-homepage/">
5
+ <img alt="Build" src="https://img.shields.io/badge/Project%20Page-OneDiffusion-yellow">
6
+ </a>
7
+ <a href="https://arxiv.org/abs/2411.16318">
8
+ <img alt="Build" src="https://img.shields.io/badge/arXiv%20paper-2411.16318-b31b1b.svg">
9
+ </a>
10
+ <a href="https://huggingface.co/spaces/lehduong/OneDiffusion">
11
+ <img alt="License" src="https://img.shields.io/badge/HF%20Demo-🤗-lightblue">
12
+ </a>
13
+ <a href="https://huggingface.co/lehduong/OneDiffusion">
14
+ <img alt="Build" src="https://img.shields.io/badge/HF%20Model-🤗-yellow">
15
+ </a>
16
+ </p>
17
+
18
+ <h4 align="left">
19
+ <p>
20
+ <a href=#news>News</a> |
21
+ <a href=#quick-start>Quick start</a> |
22
+ <a href=https://github.com/lehduong/OneDiffusion/blob/main/PROMPT_GUIDE.md>Prompt guide & Supported tasks </a> |
23
+ <a href=#qualitative-results>Qualitative results</a> |
24
+ <a href="#license">License</a> |
25
+ <a href="#citation">Citation</a>
26
+ </p>
27
+ </h4>
28
+
29
+
30
+ <p align="center">
31
+ <img src="assets/teaser.png" alt="Teaser Image" width="800">
32
+ </p>
33
+
34
+
35
+ This is the official repo of OneDiffusion, a versatile, large-scale diffusion model that seamlessly supports bidirectional image synthesis and understanding across diverse tasks.
36
+
37
+ ## News
38
+ - 📦 2024/12/10: Released model weights.
39
+ - 📝 2024/12/06: Added instruction-based image editing.
40
+ - ✨ 2024/12/02: Added subject-driven generation
41
+
42
+ ## Installation
43
+ ```
44
+ conda create -n onediffusion_env python=3.8 &&
45
+ conda activate onediffusion_env &&
46
+ pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118 &&
47
+ pip install "git+https://github.com/facebookresearch/pytorch3d.git" &&
48
+ pip install -r requirements.txt
49
+ ```
50
+
51
+ ## Quick start
52
+
53
+ Check `inference.py` for more details. For text-to-image, you can use the code snippet below.
54
+
55
+ ```
56
+ import torch
57
+ from onediffusion.diffusion.pipelines.onediffusion import OneDiffusionPipeline
58
+
59
+ device = torch.device('cuda:0')
60
+
61
+ pipeline = OneDiffusionPipeline.from_pretrained("lehduong/OneDiffusion").to(device=device, dtype=torch.bfloat16)
62
+
63
+ NEGATIVE_PROMPT = "monochrome, greyscale, low-res, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation"
64
+
65
+ output = pipeline(
66
+ prompt="[[text2image]] A bipedal black cat wearing a huge oversized witch hat, a wizards robe, casting a spell,in an enchanted forest. The scene is filled with fireflies and moss on surrounding rocks and trees",
67
+ negative_prompt=NEGATIVE_PROMPT,
68
+ num_inference_steps=50,
69
+ guidance_scale=4,
70
+ height=1024,
71
+ width=1024,
72
+ )
73
+ output.images[0].save('text2image_output.jpg')
74
+ ```
75
+
76
+ You can run the gradio demo with:
77
+ ```
78
+ python gradio_demo.py --captioner molmo # [molmo, llava, disable]
79
+ ```
80
+ The demo provides guidance and helps format the prompt properly for each task.
81
+ - By default, it loads the Molmo model for captioning source images, which significantly increases memory usage. You generally need a GPU with at least $40$ GB of memory to run the demo.
82
+ - Opting to use LLaVA can reduce this requirement to $\approx 27$ GB, though the resulting captions may be less accurate in some cases.
83
+ - You can also manually provide the caption for each input image and run with `disable` mode. In this mode, the returned caption is an empty string, but you should still press the `Generate Caption` button so that the code formats the input text properly. The memory requirement for this mode is $\approx 12$ GB.
84
+
85
+ Note that the memory requirements above can change if you use a higher resolution or more input images.
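+
+ If memory is tight, one option worth trying (untested here, and assuming `OneDiffusionPipeline` inherits from diffusers' `DiffusionPipeline`, which provides CPU offloading) is:
+
+ ```
+ import torch
+ from onediffusion.diffusion.pipelines.onediffusion import OneDiffusionPipeline
+
+ pipeline = OneDiffusionPipeline.from_pretrained("lehduong/OneDiffusion").to(dtype=torch.bfloat16)
+ # Keep submodules on CPU and move each to the GPU only while it runs;
+ # trades speed for lower peak GPU memory (assumption: inherited from diffusers).
+ pipeline.enable_model_cpu_offload()
+ ```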
86
+
87
+ ## Qualitative Results
88
+
89
+ ### 1. Text-to-Image
90
+ <p align="center">
91
+ <img src="assets/text2image.jpg" alt="Text-to-Image results" width="800">
92
+ </p>
93
+
94
+
95
+ ### 2. ID customization
96
+
97
+ <p align="center">
98
+ <img src="assets/onediffusion_appendix_faceid.jpg" alt="ID customization" width="800">
99
+ </p>
100
+
101
+ <p align="center">
102
+ <img src="assets/onediffusion_appendix_faceid_3.jpg" alt="ID customization non-human subject" width="800">
103
+ </p>
104
+
105
+ ### 3. Multiview generation
106
+
107
+ Single image to multiview:
108
+
109
+ <p align="center">
110
+ <img src="assets/onediffusion_appendix_multiview.jpg" alt="Image to multiview" width="800">
111
+ </p>
112
+
113
+ <p align="center">
114
+ <img src="assets/onediffusion_appendix_multiview_2.jpg" alt="image to multiview" width="800">
115
+ </p>
116
+
117
+ Text to multiview:
118
+
119
+ <p align="center">
120
+ <img src="assets/text2multiview.jpg" alt="Text to multiview image" width="800">
121
+ </p>
122
+
123
+ ### 4. Condition-to-Image and vice versa
124
+ <p align="center">
125
+ <img src="assets/cond_and_image.jpg" alt="Condition and Image" width="800">
126
+ </p>
127
+
128
+ ### 5. Subject-driven generation
129
+
130
+ We finetuned the model on the [Subjects200K](https://huggingface.co/datasets/Yuanshi/Subjects200K) dataset (along with all other tasks) for an additional 40K steps. The model is now capable of subject-driven generation.
131
+
132
+ <p align="center">
133
+ <img src="assets/subject_driven.jpg" alt="Subject driven generation" width="800">
134
+ </p>
135
+
136
+ ### 6. Text-guided image editing
137
+
138
+ We finetuned the model on the [OmniEdit](https://huggingface.co/datasets/TIGER-Lab/OmniEdit-Filtered-1.2M) dataset for an additional 30K steps.
139
+
140
+ <p align="center">
141
+ <img src="assets/onediffusion_editing.jpg" alt="Text-guide editing" width="800">
142
+ </p>
143
+
144
+ ### 7. Zero-shot Task combinations
145
+
146
+ We found that the model can handle multiple tasks in a zero-shot setting by combining condition images and task tokens without any fine-tuning, as shown in the examples below. However, its performance on these combined tasks might not be robust, and the model’s behavior may change if the order of task tokens or captions is altered. For example, when using both image inpainting and ID customization together, the target prompt and the caption of the masked image must be identical. If you plan to use such combinations, we recommend fine-tuning the model on these tasks to achieve better performance and simpler usage.
147
+
148
+
149
+ <p align="center">
150
+ <img src="assets/onediffusion_zeroshot.jpg" alt="Zero-shot task combinations" width="800">
151
+ </p>
152
+
153
+ ## License
154
+
155
+ The model is trained on several non-commercially licensed datasets (e.g., DL3DV, Unsplash); thus, the **model weights** are released under a CC BY-NC license, as described in [LICENSE](https://github.com/lehduong/onediffusion/blob/main/LICENSE).
156
+
157
+ ## Citation
158
+
159
+ ```bibtex
160
+ @misc{le2024diffusiongenerate,
161
+ title={One Diffusion to Generate Them All},
162
+ author={Duong H. Le and Tuan Pham and Sangho Lee and Christopher Clark and Aniruddha Kembhavi and Stephan Mandt and Ranjay Krishna and Jiasen Lu},
163
+ year={2024},
164
+ eprint={2411.16318},
165
+ archivePrefix={arXiv},
166
+ primaryClass={cs.CV},
167
+ url={https://arxiv.org/abs/2411.16318},
168
+ }
169
+ ```
assets/cond_and_image.jpg ADDED

Git LFS Details

  • SHA256: 6fcf6f6327d4a72a05dea636e7cacae6c2bdee4b61d7f583424c15f91e4bb903
  • Pointer size: 132 Bytes
  • Size of remote file: 1.26 MB
assets/examples/id_customization/chenhao/image_0.png ADDED
assets/examples/id_customization/chenhao/image_1.png ADDED
assets/examples/id_customization/chenhao/image_2.png ADDED
assets/onediffusion_appendix_faceid.jpg ADDED

Git LFS Details

  • SHA256: 8a04d050bf0d2b0f6ec13934f09387bf7d0dac82c32b0c1d808d6013e25cf6ec
  • Pointer size: 132 Bytes
  • Size of remote file: 1.75 MB
assets/onediffusion_appendix_faceid_3.jpg ADDED

Git LFS Details

  • SHA256: dd8a9a2bb587e4093cb9b9ab36d06675c0fbc91bbd8f1ff3c45e7cc0fb1d211e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.68 MB
assets/onediffusion_appendix_multiview.jpg ADDED

Git LFS Details

  • SHA256: 70026d6376c2d52ad268be1f5d2b7dc80fee716fa76b6c2aa611f105cbb76614
  • Pointer size: 132 Bytes
  • Size of remote file: 1.02 MB
assets/onediffusion_appendix_multiview_2.jpg ADDED
assets/onediffusion_appendix_text2multiview.pdf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60a945f7c1c92e823dbd3c7876c843d2668d16cb1ae883f2ba4080d324056225
3
+ size 8278287
assets/onediffusion_editing.jpg ADDED
assets/onediffusion_zeroshot.jpg ADDED

Git LFS Details

  • SHA256: a243196d8e6ca959357af24a71d183e15ebb90910ef0deca56af70ebe59a83f3
  • Pointer size: 132 Bytes
  • Size of remote file: 1.99 MB
assets/promptguide_complex.jpg ADDED

Git LFS Details

  • SHA256: e8e338d97b8e4f90b52e2fe5680f00a4538cdbe7c0423e07042bc1780aa94a51
  • Pointer size: 132 Bytes
  • Size of remote file: 2.05 MB
assets/promptguide_idtask.jpg ADDED
assets/subject_driven.jpg ADDED
assets/teaser.png ADDED

Git LFS Details

  • SHA256: 2c9ba3c39cdd6882d6c8172e45f71bef282d9439700c3adb6fa951ee394afedc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.78 MB
assets/text2image.jpg ADDED
assets/text2multiview.jpg ADDED

Git LFS Details

  • SHA256: 98bc67be460dd5cb1207be5ac1a7ae842fea0c65546a3c66b92b132aa4652cc4
  • Pointer size: 132 Bytes
  • Size of remote file: 1.37 MB
docker/Dockerfile ADDED
@@ -0,0 +1,119 @@
1
+ # Inspired by https://github.com/anibali/docker-pytorch/blob/master/dockerfiles/1.10.0-cuda11.3-ubuntu20.04/Dockerfile
2
+ # ARG COMPAT=0
3
+ ARG PERSONAL=0
4
+ # FROM nvidia/cuda:11.3.1-devel-ubuntu20.04 as base-0
5
+ FROM nvcr.io/nvidia/pytorch:22.12-py3 as base
6
+
7
+ ENV HOST docker
8
+ ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
9
+ # https://serverfault.com/questions/683605/docker-container-time-timezone-will-not-reflect-changes
10
+ ENV TZ America/Los_Angeles
11
+ RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
12
+
13
+ # git for installing dependencies
14
+ # tzdata to set time zone
15
+ # wget and unzip to download data
16
+ # [2021-09-09] TD: zsh, stow, subversion, fasd are for setting up my personal environment.
17
+ # [2021-12-07] TD: openmpi-bin for MPI (multi-node training)
18
+ RUN apt-get update && apt-get install -y --no-install-recommends \
19
+ build-essential \
20
+ cmake \
21
+ curl \
22
+ ca-certificates \
23
+ sudo \
24
+ less \
25
+ htop \
26
+ git \
27
+ tzdata \
28
+ wget \
29
+ tmux \
30
+ zip \
31
+ unzip \
32
+ zsh stow subversion fasd \
33
+ && rm -rf /var/lib/apt/lists/*
34
+ # openmpi-bin \
35
+
36
+ # Allow running mpirun as root
37
+ # ENV OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
38
+
39
+ # # Create a non-root user and switch to it
40
+ # RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
41
+ # && echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
42
+ # USER user
43
+
44
+ # All users can use /home/user as their home directory
45
+ ENV HOME=/home/user
46
+ RUN mkdir -p /home/user && chmod 777 /home/user
47
+ WORKDIR /home/user
48
+
49
+ # Set up personal environment
50
+ # FROM base-${COMPAT} as env-0
51
+ FROM base as env-0
52
+ FROM env-0 as env-1
53
+ # Use ONBUILD so that the dotfiles dir doesn't need to exist unless we're building a personal image
54
+ # https://stackoverflow.com/questions/31528384/conditional-copy-add-in-dockerfile
55
+ ONBUILD COPY dotfiles ./dotfiles
56
+ ONBUILD RUN cd ~/dotfiles && stow bash zsh tmux && sudo chsh -s /usr/bin/zsh $(whoami)
57
+ # nvcr pytorch image sets SHELL=/bin/bash
58
+ ONBUILD ENV SHELL=/bin/zsh
59
+
60
+ FROM env-${PERSONAL} as packages
61
+
62
+ # Disable pip cache: https://stackoverflow.com/questions/45594707/what-is-pips-no-cache-dir-good-for
63
+ ENV PIP_NO_CACHE_DIR=1
64
+
65
+ # # apex and pytorch-fast-transformers take a while to compile so we install them first
66
+ # TD [2022-04-28] apex is already installed. In case we need a newer commit:
67
+ # RUN pip install --upgrade --force-reinstall --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_multihead_attn" --global-option="--fmha" --global-option="--fast_layer_norm" --global-option="--xentropy" git+https://github.com/NVIDIA/apex.git#egg=apex
68
+
69
+ # xgboost conflicts with deepspeed
70
+ RUN pip uninstall -y xgboost && DS_BUILD_UTILS=1 DS_BUILD_FUSED_LAMB=1 pip install deepspeed==0.7.7
71
+
72
+ # General packages that we don't care about the version
73
+ # zstandard to extract the_pile dataset
74
+ # psutil to get the number of cpu physical cores
75
+ # twine to upload package to PyPI
76
+ RUN pip install pytest matplotlib jupyter ipython ipdb gpustat scikit-learn spacy munch einops opt_einsum fvcore gsutil cmake pykeops zstandard psutil h5py twine gdown \
77
+ && python -m spacy download en_core_web_sm
78
+ # hydra
79
+ RUN pip install hydra-core==1.3.1 hydra-colorlog==1.2.0 hydra-optuna-sweeper==1.2.0 pyrootutils rich
80
+ # Core packages
81
+ RUN pip install transformers==4.45.2 datasets==3.0.1 pytorch-lightning==2.2.1 triton==2.3.1 wandb==0.16.3 controlnet_aux==0.0.9 timm==0.6.7 torchmetrics==1.3.2
82
+ # torchmetrics 0.11.0 broke hydra's instantiate
83
+
84
+ # For MLPerf
85
+ RUN pip install git+https://github.com/mlcommons/logging.git@2.1.0
86
+
87
+ RUN pip install accelerate==0.34.2
88
+
89
+ RUN pip install diffusers==0.30.3
90
+
91
+ RUN pip install deepspeed==0.15.2
92
+
93
+ RUN pip install sentencepiece==0.1.99
94
+
95
+ RUN pip install pillow==10.2.0
96
+
97
+ RUN pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118
98
+
99
+ # Install FlashAttention
100
+ RUN pip install flash-attn==2.6.3
101
+
102
+ # Install CUDA extensions for fused dense
103
+ RUN pip install git+https://github.com/Dao-AILab/flash-attention@v2.6.3#subdirectory=csrc/fused_dense_lib
104
+
105
+ RUN pip install jaxtyping mediapipe gradio
106
+
107
+ RUN pip install "git+https://github.com/facebookresearch/pytorch3d.git"
108
+
109
+ RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
110
+
111
+ RUN pip install opencv-python==4.5.5.64
112
+
113
+ RUN pip install opencv-python-headless==4.5.5.64
114
+
115
+ RUN pip install huggingface_hub==0.24
116
+
117
+ RUN pip install numpy==1.24.4
118
+
119
+
gradio_demo.py ADDED
@@ -0,0 +1,715 @@
1
+ import gradio as gr
2
+ import torch
3
+ import base64
4
+ import io
5
+ from PIL import Image
6
+ from transformers import (
7
+ LlavaNextProcessor, LlavaNextForConditionalGeneration,
8
+ T5EncoderModel, T5Tokenizer
9
+ )
10
+ from transformers import (
11
+ AutoProcessor, AutoModelForCausalLM, GenerationConfig,
12
+ T5EncoderModel, T5Tokenizer
13
+ )
14
+ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FlowMatchHeunDiscreteScheduler, FluxPipeline
15
+ from onediffusion.diffusion.pipelines.onediffusion import OneDiffusionPipeline
16
+ from onediffusion.models.denoiser.nextdit import NextDiT
17
+ from onediffusion.dataset.utils import get_closest_ratio, ASPECT_RATIO_512
18
+ from typing import List, Optional
19
+ import matplotlib
20
+ import numpy as np
21
+ import cv2
22
+ import argparse
23
+
24
+ # Task-specific tokens
25
+ TASK2SPECIAL_TOKENS = {
26
+ "text2image": "[[text2image]]",
27
+ "deblurring": "[[deblurring]]",
28
+ "inpainting": "[[image_inpainting]]",
29
+ "canny": "[[canny2image]]",
30
+ "depth2image": "[[depth2image]]",
31
+ "hed2image": "[[hed2img]]",
32
+ "pose2image": "[[pose2image]]",
33
+ "semanticmap2image": "[[semanticmap2image]]",
34
+ "boundingbox2image": "[[boundingbox2image]]",
35
+ "image_editing": "[[image_editing]]",
36
+ "faceid": "[[faceid]]",
37
+ "multiview": "[[multiview]]",
38
+ "subject_driven": "[[subject_driven]]"
39
+ }
40
+ NEGATIVE_PROMPT = "monochrome, greyscale, low-res, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation"
41
+
42
+
43
+ class LlavaCaptionProcessor:
44
+ def __init__(self):
45
+ model_name = "llava-hf/llama3-llava-next-8b-hf"
46
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
47
+ dtype = torch.float16 if torch.cuda.is_available() else torch.float32
48
+ self.processor = LlavaNextProcessor.from_pretrained(model_name)
49
+ self.model = LlavaNextForConditionalGeneration.from_pretrained(
50
+ model_name, torch_dtype=dtype, low_cpu_mem_usage=True
51
+ ).to(device)
52
+ self.SPECIAL_TOKENS = "assistant\n\n\n"
53
+
54
+ def generate_response(self, image: Image.Image, msg: str) -> str:
55
+ conversation = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": msg}]}]
56
+ with torch.no_grad():
57
+ prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
58
+ inputs = self.processor(prompt, image, return_tensors="pt").to(self.model.device)
59
+ output = self.model.generate(**inputs, max_new_tokens=250)
60
+ response = self.processor.decode(output[0], skip_special_tokens=True)
61
+ return response.split(msg)[-1].strip()[len(self.SPECIAL_TOKENS):]
62
+
63
+ def process(self, images: List[Image.Image], msg: str = None) -> List[str]:
64
+ if msg is None:
65
+ msg = f"Describe the contents of the photo in 150 words or fewer."
66
+ try:
67
+ return [self.generate_response(img, msg) for img in images]
68
+ except Exception as e:
69
+ print(f"Error in process: {str(e)}")
70
+ raise
71
+
72
+
73
+ class MolmoCaptionProcessor:
74
+ def __init__(self):
75
+ pretrained_model_name = 'allenai/Molmo-7B-O-0924'
76
+ self.processor = AutoProcessor.from_pretrained(
77
+ pretrained_model_name,
78
+ trust_remote_code=True,
79
+ torch_dtype='auto',
80
+ device_map='auto'
81
+ )
82
+ self.model = AutoModelForCausalLM.from_pretrained(
83
+ pretrained_model_name,
84
+ trust_remote_code=True,
85
+ torch_dtype='auto',
86
+ device_map='auto'
87
+ )
88
+
89
+ def generate_response(self, image: Image.Image, msg: str) -> str:
90
+ inputs = self.processor.process(
91
+ images=[image],
92
+ text=msg
93
+ )
94
+ # Move inputs to the correct device and make a batch of size 1
95
+ inputs = {k: v.to(self.model.device).unsqueeze(0) for k, v in inputs.items()}
96
+
97
+ # Generate output
98
+ output = self.model.generate_from_batch(
99
+ inputs,
100
+ GenerationConfig(max_new_tokens=250, stop_strings="<|endoftext|>"),
101
+ tokenizer=self.processor.tokenizer
102
+ )
103
+
104
+ # Only get generated tokens and decode them to text
105
+ generated_tokens = output[0, inputs['input_ids'].size(1):]
106
+ return self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
107
+
108
+
109
+ def process(self, images: List[Image.Image], msg: str = None) -> List[str]:
110
+ if msg is None:
111
+ msg = f"Describe the contents of the photo in 150 words or fewer."
112
+ try:
113
+ return [self.generate_response(img, msg) for img in images]
114
+ except Exception as e:
115
+ print(f"Error in process: {str(e)}")
116
+ raise
117
+
118
+
119
+ class PlaceHolderCaptionProcessor:
120
+ def __init__(self):
121
+ pass
122
+
123
+ def generate_response(self, image: Image.Image, msg: str) -> str:
124
+ return ""
125
+
126
+ def process(self, images: List[Image.Image], msg: str = None) -> List[str]:
127
+ return [""] * len(images)
128
+
129
+
130
+ def initialize_models(captioner_name):
131
+ device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
132
+ pipeline = OneDiffusionPipeline.from_pretrained("lehduong/OneDiffusion").to(device=device, dtype=torch.bfloat16)
133
+ if captioner_name == 'molmo':
134
+ captioner = MolmoCaptionProcessor()
135
+ elif captioner_name == 'llava':
136
+ captioner = LlavaCaptionProcessor()
137
+ else:
138
+ captioner = PlaceHolderCaptionProcessor()
139
+ return pipeline, captioner
140
+
141
+ def colorize_depth_maps(
142
+ depth_map, min_depth, max_depth, cmap="Spectral", valid_mask=None
143
+ ):
144
+ """
145
+ Colorize depth maps with reversed colors.
146
+ """
147
+ assert len(depth_map.shape) >= 2, "Invalid dimension"
148
+
149
+ if isinstance(depth_map, torch.Tensor):
150
+ depth = depth_map.detach().squeeze().numpy()
151
+ elif isinstance(depth_map, np.ndarray):
152
+ depth = depth_map.copy().squeeze()
153
+ # reshape to [ (B,) H, W ]
154
+ if depth.ndim < 3:
155
+ depth = depth[np.newaxis, :, :]
156
+
157
+ # Normalize depth values to [0, 1]
158
+ depth = ((depth - min_depth) / (max_depth - min_depth)).clip(0, 1)
159
+ # Invert the depth values to reverse the colors
160
+ depth = 1 - depth
161
+
162
+ # Use the colormap
163
+ cm = matplotlib.colormaps[cmap]
164
+ img_colored_np = cm(depth, bytes=False)[:, :, :, 0:3] # values from 0 to 1
165
+ img_colored_np = np.rollaxis(img_colored_np, 3, 1)
166
+
167
+ if valid_mask is not None:
168
+ if isinstance(depth_map, torch.Tensor):
169
+ valid_mask = valid_mask.detach().numpy()
170
+ valid_mask = valid_mask.squeeze() # [H, W] or [B, H, W]
171
+ if valid_mask.ndim < 3:
172
+ valid_mask = valid_mask[np.newaxis, np.newaxis, :, :]
173
+ else:
174
+ valid_mask = valid_mask[:, np.newaxis, :, :]
175
+ valid_mask = np.repeat(valid_mask, 3, axis=1)
176
+ img_colored_np[~valid_mask] = 0
177
+
178
+ if isinstance(depth_map, torch.Tensor):
179
+ img_colored = torch.from_numpy(img_colored_np).float()
180
+ elif isinstance(depth_map, np.ndarray):
181
+ img_colored = img_colored_np
182
+
183
+ return img_colored
184
+
185
+
186
+ def format_prompt(task_type: str, captions: List[str]) -> str:
187
+ if not captions:
188
+ return ""
189
+ if task_type == "faceid":
190
+ img_prompts = [f"[[img{i}]] {caption}" for i, caption in enumerate(captions, start=1)]
191
+ return f"[[faceid]] [[img0]] insert/your/caption/here {' '.join(img_prompts)}"
192
+ elif task_type == "image_editing":
193
+ return f"[[image_editing]] insert/your/instruction/here"
194
+ elif task_type == "semanticmap2image":
195
+ return f"[[semanticmap2image]] <#00ffff Cyan mask: insert/concept/to/segment/here> {captions[0]}"
196
+ elif task_type == "boundingbox2image":
197
+ return f"[[boundingbox2image]] <#00ffff Cyan boundingbox: insert/concept/to/segment/here> {captions[0]}"
198
+ elif task_type == "multiview":
199
+ img_prompts = captions[0]
200
+ return f"[[multiview]] {img_prompts}"
201
+ elif task_type == "subject_driven":
202
+ return f"[[subject_driven]] <item: insert/item/here> [[img0]] insert/your/target/caption/here [[img1]] {captions[0]}"
203
+ else:
204
+ return f"{TASK2SPECIAL_TOKENS[task_type]} {captions[0]}"
205
+
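For reference, a small sketch of the strings `format_prompt` assembles; the captions are made-up placeholders, and the exact token for the last call depends on the `TASK2SPECIAL_TOKENS` mapping defined earlier in this file:

```python
print(format_prompt("semanticmap2image", ["a city street at night"]))
# -> "[[semanticmap2image]] <#00ffff Cyan mask: insert/concept/to/segment/here> a city street at night"

print(format_prompt("subject_driven", ["a plush dinosaur on a desk"]))
# -> "[[subject_driven]] <item: insert/item/here> [[img0]] insert/your/target/caption/here [[img1]] a plush dinosaur on a desk"

print(format_prompt("depth2image", ["a red car on a wet street"]))
# -> f"{TASK2SPECIAL_TOKENS['depth2image']} a red car on a wet street"  (token presumably "[[depth2image]]")
```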
206
+ def update_prompt(images: List[Image.Image], task_type: str, custom_msg: str = None):
207
+ if not images:
208
+ return format_prompt(task_type, []), "Please upload at least one image!"
209
+ try:
210
+ captions = captioner.process(images, custom_msg)
211
+ if not captions:
212
+ return "", "No valid images found!"
213
+ prompt = format_prompt(task_type, captions)
214
+ return prompt, f"Generated {len(captions)} captions successfully!"
215
+ except Exception as e:
216
+ return "", f"Error generating captions: {str(e)}"
217
+
218
+ def generate_image(images: List[Image.Image], prompt: str, negative_prompt: str, num_inference_steps: int, guidance_scale: float,
219
+ denoise_mask: List[str], task_type: str, azimuth: str, elevation: str, distance: str, focal_length: float,
220
+ height: int = 1024, width: int = 1024, scale_factor: float = 1.0, scale_watershed: float = 1.0,
221
+ noise_scale: float = None, progress=gr.Progress()):
222
+ try:
223
+ img2img_kwargs = {
224
+ 'prompt': prompt,
225
+ 'negative_prompt': negative_prompt,
226
+ 'num_inference_steps': num_inference_steps,
227
+ 'guidance_scale': guidance_scale,
228
+ 'height': height,
229
+ 'width': width,
230
+ 'forward_kwargs': {
231
+ 'scale_factor': scale_factor,
232
+ 'scale_watershed': scale_watershed
233
+ },
234
+ 'noise_scale': noise_scale # Added noise_scale here
235
+ }
236
+
237
+ if task_type == 'multiview':
238
+ # Parse azimuth, elevation, and distance into lists, allowing 'None' values
239
+ azimuths = [float(a.strip()) if a.strip().lower() != 'none' else None for a in azimuth.split(',')] if azimuth else []
240
+ elevations = [float(e.strip()) if e.strip().lower() != 'none' else None for e in elevation.split(',')] if elevation else []
241
+ distances = [float(d.strip()) if d.strip().lower() != 'none' else None for d in distance.split(',')] if distance else []
242
+
243
+ num_views = max(len(images), len(azimuths), len(elevations), len(distances))
244
+ if num_views == 0:
245
+ return None, "At least one image or camera parameter must be provided."
246
+
247
+ total_components = []
248
+ for i in range(num_views):
249
+ total_components.append(f"image_{i}")
250
+ total_components.append(f"camera_pose_{i}")
251
+
252
+ denoise_mask_int = [1 if comp in denoise_mask else 0 for comp in total_components]
253
+
254
+ if len(denoise_mask_int) != len(total_components):
255
+ return None, f"Denoise mask length mismatch: expected {len(total_components)} components."
256
+
257
+ # Pad the input lists to num_views length
258
+ images_padded = list(images)  # do not pad the image list with None entries
259
+ azimuths_padded = azimuths + [None] * (num_views - len(azimuths))
260
+ elevations_padded = elevations + [None] * (num_views - len(elevations))
261
+ distances_padded = distances + [None] * (num_views - len(distances))
262
+
263
+ # Prepare values
264
+ img2img_kwargs.update({
265
+ 'image': images_padded,
266
+ 'multiview_azimuths': azimuths_padded,
267
+ 'multiview_elevations': elevations_padded,
268
+ 'multiview_distances': distances_padded,
269
+ 'multiview_focal_length': focal_length, # Pass focal_length here
270
+ 'is_multiview': True,
271
+ 'denoise_mask': denoise_mask_int,
272
+ # 'predict_camera_poses': True,
273
+ })
274
+ else:
275
+ total_components = ["image_0"] + [f"image_{i+1}" for i in range(len(images))]
276
+ denoise_mask_int = [1 if comp in denoise_mask else 0 for comp in total_components]
277
+ if len(denoise_mask_int) != len(total_components):
278
+ return None, f"Denoise mask length mismatch: expected {len(total_components)} components."
279
+
280
+ img2img_kwargs.update({
281
+ 'image': images,
282
+ 'denoise_mask': denoise_mask_int
283
+ })
284
+
285
+ progress(0, desc="Generating image...")
286
+ if task_type == 'text2image':
287
+ output = pipeline(
288
+ prompt=prompt,
289
+ negative_prompt=negative_prompt,
290
+ num_inference_steps=num_inference_steps,
291
+ guidance_scale=guidance_scale,
292
+ height=height,
293
+ width=width,
294
+ scale_factor=scale_factor,
295
+ scale_watershed=scale_watershed,
296
+ noise_scale=noise_scale # Added noise_scale here
297
+ )
298
+ else:
299
+ output = pipeline.img2img(**img2img_kwargs)
300
+ progress(1, desc="Done!")
301
+
302
+ # Process the output images if task is 'depth2image' and predicting depth
303
+ if task_type == 'depth2image' and denoise_mask_int[-1] == 1:
304
+ processed_images = []
305
+ for img in output.images:
306
+ depth_map = np.array(img.convert('L')) # Convert to grayscale numpy array
307
+ min_depth = depth_map.min()
308
+ max_depth = depth_map.max()
309
+ colorized = colorize_depth_maps(depth_map, min_depth, max_depth)[0]
310
+ colorized = np.transpose(colorized, (1, 2, 0))
311
+ colorized = (colorized * 255).astype(np.uint8)
312
+ img_colorized = Image.fromarray(colorized)
313
+ processed_images.append(img_colorized)
314
+ output_images = processed_images + output.images
315
+ elif task_type in ['boundingbox2image', 'semanticmap2image'] and denoise_mask_int == [0,1] and images:
316
+ # Interpolate between input and output images
317
+ processed_images = []
318
+ for input_img, output_img in zip(images, output.images):
319
+ input_img_resized = input_img.resize(output_img.size)
320
+ blended_img = Image.blend(input_img_resized, output_img, alpha=0.5)
321
+ processed_images.append(blended_img)
322
+ output_images = processed_images + output.images
323
+ else:
324
+ output_images = output.images
325
+
326
+ return output_images, "Generation completed successfully!"
327
+
328
+ except Exception as e:
329
+ return None, f"Error during generation: {str(e)}"
330
+
331
+ def update_denoise_checkboxes(images_state: List[Image.Image], task_type: str, azimuth: str, elevation: str, distance: str):
332
+ if task_type == 'multiview':
333
+ azimuths = [a.strip() for a in azimuth.split(',')] if azimuth else []
334
+ elevations = [e.strip() for e in elevation.split(',')] if elevation else []
335
+ distances = [d.strip() for d in distance.split(',')] if distance else []
336
+ images_len = len(images_state)
337
+
338
+ num_views = max(images_len, len(azimuths), len(elevations), len(distances))
339
+ if num_views == 0:
340
+ return gr.update(choices=[], value=[]), "Please provide at least one image or camera parameter."
341
+
342
+ # Pad lists to the same length
343
+ azimuths += ['None'] * (num_views - len(azimuths))
344
+ elevations += ['None'] * (num_views - len(elevations))
345
+ distances += ['None'] * (num_views - len(distances))
346
+ # Do not add None to images_state
347
+
348
+ labels = []
349
+ values = []
350
+ for i in range(num_views):
351
+ labels.append(f"image_{i}")
352
+ labels.append(f"camera_pose_{i}")
353
+
354
+ # Default behavior: condition on provided inputs, generate missing ones
355
+ if i >= images_len:
356
+ values.append(f"image_{i}")
357
+ if azimuths[i].lower() == 'none' or elevations[i].lower() == 'none' or distances[i].lower() == 'none':
358
+ values.append(f"camera_pose_{i}")
359
+
360
+ return gr.update(choices=labels, value=values)
361
+ else:
362
+ labels = ["image_0"] + [f"image_{i+1}" for i in range(len(images_state))]
363
+ values = ["image_0"]
364
+ return gr.update(choices=labels, value=values)
365
+
366
+ def apply_mask(images_state):
367
+ if len(images_state) < 2:
368
+ return None, "Please upload at least two images: first as the base image, second as the mask."
369
+ base_img = images_state[0]
370
+ mask_img = images_state[1]
371
+
372
+ # Convert images to arrays
373
+ base_arr = np.array(base_img)
374
+ mask_arr = np.array(mask_img)
375
+
376
+ # Convert mask to grayscale
377
+ if mask_arr.ndim == 3:
378
+ gray_mask = cv2.cvtColor(mask_arr, cv2.COLOR_RGB2GRAY)
379
+ else:
380
+ gray_mask = mask_arr
381
+
382
+ # Create a binary mask where non-black pixels are True
383
+ binary_mask = gray_mask > 10
384
+
385
+ # Define the gray color
386
+ gray_color = np.array([128, 128, 128], dtype=np.uint8)
387
+
388
+ # Apply gray color where mask is True
389
+ masked_arr = base_arr.copy()
390
+ masked_arr[binary_mask] = gray_color
391
+
392
+ masked_img = Image.fromarray(masked_arr)
393
+ return [masked_img], "Mask applied successfully!"
394
+
395
+ def process_images_for_task_type(images_state: List[Image.Image], task_type: str):
396
+ # No changes needed here since we are processing the output images
397
+ return images_state, images_state
398
+
399
+ with gr.Blocks(title="OneDiffusion Demo") as demo:
400
+ gr.Markdown("""
401
+ # OneDiffusion Demo
402
+
403
+ **Welcome to the OneDiffusion Demo!**
404
+
405
+ This application allows you to generate images based on your input prompts for various tasks. Here's how to use it:
406
+
407
+ 1. **Select Task Type**: Choose the type of task you want to perform from the "Task Type" dropdown menu.
408
+
409
+ 2. **Upload Images**: Drag and drop images directly onto the upload area, or click to select files from your device.
410
+
411
+ 3. **Generate Captions**: **If you upload any images**, click the "Generate Captions" button to generate descriptive captions for them (depending on the task). You can enter a custom message in the "Custom message for captioner" textbox, e.g., "caption in 30 words" instead of 50 words.
412
+
413
+ 4. **Configure Generation Settings**: Expand the "Advanced Configuration" section to adjust parameters like the number of inference steps, guidance scale, image size, and more.
414
+
415
+ 5. **Generate Images**: After setting your preferences, click the "Generate Image" button. The generated images will appear in the "Generated Images" gallery.
416
+
417
+ 6. **Manage Images**: Use the "Delete Selected Images" or "Delete All Images" buttons to remove unwanted images from the gallery.
418
+
419
+ **Notes**:
420
+ - Check out the [Prompt Guide](https://github.com/lehduong/OneDiffusion/blob/main/PROMPT_GUIDE.md).
421
+
422
+ - For text-to-image:
423
+ + simply enter your prompt in this format "[[text2image]] your/prompt/here" and press the "Generate Image" button.
424
+
425
+ - For boundingbox2image/semanticmap2image/inpainting etc. tasks:
426
+ + To perform condition-to-image tasks such as semantic map to image, follow the steps above.
427
+ + For image-to-condition tasks, e.g., image to depth, change the denoise_mask checkboxes before generating images: UNCHECK the image_0 box and CHECK the image_1 box.
428
+
429
+ - For FaceID tasks:
430
+ + Use 3 or 4 images if a single input image does not give satisfactory results.
431
+ + All images will be resized and center-cropped to the input height and width. Choose the height and width so that faces in the input images won't be cropped out.
432
+ + The model works best with close-up portraits (both input and output).
433
+ + If the model does not follow your text prompt, try using a shorter caption for the source image(s).
434
+ + If you have non-human subjects and do not get satisfactory results, try copying the part of the source images' captions that describes the subject's properties, e.g., a monster with red eyes, sharp teeth, etc.
435
+
436
+ - For Multiview generation:
437
+ + The input camera elevation/azimuth ALWAYS starts at $0$. If you want to generate images at azimuths 30, 60, 90 and elevations 10, 20, 30 (with respect to the input image), the correct input azimuths are `0, 30, 60, 90` and the input elevations are `0, 10, 20, 30`. The camera distances will be `1.5, 1.5, 1.5, 1.5`.
438
+ + Only square images are supported (ideally at 512x512 resolution).
439
+ + Ensure the numbers of elevations, azimuths, and distances are equal.
440
+ + The model generally works well for 2-5 views (including both input and generated images). Since the model is trained with 3 views at 512x512 resolution, you might try a scale_factor in [1.1, 1.5] and a scale_watershed in [100, 400] for better extrapolation.
441
+ + For better results:
442
+ 1) try increasing num_inference_steps to 75-100.
443
+ 2) avoid aggressive changes in target camera poses; for example, to generate a novel view at an azimuth of 180, simultaneously generate 4 views at azimuths of 45, 90, 135, 180.
444
+
445
+ Enjoy creating images with OneDiffusion!
446
+ """)
447
+
448
+ with gr.Row():
449
+ with gr.Column():
450
+ images_state = gr.State([])
451
+ selected_indices_state = gr.State([])
452
+
453
+ with gr.Row():
454
+ gallery = gr.Gallery(
455
+ label="Input Images",
456
+ show_label=True,
457
+ columns=2,
458
+ rows=2,
459
+ height="auto",
460
+ object_fit="contain"
461
+ )
462
+
463
+ # File upload component: accepts file paths from both drag-and-drop and click-to-upload.
464
+ file_output = gr.File(
465
+ file_count="multiple",
466
+ file_types=["image"],
467
+ label="Drag and drop images here or click to upload",
468
+ height=100,
469
+ scale=2,
470
+ type="filepath" # Add this parameter
471
+ )
472
+
473
+ with gr.Row():
474
+ delete_button = gr.Button("Delete Selected Images")
475
+ delete_all_button = gr.Button("Delete All Images")
476
+
477
+ task_type = gr.Dropdown(
478
+ choices=list(TASK2SPECIAL_TOKENS.keys()),
479
+ value="text2image",
480
+ label="Task Type"
481
+ )
482
+
483
+ captioning_message = gr.Textbox(
484
+ lines=2,
485
+ value="Describe the contents of the photo in 50 words.",
486
+ label="Custom message for captioner"
487
+ )
488
+
489
+ auto_caption_btn = gr.Button("Generate Captions")
490
+
491
+ with gr.Column():
492
+ prompt = gr.Textbox(
493
+ lines=3,
494
+ placeholder="Enter your prompt here or use auto-caption...",
495
+ label="Prompt"
496
+ )
497
+ negative_prompt = gr.Textbox(
498
+ lines=3,
499
+ value=NEGATIVE_PROMPT,
500
+ placeholder="Enter negative prompt here...",
501
+ label="Negative Prompt"
502
+ )
503
+ caption_status = gr.Textbox(label="Caption Status")
504
+
505
+ num_steps = gr.Slider(
506
+ minimum=1,
507
+ maximum=200,
508
+ value=50,
509
+ step=1,
510
+ label="Number of Inference Steps"
511
+ )
512
+ guidance_scale = gr.Slider(
513
+ minimum=0.1,
514
+ maximum=10.0,
515
+ value=4,
516
+ step=0.1,
517
+ label="Guidance Scale"
518
+ )
519
+ height = gr.Number(value=1024, label="Height")
520
+ width = gr.Number(value=1024, label="Width")
521
+
522
+ with gr.Accordion("Advanced Configuration", open=False):
523
+ with gr.Row():
524
+ denoise_mask_checkbox = gr.CheckboxGroup(
525
+ label="Denoise Mask",
526
+ choices=["image_0"],
527
+ value=["image_0"]
528
+ )
529
+ azimuth = gr.Textbox(
530
+ value="0",
531
+ label="Azimuths (degrees, comma-separated, 'None' for missing)"
532
+ )
533
+ elevation = gr.Textbox(
534
+ value="0",
535
+ label="Elevations (degrees, comma-separated, 'None' for missing)"
536
+ )
537
+ distance = gr.Textbox(
538
+ value="1.5",
539
+ label="Distances (comma-separated, 'None' for missing)"
540
+ )
541
+ focal_length = gr.Number(
542
+ value=1.3887,
543
+ label="Focal Length of camera for multiview generation"
544
+ )
545
+ scale_factor = gr.Number(value=1.0, label="Scale Factor")
546
+ scale_watershed = gr.Number(value=1.0, label="Scale Watershed")
547
+ noise_scale = gr.Number(value=1.0, label="Noise Scale") # Added noise_scale input
548
+
549
+ output_images = gr.Gallery(
550
+ label="Generated Images",
551
+ show_label=True,
552
+ columns=4,
553
+ rows=2,
554
+ height="auto",
555
+ object_fit="contain"
556
+ )
557
+
558
+ with gr.Column():
559
+ generate_btn = gr.Button("Generate Image")
560
+ # apply_mask_btn = gr.Button("Apply Mask")
561
+
562
+ status = gr.Textbox(label="Generation Status")
563
+
564
+ # Event Handlers
565
+ def update_gallery(files, images_state):
566
+ if not files:
567
+ return images_state, images_state
568
+
569
+ new_images = []
570
+ for file in files:
571
+ try:
572
+ # Handle both file paths and file objects
573
+ if isinstance(file, dict): # For drag and drop files
574
+ file = file['path']
575
+ elif hasattr(file, 'name'): # For uploaded files
576
+ file = file.name
577
+
578
+ img = Image.open(file).convert('RGB')
579
+ new_images.append(img)
580
+ except Exception as e:
581
+ print(f"Error loading image: {str(e)}")
582
+ continue
583
+
584
+ images_state.extend(new_images)
585
+ return images_state, images_state
586
+
587
+ def on_image_select(evt: gr.SelectData, selected_indices_state):
588
+ selected_indices = selected_indices_state or []
589
+ index = evt.index
590
+ if index in selected_indices:
591
+ selected_indices.remove(index)
592
+ else:
593
+ selected_indices.append(index)
594
+ return selected_indices
595
+
596
+ def delete_images(selected_indices, images_state):
597
+ updated_images = [img for i, img in enumerate(images_state) if i not in selected_indices]
598
+ selected_indices_state = []
599
+ return updated_images, updated_images, selected_indices_state
600
+
601
+ def delete_all_images(images_state):
602
+ updated_images = []
603
+ selected_indices_state = []
604
+ return updated_images, updated_images, selected_indices_state
605
+
606
+ def update_height_width(images_state):
607
+ if images_state:
608
+ closest_ar = get_closest_ratio(
609
+ height=images_state[0].size[1],
610
+ width=images_state[0].size[0],
611
+ ratios=ASPECT_RATIO_512
612
+ )
613
+ height_val, width_val = int(closest_ar[0][0]), int(closest_ar[0][1])
614
+ else:
615
+ height_val, width_val = 1024, 1024 # Default values
616
+ return gr.update(value=height_val), gr.update(value=width_val)
617
+
618
+ # Connect events
619
+ file_output.change(
620
+ fn=update_gallery,
621
+ inputs=[file_output, images_state],
622
+ outputs=[images_state, gallery]
623
+ ).then(
624
+ fn=update_height_width,
625
+ inputs=[images_state],
626
+ outputs=[height, width]
627
+ ).then(
628
+ fn=update_denoise_checkboxes,
629
+ inputs=[images_state, task_type, azimuth, elevation, distance],
630
+ outputs=[denoise_mask_checkbox]
631
+ )
632
+
633
+ gallery.select(
634
+ fn=on_image_select,
635
+ inputs=[selected_indices_state],
636
+ outputs=[selected_indices_state]
637
+ )
638
+
639
+ delete_button.click(
640
+ fn=delete_images,
641
+ inputs=[selected_indices_state, images_state],
642
+ outputs=[images_state, gallery, selected_indices_state]
643
+ ).then(
644
+ fn=update_denoise_checkboxes,
645
+ inputs=[images_state, task_type, azimuth, elevation, distance],
646
+ outputs=[denoise_mask_checkbox]
647
+ )
648
+
649
+ delete_all_button.click(
650
+ fn=delete_all_images,
651
+ inputs=[images_state],
652
+ outputs=[images_state, gallery, selected_indices_state]
653
+ ).then(
654
+ fn=update_denoise_checkboxes,
655
+ inputs=[images_state, task_type, azimuth, elevation, distance],
656
+ outputs=[denoise_mask_checkbox]
657
+ )
658
+
659
+ task_type.change(
660
+ fn=update_denoise_checkboxes,
661
+ inputs=[images_state, task_type, azimuth, elevation, distance],
662
+ outputs=[denoise_mask_checkbox]
663
+ )
664
+
665
+ azimuth.change(
666
+ fn=update_denoise_checkboxes,
667
+ inputs=[images_state, task_type, azimuth, elevation, distance],
668
+ outputs=[denoise_mask_checkbox]
669
+ )
670
+
671
+ elevation.change(
672
+ fn=update_denoise_checkboxes,
673
+ inputs=[images_state, task_type, azimuth, elevation, distance],
674
+ outputs=[denoise_mask_checkbox]
675
+ )
676
+
677
+ distance.change(
678
+ fn=update_denoise_checkboxes,
679
+ inputs=[images_state, task_type, azimuth, elevation, distance],
680
+ outputs=[denoise_mask_checkbox]
681
+ )
682
+
683
+ generate_btn.click(
684
+ fn=generate_image,
685
+ inputs=[
686
+ images_state, prompt, negative_prompt, num_steps, guidance_scale,
687
+ denoise_mask_checkbox, task_type, azimuth, elevation, distance,
688
+ focal_length, height, width, scale_factor, scale_watershed, noise_scale # Added noise_scale here
689
+ ],
690
+ outputs=[output_images, status],
691
+ concurrency_id="gpu_queue"
692
+ )
693
+
694
+ auto_caption_btn.click(
695
+ fn=update_prompt,
696
+ inputs=[images_state, task_type, captioning_message],
697
+ outputs=[prompt, caption_status],
698
+ concurrency_id="gpu_queue"
699
+ )
700
+
701
+ # apply_mask_btn.click(
702
+ # fn=apply_mask,
703
+ # inputs=[images_state],
704
+ # outputs=[output_images, status]
705
+ # )
706
+
707
+ if __name__ == "__main__":
708
+ parser = argparse.ArgumentParser(description='Start the Gradio demo with specified captioner.')
709
+ parser.add_argument('--captioner', type=str, choices=['molmo', 'llava', 'disable'], default='molmo', help='Captioner to use: molmo, llava, disable.')
710
+ args = parser.parse_args()
711
+
712
+ # Initialize models with the specified captioner
713
+ pipeline, captioner = initialize_models(args.captioner)
714
+
715
+ demo.launch(share=True)
inference.py ADDED
@@ -0,0 +1,37 @@
1
+ import torch
2
+ from onediffusion.diffusion.pipelines.onediffusion import OneDiffusionPipeline
3
+ from PIL import Image
4
+
5
+ device = torch.device('cuda:0')
6
+ pipeline = OneDiffusionPipeline.from_pretrained("lehduong/OneDiffusion").to(device=device, dtype=torch.bfloat16)
7
+
8
+ NEGATIVE_PROMPT = "monochrome, greyscale, low-res, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation"
9
+
10
+ ## Text-to-image
11
+ output = pipeline(
12
+ prompt="[[text2image]] A bipedal black cat wearing a huge oversized witch hat, a wizards robe, casting a spell,in an enchanted forest. The scene is filled with fireflies and moss on surrounding rocks and trees",
13
+ negative_prompt=NEGATIVE_PROMPT,
14
+ num_inference_steps=50,
15
+ guidance_scale=4,
16
+ height=1024,
17
+ width=1024,
18
+ )
19
+ output.images[0].save('text2image_output.jpg')
20
+
21
+ ## ID Customization
22
+ image = [
23
+ Image.open("assets/examples/id_customization/chenhao/image_0.png"),
24
+ Image.open("assets/examples/id_customization/chenhao/image_1.png"),
25
+ Image.open("assets/examples/id_customization/chenhao/image_2.png")
26
+ ]
27
+
28
+ # input = [noise, cond_1, cond_2, cond_3]
29
+ prompt = "[[faceid]] \
30
+ [[img0]] A woman dressed in traditional attire with intricate headpieces, posing gracefully with a serene expression. \
31
+ [[img1]] A woman with long dark hair, smiling warmly while wearing a floral dress. \
32
+ [[img2]] A woman in traditional clothing holding a lace parasol, with her hair styled elegantly. \
33
+ [[img3]] A woman in elaborate traditional attire and jewelry, with an ornate headdress, looking intently forward. \
34
+ "
35
+
36
+ ret = pipeline.img2img(image=image, num_inference_steps=75, prompt=prompt, denoise_mask=[1, 0, 0, 0], guidance_scale=4)
37
+ ret.images[0].save("idcustomization_output.jpg")
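The Gradio demo drives multiview generation through the same `img2img` entry point; below is a hedged sketch of single-image novel-view synthesis that mirrors the keyword arguments assembled in `app.py`'s `generate_image` (the camera values and the 512x512 size follow the demo notes, the input path is a placeholder, and the denoise mask interleaves one image/camera-pose pair per view):

```python
# Novel-view synthesis sketch: condition on one input view, denoise three target views.
cond = Image.open("assets/examples/your_input_view.png")  # placeholder path

ret = pipeline.img2img(
    image=[cond],
    prompt="[[multiview]] A photo of a toy robot on a wooden table.",
    negative_prompt=NEGATIVE_PROMPT,
    num_inference_steps=75,
    guidance_scale=4,
    height=512, width=512,
    multiview_azimuths=[0, 30, 60, 90],        # the input view is always azimuth/elevation 0
    multiview_elevations=[0, 0, 0, 0],
    multiview_distances=[1.5, 1.5, 1.5, 1.5],
    multiview_focal_length=1.3887,
    is_multiview=True,
    denoise_mask=[0, 0, 1, 0, 1, 0, 1, 0],     # keep image_0 and all camera poses fixed, denoise images 1-3
)
ret.images[0].save("multiview_output.jpg")
```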
onediffusion/dataset/multitask/multiview.py ADDED
@@ -0,0 +1,277 @@
1
+ import os
2
+ import json
3
+ import random
4
+ from PIL import Image
5
+ import torch
6
+ from typing import List, Tuple, Union
7
+ from torch.utils.data import Dataset
8
+ from torchvision import transforms
9
+ import torchvision.transforms as T
10
+ from onediffusion.dataset.utils import *
11
+ import glob
12
+
13
+ from onediffusion.dataset.raydiff_utils import cameras_to_rays, first_camera_transform, normalize_cameras
14
+ from onediffusion.dataset.transforms import CenterCropResizeImage
15
+ from pytorch3d.renderer import PerspectiveCameras
16
+
17
+ import numpy as np
18
+
19
+ def _cameras_from_opencv_projection(
20
+ R: torch.Tensor,
21
+ tvec: torch.Tensor,
22
+ camera_matrix: torch.Tensor,
23
+ image_size: torch.Tensor,
24
+ do_normalize_cameras,
25
+ normalize_scale,
26
+ ) -> PerspectiveCameras:
27
+ focal_length = torch.stack([camera_matrix[:, 0, 0], camera_matrix[:, 1, 1]], dim=-1)
28
+ principal_point = camera_matrix[:, :2, 2]
29
+
30
+ # Retype the image_size correctly and flip to width, height.
31
+ image_size_wh = image_size.to(R).flip(dims=(1,))
32
+
33
+ # Screen to NDC conversion:
34
+ # For non square images, we scale the points such that smallest side
35
+ # has range [-1, 1] and the largest side has range [-u, u], with u > 1.
36
+ # This convention is consistent with the PyTorch3D renderer, as well as
37
+ # the transformation function `get_ndc_to_screen_transform`.
38
+ scale = image_size_wh.to(R).min(dim=1, keepdim=True)[0] / 2.0
39
+ scale = scale.expand(-1, 2)
40
+ c0 = image_size_wh / 2.0
41
+
42
+ # Get the PyTorch3D focal length and principal point.
43
+ focal_pytorch3d = focal_length / scale
44
+ p0_pytorch3d = -(principal_point - c0) / scale
45
+
46
+ # For R, T we flip x, y axes (opencv screen space has an opposite
47
+ # orientation of screen axes).
48
+ # We also transpose R (opencv multiplies points from the opposite=left side).
49
+ R_pytorch3d = R.clone().permute(0, 2, 1)
50
+ T_pytorch3d = tvec.clone()
51
+ R_pytorch3d[:, :, :2] *= -1
52
+ T_pytorch3d[:, :2] *= -1
53
+
54
+ cams = PerspectiveCameras(
55
+ R=R_pytorch3d,
56
+ T=T_pytorch3d,
57
+ focal_length=focal_pytorch3d,
58
+ principal_point=p0_pytorch3d,
59
+ image_size=image_size,
60
+ device=R.device,
61
+ )
62
+
63
+ if do_normalize_cameras:
64
+ cams, _ = normalize_cameras(cams, scale=normalize_scale)
65
+
66
+ cams = first_camera_transform(cams, rotation_only=False)
67
+ return cams
68
+
69
+ def calculate_rays(Ks, sizes, Rs, Ts, target_size, use_plucker=True, do_normalize_cameras=False, normalize_scale=1.0):
70
+ cameras = _cameras_from_opencv_projection(
71
+ R=Rs,
72
+ tvec=Ts,
73
+ camera_matrix=Ks,
74
+ image_size=sizes,
75
+ do_normalize_cameras=do_normalize_cameras,
76
+ normalize_scale=normalize_scale
77
+ )
78
+
79
+ rays_embedding = cameras_to_rays(
80
+ cameras=cameras,
81
+ num_patches_x=target_size,
82
+ num_patches_y=target_size,
83
+ crop_parameters=None,
84
+ use_plucker=use_plucker
85
+ )
86
+
87
+ return rays_embedding.rays
88
+
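For reference, the rays returned here are (with `use_plucker=True`, the default) in Plücker form: a ray with origin $o$ and unit direction $d$ is encoded as the 6-vector

$$ r = (d,\; m), \qquad m = o \times d , $$

which is what `Rays.to_plucker` in `raydiff_utils.py` computes. `MultiviewDataset.__getitem__` below then tiles these 6 channels up to the VAE's 16 channels and rescales them.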
89
+ def convert_rgba_to_rgb_white_bg(image):
90
+ """Convert RGBA image to RGB with white background"""
91
+ if image.mode == 'RGBA':
92
+ # Create a white background
93
+ background = Image.new('RGBA', image.size, (255, 255, 255, 255))
94
+ # Composite the image onto the white background
95
+ return Image.alpha_composite(background, image).convert('RGB')
96
+ return image.convert('RGB')
97
+
98
+ class MultiviewDataset(Dataset):
99
+ def __init__(
100
+ self,
101
+ scene_folders: str,
102
+ samples_per_set: Union[int, Tuple[int, int]],  # single count or (min, max) range of views to sample per scene
103
+ transform=None,
104
+ caption_keys: Union[str, List] = "caption",
105
+ multiscale=False,
106
+ aspect_ratio_type=ASPECT_RATIO_512,
107
+ c2w_scaling=1.7,
108
+ default_max_distance=1,  # default max distance over all cameras of a scene
109
+ do_normalize=True, # whether normalize translation of c2w with max_distance
110
+ swap_xz=False,  # whether to swap the x and z axes of 3D scenes
111
+ valid_paths: str = "",
112
+ frame_sliding_windows: int = None  # restrict all sampled frames to a window of this size, so camera poses don't differ too much
113
+ ):
114
+ if not isinstance(samples_per_set, tuple) and not isinstance(samples_per_set, list):
115
+ samples_per_set = (samples_per_set, samples_per_set)
116
+ self.samples_range = samples_per_set # Tuple of (min_samples, max_samples)
117
+ self.transform = transform
118
+ self.caption_keys = caption_keys if isinstance(caption_keys, list) else [caption_keys]
119
+ self.aspect_ratio = aspect_ratio_type
120
+ self.scene_folders = sorted(glob.glob(scene_folders))
121
+ # filter out scene folders that do not have transforms.json
122
+ self.scene_folders = list(filter(lambda x: os.path.exists(os.path.join(x, "transforms.json")), self.scene_folders))
123
+
124
+ # if valid_paths.txt exists, only use paths in that file
125
+ if os.path.exists(valid_paths):
126
+ with open(valid_paths, 'r') as f:
127
+ valid_scene_folders = f.read().splitlines()
128
+ self.scene_folders = sorted(valid_scene_folders)
129
+
130
+ self.c2w_scaling = c2w_scaling
131
+ self.do_normalize = do_normalize
132
+ self.default_max_distance = default_max_distance
133
+ self.swap_xz = swap_xz
134
+ self.frame_sliding_windows = frame_sliding_windows
135
+
136
+ if multiscale:
137
+ assert self.aspect_ratio in [ASPECT_RATIO_512, ASPECT_RATIO_1024, ASPECT_RATIO_2048, ASPECT_RATIO_2880]
138
+ if self.aspect_ratio in [ASPECT_RATIO_2048, ASPECT_RATIO_2880]:
139
+ self.interpolate_model = T.InterpolationMode.LANCZOS
140
+ self.ratio_index = {}
141
+ self.ratio_nums = {}
142
+ for k, v in self.aspect_ratio.items():
143
+ self.ratio_index[float(k)] = [] # used for self.getitem
144
+ self.ratio_nums[float(k)] = 0 # used for batch-sampler
145
+
146
+ def __len__(self):
147
+ return len(self.scene_folders)
148
+
149
+ def __getitem__(self, idx):
150
+ try:
151
+ scene_path = self.scene_folders[idx]
152
+
153
+ if os.path.exists(os.path.join(scene_path, "images")):
154
+ image_folder = os.path.join(scene_path, "images")
155
+ downscale_factor = 1
156
+ elif os.path.exists(os.path.join(scene_path, "images_4")):
157
+ image_folder = os.path.join(scene_path, "images_4")
158
+ downscale_factor = 1 / 4
159
+ elif os.path.exists(os.path.join(scene_path, "images_8")):
160
+ image_folder = os.path.join(scene_path, "images_8")
161
+ downscale_factor = 1 / 8
162
+ else:
163
+ raise NotImplementedError
164
+
165
+ json_path = os.path.join(scene_path, "transforms.json")
166
+ caption_path = os.path.join(scene_path, "caption.json")
167
+ image_files = os.listdir(image_folder)
168
+
169
+ with open(json_path, 'r') as f:
170
+ json_data = json.load(f)
171
+ height, width = json_data['h'], json_data['w']
172
+
173
+ dh, dw = int(height * downscale_factor), int(width * downscale_factor)
174
+ fl_x, fl_y = json_data['fl_x'] * downscale_factor, json_data['fl_y'] * downscale_factor
175
+ cx = dw // 2
176
+ cy = dh // 2
177
+
178
+ frame_list = json_data['frames']
179
+
180
+ # Randomly select number of samples
181
+
182
+ samples_per_set = random.randint(self.samples_range[0], self.samples_range[1])
183
+
184
+ # uniformly for all scenes
185
+ if self.frame_sliding_windows is None:
186
+ selected_indices = random.sample(range(len(frame_list)), min(samples_per_set, len(frame_list)))
187
+ # limit the multiview to be in a sliding window (to avoid catastrophic difference in camera angles)
188
+ else:
189
+ # Determine the starting index of the sliding window
190
+ if len(frame_list) <= self.frame_sliding_windows:
191
+ # If the frame list is smaller than or equal to X, use the entire list
192
+ window_start = 0
193
+ window_end = len(frame_list)
194
+ else:
195
+ # Randomly select a starting point for the window
196
+ window_start = random.randint(0, len(frame_list) - self.frame_sliding_windows)
197
+ window_end = window_start + self.frame_sliding_windows
198
+
199
+ # Get the indices within the sliding window
200
+ window_indices = list(range(window_start, window_end))
201
+
202
+ # Randomly sample indices from the window
203
+ selected_indices = random.sample(window_indices, samples_per_set)
204
+
205
+ image_files = [os.path.basename(frame_list[i]['file_path']) for i in selected_indices]
206
+ image_paths = [os.path.join(image_folder, file) for file in image_files]
207
+
208
+ # Load images and convert RGBA to RGB with white background
209
+ images = [convert_rgba_to_rgb_white_bg(Image.open(image_path)) for image_path in image_paths]
210
+
211
+ if self.transform:
212
+ images = [self.transform(image) for image in images]
213
+ else:
214
+ closest_size, closest_ratio = self.aspect_ratio['1.0'], 1.0
215
+ closest_size = tuple(map(int, closest_size))
216
+ transform = T.Compose([
217
+ T.ToTensor(),
218
+ CenterCropResizeImage(closest_size),
219
+ T.Normalize([.5], [.5]),
220
+ ])
221
+ images = [transform(image) for image in images]
222
+ images = torch.stack(images)
223
+
224
+ c2ws = [frame_list[i]['transform_matrix'] for i in selected_indices]
225
+ c2ws = torch.tensor(c2ws).reshape(-1, 4, 4)
226
+ # max_distance = json_data.get('max_distance', self.default_max_distance)
227
+ # if 'max_distance' not in json_data.keys():
228
+ # print(f"not found `max_distance` in json path: {json_path}")
229
+
230
+ if self.swap_xz:
231
+ swap_xz = torch.tensor([[[0, 0, 1., 0],
232
+ [0, 1., 0, 0],
233
+ [-1., 0, 0, 0],
234
+ [0, 0, 0, 1.]]])
235
+ c2ws = swap_xz @ c2ws
236
+
237
+ # OPENGL to OPENCV
238
+ c2ws[:, 0:3, 1:3] *= -1
239
+ c2ws = c2ws[:, [1, 0, 2, 3], :]
240
+ c2ws[:, 2, :] *= -1
241
+
242
+ w2cs = torch.inverse(c2ws)
243
+ K = torch.tensor([[[fl_x, 0, cx], [0, fl_y, cy], [0, 0, 1]]]).repeat(len(c2ws), 1, 1)
244
+ Rs = w2cs[:, :3, :3]
245
+ Ts = w2cs[:, :3, 3]
246
+ sizes = torch.tensor([[dh, dw]]).repeat(len(c2ws), 1)
247
+
248
+ # get ray embedding and padding last dimension to 16 (num channels of VAE)
249
+ # rays_od = calculate_rays(K, sizes, Rs, Ts, closest_size[0] // 8, use_plucker=False, do_normalize_cameras=self.do_normalize, normalize_scale=self.c2w_scaling)
250
+ rays = calculate_rays(K, sizes, Rs, Ts, closest_size[0] // 8, do_normalize_cameras=self.do_normalize, normalize_scale=self.c2w_scaling)
251
+ rays = rays.reshape(samples_per_set, closest_size[0] // 8, closest_size[1] // 8, 6)
252
+ # padding = (0, 10) # pad the last dimension to 16
253
+ # rays = torch.nn.functional.pad(rays, padding, "constant", 0)
254
+ rays = torch.cat([rays, rays, rays[..., :4]], dim=-1) * 1.658
255
+
256
+ if os.path.exists(caption_path):
257
+ with open(caption_path, 'r') as f:
258
+ caption_key = random.choice(self.caption_keys)
259
+ caption = json.load(f).get(caption_key, "")
260
+ else:
261
+ caption = ""
262
+
263
+ caption = "[[multiview]] " + caption if caption else "[[multiview]]"
264
+
265
+ return {
266
+ 'pixel_values': images,
267
+ 'rays': rays,
268
+ 'aspect_ratio': closest_ratio,
269
+ 'caption': caption,
270
+ 'height': dh,
271
+ 'width': dw,
272
+ # 'origins': rays_od[..., :3],
273
+ # 'dirs': rays_od[..., 3:6]
274
+ }
275
+ except Exception as e:
276
+ return self.__getitem__(random.randint(0, len(self.scene_folders) - 1))
277
+
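A minimal sketch of wiring the dataset into a loader (the scene glob is a placeholder; since each item stacks a variable number of views, `batch_size=1` or a custom collate is the simplest choice):

```python
from torch.utils.data import DataLoader

dataset = MultiviewDataset(
    scene_folders="data/scenes/*",   # placeholder glob; each scene needs images/ (or images_4/, images_8/) and transforms.json
    samples_per_set=(2, 3),          # sample 2-3 views per scene
    caption_keys="caption",
)
loader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=4)

batch = next(iter(loader))
# batch['pixel_values']: (1, num_views, 3, H, W); batch['rays']: (1, num_views, H/8, W/8, 16)
```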
onediffusion/dataset/raydiff_utils.py ADDED
@@ -0,0 +1,739 @@
1
+
2
+ """
3
+ Adapted from code originally written by David Novotny.
4
+ """
5
+
6
+ import torch
7
+ from pytorch3d.transforms import Rotate, Translate
8
+
9
+ import cv2
10
+ import numpy as np
11
+ import torch
12
+ from pytorch3d.renderer import PerspectiveCameras, RayBundle
13
+
14
+ def intersect_skew_line_groups(p, r, mask):
15
+ # p, r both of shape (B, N, n_intersected_lines, 3)
16
+ # mask of shape (B, N, n_intersected_lines)
17
+ p_intersect, r = intersect_skew_lines_high_dim(p, r, mask=mask)
18
+ if p_intersect is None:
19
+ return None, None, None, None
20
+ _, p_line_intersect = point_line_distance(
21
+ p, r, p_intersect[..., None, :].expand_as(p)
22
+ )
23
+ intersect_dist_squared = ((p_line_intersect - p_intersect[..., None, :]) ** 2).sum(
24
+ dim=-1
25
+ )
26
+ return p_intersect, p_line_intersect, intersect_dist_squared, r
27
+
28
+
29
+ def intersect_skew_lines_high_dim(p, r, mask=None):
30
+ # Implements https://en.wikipedia.org/wiki/Skew_lines In more than two dimensions
31
+ dim = p.shape[-1]
32
+ # make sure the heading vectors are l2-normed
33
+ if mask is None:
34
+ mask = torch.ones_like(p[..., 0])
35
+ r = torch.nn.functional.normalize(r, dim=-1)
36
+
37
+ eye = torch.eye(dim, device=p.device, dtype=p.dtype)[None, None]
38
+ I_min_cov = (eye - (r[..., None] * r[..., None, :])) * mask[..., None, None]
39
+ sum_proj = I_min_cov.matmul(p[..., None]).sum(dim=-3)
40
+
41
+ # I_eps = torch.zeros_like(I_min_cov.sum(dim=-3)) + 1e-10
42
+ # p_intersect = torch.pinverse(I_min_cov.sum(dim=-3) + I_eps).matmul(sum_proj)[..., 0]
43
+ p_intersect = torch.linalg.lstsq(I_min_cov.sum(dim=-3), sum_proj).solution[..., 0]
44
+
45
+ # I_min_cov.sum(dim=-3): torch.Size([1, 1, 3, 3])
46
+ # sum_proj: torch.Size([1, 1, 3, 1])
47
+
48
+ # p_intersect = np.linalg.lstsq(I_min_cov.sum(dim=-3).numpy(), sum_proj.numpy(), rcond=None)[0]
49
+
50
+ if torch.any(torch.isnan(p_intersect)):
51
+ print(p_intersect)
52
+ return None, None
55
+ return p_intersect, r
56
+
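In other words, `intersect_skew_lines_high_dim` finds the least-squares point closest to all (masked) lines: with points $p_i$ and unit directions $d_i$, it minimizes $\sum_i \lVert (I - d_i d_i^\top)(p - p_i) \rVert^2$, whose normal equations

$$ \Big(\sum_i (I - d_i d_i^\top)\Big)\, p^{*} \;=\; \sum_i (I - d_i d_i^\top)\, p_i $$

are solved above with `torch.linalg.lstsq`.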
57
+
58
+ def point_line_distance(p1, r1, p2):
59
+ df = p2 - p1
60
+ proj_vector = df - ((df * r1).sum(dim=-1, keepdim=True) * r1)
61
+ line_pt_nearest = p2 - proj_vector
62
+ d = (proj_vector).norm(dim=-1)
63
+ return d, line_pt_nearest
64
+
65
+
66
+ def compute_optical_axis_intersection(cameras):
67
+ centers = cameras.get_camera_center()
68
+ principal_points = cameras.principal_point
69
+
70
+ one_vec = torch.ones((len(cameras), 1), device=centers.device)
71
+ optical_axis = torch.cat((principal_points, one_vec), -1)
72
+
73
+ # optical_axis = torch.cat(
74
+ # (principal_points, cameras.focal_length[:, 0].unsqueeze(1)), -1
75
+ # )
76
+
77
+ pp = cameras.unproject_points(optical_axis, from_ndc=True, world_coordinates=True)
78
+ pp2 = torch.diagonal(pp, dim1=0, dim2=1).T
79
+
80
+ directions = pp2 - centers
81
+ centers = centers.unsqueeze(0).unsqueeze(0)
82
+ directions = directions.unsqueeze(0).unsqueeze(0)
83
+
84
+ p_intersect, p_line_intersect, _, r = intersect_skew_line_groups(
85
+ p=centers, r=directions, mask=None
86
+ )
87
+
88
+ if p_intersect is None:
89
+ dist = None
90
+ else:
91
+ p_intersect = p_intersect.squeeze().unsqueeze(0)
92
+ dist = (p_intersect - centers).norm(dim=-1)
93
+
94
+ return p_intersect, dist, p_line_intersect, pp2, r
95
+
96
+
97
+ def normalize_cameras(cameras, scale=1.0):
98
+ """
99
+ Normalizes cameras such that the optical axes point to the origin, the rotation is
100
+ identity, and the norm of the translation of the first camera is 1.
101
+
102
+ Args:
103
+ cameras (pytorch3d.renderer.cameras.CamerasBase).
104
+ scale (float): Norm of the translation of the first camera.
105
+
106
+ Returns:
107
+ new_cameras (pytorch3d.renderer.cameras.CamerasBase): Normalized cameras.
108
+ undo_transform (function): Function that undoes the normalization.
109
+ """
110
+
111
+ # Let distance from first camera to origin be unit
112
+ new_cameras = cameras.clone()
113
+ new_transform = (
114
+ new_cameras.get_world_to_view_transform()
115
+ ) # potential R is not valid matrix
116
+ p_intersect, dist, p_line_intersect, pp, r = compute_optical_axis_intersection(
117
+ cameras
118
+ )
119
+
120
+ if p_intersect is None:
121
+ print("Warning: optical axes code has a nan. Returning identity cameras.")
122
+ new_cameras.R[:] = torch.eye(3, device=cameras.R.device, dtype=cameras.R.dtype)
123
+ new_cameras.T[:] = torch.tensor(
124
+ [0, 0, 1], device=cameras.T.device, dtype=cameras.T.dtype
125
+ )
126
+ return new_cameras, lambda x: x
127
+
128
+ d = dist.squeeze(dim=1).squeeze(dim=0)[0]
129
+ # Degenerate case
130
+ if d == 0:
131
+ print(cameras.T)
132
+ print(new_transform.get_matrix()[:, 3, :3])
133
+ assert False
134
+ assert d != 0
135
+
136
+ # Can't figure out how to make scale part of the transform too without messing up R.
137
+ # Ideally, we would just wrap it all in a single Pytorch3D transform so that it
138
+ # would work with any structure (eg PointClouds, Meshes).
139
+ tR = Rotate(new_cameras.R[0].unsqueeze(0)).inverse()
140
+ tT = Translate(p_intersect)
141
+ t = tR.compose(tT)
142
+
143
+ new_transform = t.compose(new_transform)
144
+ new_cameras.R = new_transform.get_matrix()[:, :3, :3]
145
+ new_cameras.T = new_transform.get_matrix()[:, 3, :3] / d * scale
146
+
147
+ def undo_transform(cameras):
148
+ cameras_copy = cameras.clone()
149
+ cameras_copy.T *= d / scale
150
+ new_t = (
151
+ t.inverse().compose(cameras_copy.get_world_to_view_transform()).get_matrix()
152
+ )
153
+ cameras_copy.R = new_t[:, :3, :3]
154
+ cameras_copy.T = new_t[:, 3, :3]
155
+ return cameras_copy
156
+
157
+ return new_cameras, undo_transform
158
+
159
+ def first_camera_transform(cameras, rotation_only=True):
160
+ new_cameras = cameras.clone()
161
+ new_transform = new_cameras.get_world_to_view_transform()
162
+ tR = Rotate(new_cameras.R[0].unsqueeze(0))
163
+ if rotation_only:
164
+ t = tR.inverse()
165
+ else:
166
+ tT = Translate(new_cameras.T[0].unsqueeze(0))
167
+ t = tR.compose(tT).inverse()
168
+
169
+ new_transform = t.compose(new_transform)
170
+ new_cameras.R = new_transform.get_matrix()[:, :3, :3]
171
+ new_cameras.T = new_transform.get_matrix()[:, 3, :3]
172
+
173
+ return new_cameras
174
+
175
+
176
+ def get_identity_cameras_with_intrinsics(cameras):
177
+ D = len(cameras)
178
+ device = cameras.R.device
179
+
180
+ new_cameras = cameras.clone()
181
+ new_cameras.R = torch.eye(3, device=device).unsqueeze(0).repeat((D, 1, 1))
182
+ new_cameras.T = torch.zeros((D, 3), device=device)
183
+
184
+ return new_cameras
185
+
186
+
187
+ def normalize_cameras_batch(cameras, scale=1.0, normalize_first_camera=False):
188
+ new_cameras = []
189
+ undo_transforms = []
190
+ for cam in cameras:
191
+ if normalize_first_camera:
192
+ # Normalize cameras such that first camera is identity and origin is at
193
+ # first camera center.
194
+ normalized_cameras = first_camera_transform(cam, rotation_only=False)
195
+ undo_transform = None
196
+ else:
197
+ normalized_cameras, undo_transform = normalize_cameras(cam, scale=scale)
198
+ new_cameras.append(normalized_cameras)
199
+ undo_transforms.append(undo_transform)
200
+ return new_cameras, undo_transforms
201
+
202
+
203
+ class Rays(object):
204
+ def __init__(
205
+ self,
206
+ rays=None,
207
+ origins=None,
208
+ directions=None,
209
+ moments=None,
210
+ is_plucker=False,
211
+ moments_rescale=1.0,
212
+ ndc_coordinates=None,
213
+ crop_parameters=None,
214
+ num_patches_x=16,
215
+ num_patches_y=16,
216
+ ):
217
+ """
218
+ Ray class to keep track of current ray representation.
219
+
220
+ Args:
221
+ rays: (..., 6).
222
+ origins: (..., 3).
223
+ directions: (..., 3).
224
+ moments: (..., 3).
225
+ is_plucker: If True, rays are in plucker coordinates (Default: False).
226
+ moments_rescale: Rescale the moment component of the rays by a scalar.
227
+ ndc_coordinates: (..., 2): NDC coordinates of each ray.
228
+ """
229
+ if rays is not None:
230
+ self.rays = rays
231
+ self._is_plucker = is_plucker
232
+ elif origins is not None and directions is not None:
233
+ self.rays = torch.cat((origins, directions), dim=-1)
234
+ self._is_plucker = False
235
+ elif directions is not None and moments is not None:
236
+ self.rays = torch.cat((directions, moments), dim=-1)
237
+ self._is_plucker = True
238
+ else:
239
+ raise Exception("Invalid combination of arguments")
240
+
241
+ if moments_rescale != 1.0:
242
+ self.rescale_moments(moments_rescale)
243
+
244
+ if ndc_coordinates is not None:
245
+ self.ndc_coordinates = ndc_coordinates
246
+ elif crop_parameters is not None:
247
+ # (..., H, W, 2)
248
+ xy_grid = compute_ndc_coordinates(
249
+ crop_parameters,
250
+ num_patches_x=num_patches_x,
251
+ num_patches_y=num_patches_y,
252
+ )[..., :2]
253
+ xy_grid = xy_grid.reshape(*xy_grid.shape[:-3], -1, 2)
254
+ self.ndc_coordinates = xy_grid
255
+ else:
256
+ self.ndc_coordinates = None
257
+
258
+ def __getitem__(self, index):
259
+ return Rays(
260
+ rays=self.rays[index],
261
+ is_plucker=self._is_plucker,
262
+ ndc_coordinates=(
263
+ self.ndc_coordinates[index]
264
+ if self.ndc_coordinates is not None
265
+ else None
266
+ ),
267
+ )
268
+
269
+ def to_spatial(self, include_ndc_coordinates=False):
270
+ """
271
+ Converts rays to spatial representation: (..., H * W, 6) --> (..., 6, H, W)
272
+
273
+ Returns:
274
+ torch.Tensor: (..., 6, H, W)
275
+ """
276
+ rays = self.to_plucker().rays
277
+ *batch_dims, P, D = rays.shape
278
+ H = W = int(np.sqrt(P))
279
+ assert H * W == P
280
+ rays = torch.transpose(rays, -1, -2) # (..., 6, H * W)
281
+ rays = rays.reshape(*batch_dims, D, H, W)
282
+ if include_ndc_coordinates:
283
+ ndc_coords = self.ndc_coordinates.transpose(-1, -2) # (..., 2, H * W)
284
+ ndc_coords = ndc_coords.reshape(*batch_dims, 2, H, W)
285
+ rays = torch.cat((rays, ndc_coords), dim=-3)
286
+ return rays
287
+
288
+ def rescale_moments(self, scale):
289
+ """
290
+ Rescale the moment component of the rays by a scalar. Might be desirable since
291
+ moments may come from a very narrow distribution.
292
+
293
+ Note that this modifies in place!
294
+ """
295
+ if self.is_plucker:
296
+ self.rays[..., 3:] *= scale
297
+ return self
298
+ else:
299
+ return self.to_plucker().rescale_moments(scale)
300
+
301
+ @classmethod
302
+ def from_spatial(cls, rays, moments_rescale=1.0, ndc_coordinates=None):
303
+ """
304
+ Converts rays from spatial representation: (..., 6, H, W) --> (..., H * W, 6)
305
+
306
+ Args:
307
+ rays: (..., 6, H, W)
308
+
309
+ Returns:
310
+ Rays: (..., H * W, 6)
311
+ """
312
+ *batch_dims, D, H, W = rays.shape
313
+ rays = rays.reshape(*batch_dims, D, H * W)
314
+ rays = torch.transpose(rays, -1, -2)
315
+ return cls(
316
+ rays=rays,
317
+ is_plucker=True,
318
+ moments_rescale=moments_rescale,
319
+ ndc_coordinates=ndc_coordinates,
320
+ )
321
+
322
+ def to_point_direction(self, normalize_moment=True):
323
+ """
324
+ Convert to point direction representation <O, D>.
325
+
326
+ Returns:
327
+ rays: (..., 6).
328
+ """
329
+ if self._is_plucker:
330
+ direction = torch.nn.functional.normalize(self.rays[..., :3], dim=-1)
331
+ moment = self.rays[..., 3:]
332
+ if normalize_moment:
333
+ c = torch.linalg.norm(direction, dim=-1, keepdim=True)
334
+ moment = moment / c
335
+ points = torch.cross(direction, moment, dim=-1)
336
+ return Rays(
337
+ rays=torch.cat((points, direction), dim=-1),
338
+ is_plucker=False,
339
+ ndc_coordinates=self.ndc_coordinates,
340
+ )
341
+ else:
342
+ return self
343
+
344
+ def to_plucker(self):
345
+ """
346
+ Convert to plucker representation <D, OxD>.
347
+ """
348
+ if self.is_plucker:
349
+ return self
350
+ else:
351
+ ray = self.rays.clone()
352
+ ray_origins = ray[..., :3]
353
+ ray_directions = ray[..., 3:]
354
+ # Normalize ray directions to unit vectors
355
+ ray_directions = ray_directions / ray_directions.norm(dim=-1, keepdim=True)
356
+ plucker_normal = torch.cross(ray_origins, ray_directions, dim=-1)
357
+ new_ray = torch.cat([ray_directions, plucker_normal], dim=-1)
358
+ return Rays(
359
+ rays=new_ray, is_plucker=True, ndc_coordinates=self.ndc_coordinates
360
+ )
361
+
362
+ def get_directions(self, normalize=True):
363
+ if self.is_plucker:
364
+ directions = self.rays[..., :3]
365
+ else:
366
+ directions = self.rays[..., 3:]
367
+ if normalize:
368
+ directions = torch.nn.functional.normalize(directions, dim=-1)
369
+ return directions
370
+
371
+ def get_origins(self):
372
+ if self.is_plucker:
373
+ origins = self.to_point_direction().get_origins()
374
+ else:
375
+ origins = self.rays[..., :3]
376
+ return origins
377
+
378
+ def get_moments(self):
379
+ if self.is_plucker:
380
+ moments = self.rays[..., 3:]
381
+ else:
382
+ moments = self.to_plucker().get_moments()
383
+ return moments
384
+
385
+ def get_ndc_coordinates(self):
386
+ return self.ndc_coordinates
387
+
388
+ @property
389
+ def is_plucker(self):
390
+ return self._is_plucker
391
+
392
+ @property
393
+ def device(self):
394
+ return self.rays.device
395
+
396
+ def __repr__(self, *args, **kwargs):
397
+ ray_str = self.rays.__repr__(*args, **kwargs)[6:] # remove "tensor"
398
+ if self._is_plucker:
399
+ return "PluRay" + ray_str
400
+ else:
401
+ return "DirRay" + ray_str
402
+
403
+ def to(self, device):
404
+ self.rays = self.rays.to(device)
405
+
406
+ def clone(self):
407
+ return Rays(rays=self.rays.clone(), is_plucker=self._is_plucker)
408
+
409
+ @property
410
+ def shape(self):
411
+ return self.rays.shape
412
+
413
+ def visualize(self):
414
+ directions = torch.nn.functional.normalize(self.get_directions(), dim=-1).cpu()
415
+ moments = torch.nn.functional.normalize(self.get_moments(), dim=-1).cpu()
416
+ return (directions + 1) / 2, (moments + 1) / 2
417
+
418
+ def to_ray_bundle(self, length=0.3, recenter=True):
419
+ lengths = torch.ones_like(self.get_origins()[..., :2]) * length
420
+ lengths[..., 0] = 0
421
+ if recenter:
422
+ centers, _ = intersect_skew_lines_high_dim(
423
+ self.get_origins(), self.get_directions()
424
+ )
425
+ centers = centers.unsqueeze(1).repeat(1, lengths.shape[1], 1)
426
+ else:
427
+ centers = self.get_origins()
428
+ return RayBundle(
429
+ origins=centers,
430
+ directions=self.get_directions(),
431
+ lengths=lengths,
432
+ xys=self.get_directions(),
433
+ )
434
+
435
+
436
+ def cameras_to_rays(
437
+ cameras,
438
+ crop_parameters,
439
+ use_half_pix=True,
440
+ use_plucker=True,
441
+ num_patches_x=16,
442
+ num_patches_y=16,
443
+ ):
444
+ """
445
+ Unprojects rays from camera center to grid on image plane.
446
+
447
+ Args:
448
+ cameras: Pytorch3D cameras to unproject. Can be batched.
449
+ crop_parameters: Crop parameters in NDC (cc_x, cc_y, crop_width, scale).
450
+ Shape is (B, 4).
451
+ use_half_pix: If True, use half pixel offset (Default: True).
452
+ use_plucker: If True, return rays in plucker coordinates (Default: True).
453
+ num_patches_x: Number of patches in x direction (Default: 16).
454
+ num_patches_y: Number of patches in y direction (Default: 16).
455
+ """
456
+ unprojected = []
457
+ crop_parameters_list = (
458
+ crop_parameters if crop_parameters is not None else [None for _ in cameras]
459
+ )
460
+ for camera, crop_param in zip(cameras, crop_parameters_list):
461
+ xyd_grid = compute_ndc_coordinates(
462
+ crop_parameters=crop_param,
463
+ use_half_pix=use_half_pix,
464
+ num_patches_x=num_patches_x,
465
+ num_patches_y=num_patches_y,
466
+ )
467
+
468
+ unprojected.append(
469
+ camera.unproject_points(
470
+ xyd_grid.reshape(-1, 3), world_coordinates=True, from_ndc=True
471
+ )
472
+ )
473
+ unprojected = torch.stack(unprojected, dim=0) # (N, P, 3)
474
+ origins = cameras.get_camera_center().unsqueeze(1) # (N, 1, 3)
475
+ origins = origins.repeat(1, num_patches_x * num_patches_y, 1) # (N, P, 3)
476
+ directions = unprojected - origins
477
+
478
+ rays = Rays(
479
+ origins=origins,
480
+ directions=directions,
481
+ crop_parameters=crop_parameters,
482
+ num_patches_x=num_patches_x,
483
+ num_patches_y=num_patches_y,
484
+ )
485
+ if use_plucker:
486
+ return rays.to_plucker()
487
+ return rays
488
+
489
+
490
+ def rays_to_cameras(
491
+ rays,
492
+ crop_parameters,
493
+ num_patches_x=16,
494
+ num_patches_y=16,
495
+ use_half_pix=True,
496
+ sampled_ray_idx=None,
497
+ cameras=None,
498
+ focal_length=(3.453,),
499
+ ):
500
+ """
501
+ If cameras are provided, will use those intrinsics. Otherwise will use the provided
502
+ focal_length(s). Dataset default is 3.32.
503
+
504
+ Args:
505
+ rays (Rays): (N, P, 6)
506
+ crop_parameters (torch.Tensor): (N, 4)
507
+ """
508
+ device = rays.device
509
+ origins = rays.get_origins()
510
+ directions = rays.get_directions()
511
+ camera_centers, _ = intersect_skew_lines_high_dim(origins, directions)
512
+
513
+ # Retrieve target rays
514
+ if cameras is None:
515
+ if len(focal_length) == 1:
516
+ focal_length = focal_length * rays.shape[0]
517
+ I_camera = PerspectiveCameras(focal_length=focal_length, device=device)
518
+ else:
519
+ # Use same intrinsics but reset to identity extrinsics.
520
+ I_camera = cameras.clone()
521
+ I_camera.R[:] = torch.eye(3, device=device)
522
+ I_camera.T[:] = torch.zeros(3, device=device)
523
+ I_patch_rays = cameras_to_rays(
524
+ cameras=I_camera,
525
+ num_patches_x=num_patches_x,
526
+ num_patches_y=num_patches_y,
527
+ use_half_pix=use_half_pix,
528
+ crop_parameters=crop_parameters,
529
+ ).get_directions()
530
+
531
+ if sampled_ray_idx is not None:
532
+ I_patch_rays = I_patch_rays[:, sampled_ray_idx]
533
+
534
+ # Compute optimal rotation to align rays
535
+ R = torch.zeros_like(I_camera.R)
536
+ for i in range(len(I_camera)):
537
+ R[i] = compute_optimal_rotation_alignment(
538
+ I_patch_rays[i],
539
+ directions[i],
540
+ )
541
+
542
+ # Construct and return rotated camera
543
+ cam = I_camera.clone()
544
+ cam.R = R
545
+ cam.T = -torch.matmul(R.transpose(1, 2), camera_centers.unsqueeze(2)).squeeze(2)
546
+ return cam
547
+
548
+
549
+ # https://www.reddit.com/r/learnmath/comments/v1crd7/linear_algebra_qr_to_ql_decomposition/
550
+ def ql_decomposition(A):
551
+ P = torch.tensor([[0, 0, 1], [0, 1, 0], [1, 0, 0]], device=A.device).float()
552
+ A_tilde = torch.matmul(A, P)
553
+ Q_tilde, R_tilde = torch.linalg.qr(A_tilde)
554
+ Q = torch.matmul(Q_tilde, P)
555
+ L = torch.matmul(torch.matmul(P, R_tilde), P)
556
+ d = torch.diag(L)
557
+ Q[:, 0] *= torch.sign(d[0])
558
+ Q[:, 1] *= torch.sign(d[1])
559
+ Q[:, 2] *= torch.sign(d[2])
560
+ L[0] *= torch.sign(d[0])
561
+ L[1] *= torch.sign(d[1])
562
+ L[2] *= torch.sign(d[2])
563
+ return Q, L
564
+
565
+
566
+ def rays_to_cameras_homography(
567
+ rays,
568
+ crop_parameters,
569
+ num_patches_x=16,
570
+ num_patches_y=16,
571
+ use_half_pix=True,
572
+ sampled_ray_idx=None,
573
+ reproj_threshold=0.2,
574
+ ):
575
+ """
576
+ Args:
577
+ rays (Rays): (N, P, 6)
578
+ crop_parameters (torch.Tensor): (N, 4)
579
+ """
580
+ device = rays.device
581
+ origins = rays.get_origins()
582
+ directions = rays.get_directions()
583
+ camera_centers, _ = intersect_skew_lines_high_dim(origins, directions)
584
+
585
+ # Retrieve target rays
586
+ I_camera = PerspectiveCameras(focal_length=[1] * rays.shape[0], device=device)
587
+ I_patch_rays = cameras_to_rays(
588
+ cameras=I_camera,
589
+ num_patches_x=num_patches_x,
590
+ num_patches_y=num_patches_y,
591
+ use_half_pix=use_half_pix,
592
+ crop_parameters=crop_parameters,
593
+ ).get_directions()
594
+
595
+ if sampled_ray_idx is not None:
596
+ I_patch_rays = I_patch_rays[:, sampled_ray_idx]
597
+
598
+ # Compute optimal rotation to align rays
599
+ Rs = []
600
+ focal_lengths = []
601
+ principal_points = []
602
+ for i in range(rays.shape[-3]):
603
+ R, f, pp = compute_optimal_rotation_intrinsics(
604
+ I_patch_rays[i],
605
+ directions[i],
606
+ reproj_threshold=reproj_threshold,
607
+ )
608
+ Rs.append(R)
609
+ focal_lengths.append(f)
610
+ principal_points.append(pp)
611
+
612
+ R = torch.stack(Rs)
613
+ focal_lengths = torch.stack(focal_lengths)
614
+ principal_points = torch.stack(principal_points)
615
+ T = -torch.matmul(R.transpose(1, 2), camera_centers.unsqueeze(2)).squeeze(2)
616
+ return PerspectiveCameras(
617
+ R=R,
618
+ T=T,
619
+ focal_length=focal_lengths,
620
+ principal_point=principal_points,
621
+ device=device,
622
+ )
623
+
624
+
625
+ def compute_optimal_rotation_alignment(A, B):
626
+ """
627
+ Compute optimal R that minimizes: || A - B @ R ||_F
628
+
629
+ Args:
630
+ A (torch.Tensor): (N, 3)
631
+ B (torch.Tensor): (N, 3)
632
+
633
+ Returns:
634
+ R (torch.tensor): (3, 3)
635
+ """
636
+ # normally with R @ B, this would be A @ B.T
637
+ H = B.T @ A
638
+ U, _, Vh = torch.linalg.svd(H, full_matrices=True)
639
+ s = torch.linalg.det(U @ Vh)
640
+ S_prime = torch.diag(torch.tensor([1, 1, torch.sign(s)], device=A.device))
641
+ return U @ S_prime @ Vh
642
+
643
+
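`compute_optimal_rotation_alignment` above is a plain orthogonal Procrustes solve, minimizing || A - B @ R ||_F over rotations. A quick, hedged check that it recovers a known rotation (again assuming the function is importable from this module):

import math
import torch

c, s = math.cos(0.3), math.sin(0.3)
R_true = torch.tensor([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])  # rotation about the z-axis
B = torch.randn(100, 3)
A = B @ R_true                                   # directions B rotated by R_true
R_est = compute_optimal_rotation_alignment(A, B)
print(torch.allclose(R_est, R_true, atol=1e-4))  # True up to float32 precision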
644
+ def compute_optimal_rotation_intrinsics(
645
+ rays_origin, rays_target, z_threshold=1e-4, reproj_threshold=0.2
646
+ ):
647
+ """
648
+ Note: for some reason, f seems to be 1/f.
649
+
650
+ Args:
651
+ rays_origin (torch.Tensor): (N, 3)
652
+ rays_target (torch.Tensor): (N, 3)
653
+ z_threshold (float): Threshold for z value to be considered valid.
654
+
655
+ Returns:
656
+ R (torch.tensor): (3, 3)
657
+ focal_length (torch.tensor): (2,)
658
+ principal_point (torch.tensor): (2,)
659
+ """
660
+ device = rays_origin.device
661
+ z_mask = torch.logical_and(
662
+ torch.abs(rays_target) > z_threshold, torch.abs(rays_origin) > z_threshold
663
+ )[:, 2]
664
+ rays_target = rays_target[z_mask]
665
+ rays_origin = rays_origin[z_mask]
666
+ rays_origin = rays_origin[:, :2] / rays_origin[:, -1:]
667
+ rays_target = rays_target[:, :2] / rays_target[:, -1:]
668
+
669
+ A, _ = cv2.findHomography(
670
+ rays_origin.cpu().numpy(),
671
+ rays_target.cpu().numpy(),
672
+ cv2.RANSAC,
673
+ reproj_threshold,
674
+ )
675
+ A = torch.from_numpy(A).float().to(device)
676
+
677
+ if torch.linalg.det(A) < 0:
678
+ A = -A
679
+
680
+ R, L = ql_decomposition(A)
681
+ L = L / L[2][2]
682
+
683
+ f = torch.stack((L[0][0], L[1][1]))
684
+ pp = torch.stack((L[2][0], L[2][1]))
685
+ return R, f, pp
686
+
687
+
688
+ def compute_ndc_coordinates(
689
+ crop_parameters=None,
690
+ use_half_pix=True,
691
+ num_patches_x=16,
692
+ num_patches_y=16,
693
+ device=None,
694
+ ):
695
+ """
696
+ Computes NDC Grid using crop_parameters. If crop_parameters is not provided,
697
+ then it assumes that the crop is the entire image (corresponding to an NDC grid
698
+ where top left corner is (1, 1) and bottom right corner is (-1, -1)).
699
+ """
700
+ if crop_parameters is None:
701
+ cc_x, cc_y, width = 0, 0, 2
702
+ else:
703
+ if len(crop_parameters.shape) > 1:
704
+ return torch.stack(
705
+ [
706
+ compute_ndc_coordinates(
707
+ crop_parameters=crop_param,
708
+ use_half_pix=use_half_pix,
709
+ num_patches_x=num_patches_x,
710
+ num_patches_y=num_patches_y,
711
+ )
712
+ for crop_param in crop_parameters
713
+ ],
714
+ dim=0,
715
+ )
716
+ device = crop_parameters.device
717
+ cc_x, cc_y, width, _ = crop_parameters
718
+
719
+ dx = 1 / num_patches_x
720
+ dy = 1 / num_patches_y
721
+ if use_half_pix:
722
+ min_y = 1 - dy
723
+ max_y = -min_y
724
+ min_x = 1 - dx
725
+ max_x = -min_x
726
+ else:
727
+ min_y = min_x = 1
728
+ max_y = -1 + 2 * dy
729
+ max_x = -1 + 2 * dx
730
+
731
+ y, x = torch.meshgrid(
732
+ torch.linspace(min_y, max_y, num_patches_y, dtype=torch.float32, device=device),
733
+ torch.linspace(min_x, max_x, num_patches_x, dtype=torch.float32, device=device),
734
+ indexing="ij",
735
+ )
736
+ x_prime = x * width / 2 - cc_x
737
+ y_prime = y * width / 2 - cc_y
738
+ xyd_grid = torch.stack([x_prime, y_prime, torch.ones_like(x)], dim=-1)
739
+ return xyd_grid
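This closes out the ray utilities. For reference, calling `compute_ndc_coordinates` with no crop parameters yields a full-image NDC grid of shape (num_patches_y, num_patches_x, 3) spanning [-1, 1] with half-pixel offsets; a minimal sketch, assuming the function is importable:

import torch

grid = compute_ndc_coordinates(num_patches_x=16, num_patches_y=16, use_half_pix=True)
print(grid.shape)   # torch.Size([16, 16, 3])
print(grid[0, 0])   # top-left patch centre: tensor([0.9375, 0.9375, 1.0000])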
onediffusion/dataset/transforms.py ADDED
@@ -0,0 +1,133 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+
4
+ def crop(image, i, j, h, w):
5
+ """
6
+ Args:
7
+ image (torch.tensor): Image to be cropped. Size is (C, H, W)
8
+ """
9
+ if len(image.size()) != 3:
10
+ raise ValueError("image should be a 3D tensor")
11
+ return image[..., i : i + h, j : j + w]
12
+
13
+ def resize(image, target_size, interpolation_mode):
14
+ if len(target_size) != 2:
15
+ raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
16
+ return F.interpolate(image.unsqueeze(0), size=target_size, mode=interpolation_mode, align_corners=False).squeeze(0)
17
+
18
+ def resize_scale(image, target_size, interpolation_mode):
19
+ if len(target_size) != 2:
20
+ raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
21
+ H, W = image.size(-2), image.size(-1)
22
+ scale_ = target_size[0] / min(H, W)
23
+ return F.interpolate(image.unsqueeze(0), scale_factor=scale_, mode=interpolation_mode, align_corners=False).squeeze(0)
24
+
25
+ def resized_crop(image, i, j, h, w, size, interpolation_mode="bilinear"):
26
+ """
27
+ Do spatial cropping and resizing to the image
28
+ Args:
29
+ image (torch.tensor): Image to be cropped. Size is (C, H, W)
30
+ i (int): i in (i, j), i.e., the row coordinate of the upper left corner.
31
+ j (int): j in (i, j), i.e., the column coordinate of the upper left corner.
32
+ h (int): Height of the cropped region.
33
+ w (int): Width of the cropped region.
34
+ size (tuple(int, int)): height and width of resized image
35
+ Returns:
36
+ image (torch.tensor): Resized and cropped image. Size is (C, H, W)
37
+ """
38
+ if len(image.size()) != 3:
39
+ raise ValueError("image should be a 3D torch.tensor")
40
+ image = crop(image, i, j, h, w)
41
+ image = resize(image, size, interpolation_mode)
42
+ return image
43
+
44
+ def center_crop(image, crop_size):
45
+ if len(image.size()) != 3:
46
+ raise ValueError("image should be a 3D torch.tensor")
47
+ h, w = image.size(-2), image.size(-1)
48
+ th, tw = crop_size
49
+ if h < th or w < tw:
50
+ raise ValueError("height and width must be no smaller than crop_size")
51
+ i = int(round((h - th) / 2.0))
52
+ j = int(round((w - tw) / 2.0))
53
+ return crop(image, i, j, th, tw)
54
+
55
+ def center_crop_using_short_edge(image):
56
+ if len(image.size()) != 3:
57
+ raise ValueError("image should be a 3D torch.tensor")
58
+ h, w = image.size(-2), image.size(-1)
59
+ if h < w:
60
+ th, tw = h, h
61
+ i = 0
62
+ j = int(round((w - tw) / 2.0))
63
+ else:
64
+ th, tw = w, w
65
+ i = int(round((h - th) / 2.0))
66
+ j = 0
67
+ return crop(image, i, j, th, tw)
68
+
69
+ class CenterCropResizeImage:
70
+ """
71
+ Resize the image while maintaining aspect ratio, and then crop it to the desired size.
72
+ The resizing is done such that the area of padding/cropping is minimized.
73
+ """
74
+ def __init__(self, size, interpolation_mode="bilinear"):
75
+ if isinstance(size, tuple):
76
+ if len(size) != 2:
77
+ raise ValueError(f"Size should be a tuple (height, width), instead got {size}")
78
+ self.size = size
79
+ else:
80
+ self.size = (size, size)
81
+ self.interpolation_mode = interpolation_mode
82
+
83
+ def __call__(self, image):
84
+ """
85
+ Args:
86
+ image (torch.Tensor): Image to be resized and cropped. Size is (C, H, W)
87
+
88
+ Returns:
89
+ torch.Tensor: Resized and cropped image. Size is (C, target_height, target_width)
90
+ """
91
+ target_height, target_width = self.size
92
+ target_aspect = target_width / target_height
93
+
94
+ # Get current image shape and aspect ratio
95
+ _, height, width = image.shape
96
+ height, width = float(height), float(width)
97
+ current_aspect = width / height
98
+
99
+ # Calculate crop dimensions
100
+ if current_aspect > target_aspect:
101
+ # Image is wider than target, crop width
102
+ crop_height = height
103
+ crop_width = height * target_aspect
104
+ else:
105
+ # Image is taller than target, crop height
106
+ crop_height = width / target_aspect
107
+ crop_width = width
108
+
109
+ # Calculate crop coordinates (center crop)
110
+ y1 = (height - crop_height) / 2
111
+ x1 = (width - crop_width) / 2
112
+
113
+ # Perform the crop
114
+ cropped_image = crop(image, int(y1), int(x1), int(crop_height), int(crop_width))
115
+
116
+ # Resize the cropped image to the target size
117
+ resized_image = resize(cropped_image, self.size, self.interpolation_mode)
118
+
119
+ return resized_image
120
+
121
+ # Example usage
122
+ if __name__ == "__main__":
123
+ # Create a sample image tensor
124
+ sample_image = torch.rand(3, 480, 640) # (C, H, W)
125
+
126
+ # Initialize the transform
127
+ transform = CenterCropResizeImage(size=(224, 224), interpolation_mode="bilinear")
128
+
129
+ # Apply the transform
130
+ transformed_image = transform(sample_image)
131
+
132
+ print(f"Original image shape: {sample_image.shape}")
133
+ print(f"Transformed image shape: {transformed_image.shape}")
onediffusion/dataset/utils.py ADDED
@@ -0,0 +1,175 @@
1
+
2
+ ASPECT_RATIO_2880 = {
3
+ '0.25': [1408.0, 5760.0], '0.26': [1408.0, 5568.0], '0.27': [1408.0, 5376.0], '0.28': [1408.0, 5184.0],
4
+ '0.32': [1600.0, 4992.0], '0.33': [1600.0, 4800.0], '0.34': [1600.0, 4672.0], '0.4': [1792.0, 4480.0],
5
+ '0.42': [1792.0, 4288.0], '0.47': [1920.0, 4096.0], '0.49': [1920.0, 3904.0], '0.51': [1920.0, 3776.0],
6
+ '0.55': [2112.0, 3840.0], '0.59': [2112.0, 3584.0], '0.68': [2304.0, 3392.0], '0.72': [2304.0, 3200.0],
7
+ '0.78': [2496.0, 3200.0], '0.83': [2496.0, 3008.0], '0.89': [2688.0, 3008.0], '0.93': [2688.0, 2880.0],
8
+ '1.0': [2880.0, 2880.0], '1.07': [2880.0, 2688.0], '1.12': [3008.0, 2688.0], '1.21': [3008.0, 2496.0],
9
+ '1.28': [3200.0, 2496.0], '1.39': [3200.0, 2304.0], '1.47': [3392.0, 2304.0], '1.7': [3584.0, 2112.0],
10
+ '1.82': [3840.0, 2112.0], '2.03': [3904.0, 1920.0], '2.13': [4096.0, 1920.0], '2.39': [4288.0, 1792.0],
11
+ '2.5': [4480.0, 1792.0], '2.92': [4672.0, 1600.0], '3.0': [4800.0, 1600.0], '3.12': [4992.0, 1600.0],
12
+ '3.68': [5184.0, 1408.0], '3.82': [5376.0, 1408.0], '3.95': [5568.0, 1408.0], '4.0': [5760.0, 1408.0]
13
+ }
14
+
15
+ ASPECT_RATIO_2048 = {
16
+ '0.25': [1024.0, 4096.0], '0.26': [1024.0, 3968.0], '0.27': [1024.0, 3840.0], '0.28': [1024.0, 3712.0],
17
+ '0.32': [1152.0, 3584.0], '0.33': [1152.0, 3456.0], '0.35': [1152.0, 3328.0], '0.4': [1280.0, 3200.0],
18
+ '0.42': [1280.0, 3072.0], '0.48': [1408.0, 2944.0], '0.5': [1408.0, 2816.0], '0.52': [1408.0, 2688.0],
19
+ '0.57': [1536.0, 2688.0], '0.6': [1536.0, 2560.0], '0.68': [1664.0, 2432.0], '0.72': [1664.0, 2304.0],
20
+ '0.78': [1792.0, 2304.0], '0.82': [1792.0, 2176.0], '0.88': [1920.0, 2176.0], '0.94': [1920.0, 2048.0],
21
+ '1.0': [2048.0, 2048.0], '1.07': [2048.0, 1920.0], '1.13': [2176.0, 1920.0], '1.21': [2176.0, 1792.0],
22
+ '1.29': [2304.0, 1792.0], '1.38': [2304.0, 1664.0], '1.46': [2432.0, 1664.0], '1.67': [2560.0, 1536.0],
23
+ '1.75': [2688.0, 1536.0], '2.0': [2816.0, 1408.0], '2.09': [2944.0, 1408.0], '2.4': [3072.0, 1280.0],
24
+ '2.5': [3200.0, 1280.0], '2.89': [3328.0, 1152.0], '3.0': [3456.0, 1152.0], '3.11': [3584.0, 1152.0],
25
+ '3.62': [3712.0, 1024.0], '3.75': [3840.0, 1024.0], '3.88': [3968.0, 1024.0], '4.0': [4096.0, 1024.0]
26
+ }
27
+
28
+ ASPECT_RATIO_1024 = {
29
+ '0.25': [512., 2048.], '0.26': [512., 1984.], '0.27': [512., 1920.], '0.28': [512., 1856.],
30
+ '0.32': [576., 1792.], '0.33': [576., 1728.], '0.35': [576., 1664.], '0.4': [640., 1600.],
31
+ '0.42': [640., 1536.], '0.48': [704., 1472.], '0.5': [704., 1408.], '0.52': [704., 1344.],
32
+ '0.57': [768., 1344.], '0.6': [768., 1280.], '0.68': [832., 1216.], '0.72': [832., 1152.],
33
+ '0.78': [896., 1152.], '0.82': [896., 1088.], '0.88': [960., 1088.], '0.94': [960., 1024.],
34
+ '1.0': [1024., 1024.], '1.07': [1024., 960.], '1.13': [1088., 960.], '1.21': [1088., 896.],
35
+ '1.29': [1152., 896.], '1.38': [1152., 832.], '1.46': [1216., 832.], '1.67': [1280., 768.],
36
+ '1.75': [1344., 768.], '2.0': [1408., 704.], '2.09': [1472., 704.], '2.4': [1536., 640.],
37
+ '2.5': [1600., 640.], '2.89': [1664., 576.], '3.0': [1728., 576.], '3.11': [1792., 576.],
38
+ '3.62': [1856., 512.], '3.75': [1920., 512.], '3.88': [1984., 512.], '4.0': [2048., 512.],
39
+ }
40
+
41
+ ASPECT_RATIO_512 = {
42
+ '0.25': [256.0, 1024.0], '0.26': [256.0, 992.0], '0.27': [256.0, 960.0], '0.28': [256.0, 928.0],
43
+ '0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0],
44
+ '0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0],
45
+ '0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0],
46
+ '0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': [480.0, 512.0],
47
+ '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0],
48
+ '1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0],
49
+ '1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': [736.0, 352.0], '2.4': [768.0, 320.0],
50
+ '2.5': [800.0, 320.0], '2.89': [832.0, 288.0], '3.0': [864.0, 288.0], '3.11': [896.0, 288.0],
51
+ '3.62': [928.0, 256.0], '3.75': [960.0, 256.0], '3.88': [992.0, 256.0], '4.0': [1024.0, 256.0]
52
+ }
53
+
54
+
55
+ ASPECT_RATIO_384 = {
56
+ '0.25': [192.0, 768.0],
57
+ '0.26': [192.0, 736.0],
58
+ '0.27': [208.0, 768.0],
59
+ '0.28': [208.0, 736.0],
60
+ '0.33': [240.0, 720.0],
61
+ '0.4': [256.0, 640.0],
62
+ '0.42': [304.0, 720.0],
63
+ '0.48': [368.0, 768.0],
64
+ '0.5': [384.0, 768.0],
65
+ '0.52': [384.0, 736.0],
66
+ '0.57': [384.0, 672.0],
67
+ '0.6': [384.0, 640.0],
68
+ '0.73': [384.0, 528.0],
69
+ '0.77': [384.0, 496.0],
70
+ '0.83': [384.0, 464.0],
71
+ '0.89': [384.0, 432.0],
72
+ '0.92': [384.0, 416.0],
73
+ '1.0': [384.0, 384.0],
74
+ '1.09': [384.0, 352.0],
75
+ '1.14': [384.0, 336.0],
76
+ '1.2': [384.0, 320.0],
77
+ '1.26': [384.0, 304.0],
78
+ '1.33': [384.0, 288.0],
79
+ '1.41': [384.0, 272.0],
80
+ '1.6': [384.0, 240.0],
81
+ '1.71': [384.0, 224.0],
82
+ '2.0': [384.0, 192.0],
83
+ '2.4': [384.0, 160.0],
84
+ '2.88': [368.0, 128.0],
85
+ '3.0': [384.0, 128.0],
86
+ '3.43': [384.0, 112.0],
87
+ '4.0': [384.0, 96.0]
88
+ }
89
+
90
+ ASPECT_RATIO_256 = {
91
+ '0.25': [128.0, 512.0], '0.26': [128.0, 496.0], '0.27': [128.0, 480.0], '0.28': [128.0, 464.0],
92
+ '0.32': [144.0, 448.0], '0.33': [144.0, 432.0], '0.35': [144.0, 416.0], '0.4': [160.0, 400.0],
93
+ '0.42': [160.0, 384.0], '0.48': [176.0, 368.0], '0.5': [176.0, 352.0], '0.52': [176.0, 336.0],
94
+ '0.57': [192.0, 336.0], '0.6': [192.0, 320.0], '0.68': [208.0, 304.0], '0.72': [208.0, 288.0],
95
+ '0.78': [224.0, 288.0], '0.82': [224.0, 272.0], '0.88': [240.0, 272.0], '0.94': [240.0, 256.0],
96
+ '1.0': [256.0, 256.0], '1.07': [256.0, 240.0], '1.13': [272.0, 240.0], '1.21': [272.0, 224.0],
97
+ '1.29': [288.0, 224.0], '1.38': [288.0, 208.0], '1.46': [304.0, 208.0], '1.67': [320.0, 192.0],
98
+ '1.75': [336.0, 192.0], '2.0': [352.0, 176.0], '2.09': [368.0, 176.0], '2.4': [384.0, 160.0],
99
+ '2.5': [400.0, 160.0], '2.89': [416.0, 144.0], '3.0': [432.0, 144.0], '3.11': [448.0, 144.0],
100
+ '3.62': [464.0, 128.0], '3.75': [480.0, 128.0], '3.88': [496.0, 128.0], '4.0': [512.0, 128.0]
101
+ }
102
+
103
+ ASPECT_RATIO_256_TEST = {
104
+ '0.25': [128.0, 512.0], '0.28': [128.0, 464.0],
105
+ '0.32': [144.0, 448.0], '0.33': [144.0, 432.0], '0.35': [144.0, 416.0], '0.4': [160.0, 400.0],
106
+ '0.42': [160.0, 384.0], '0.48': [176.0, 368.0], '0.5': [176.0, 352.0], '0.52': [176.0, 336.0],
107
+ '0.57': [192.0, 336.0], '0.6': [192.0, 320.0], '0.68': [208.0, 304.0], '0.72': [208.0, 288.0],
108
+ '0.78': [224.0, 288.0], '0.82': [224.0, 272.0], '0.88': [240.0, 272.0], '0.94': [240.0, 256.0],
109
+ '1.0': [256.0, 256.0], '1.07': [256.0, 240.0], '1.13': [272.0, 240.0], '1.21': [272.0, 224.0],
110
+ '1.29': [288.0, 224.0], '1.38': [288.0, 208.0], '1.46': [304.0, 208.0], '1.67': [320.0, 192.0],
111
+ '1.75': [336.0, 192.0], '2.0': [352.0, 176.0], '2.09': [368.0, 176.0], '2.4': [384.0, 160.0],
112
+ '2.5': [400.0, 160.0], '3.0': [432.0, 144.0],
113
+ '4.0': [512.0, 128.0]
114
+ }
115
+
116
+ ASPECT_RATIO_512_TEST = {
117
+ '0.25': [256.0, 1024.0], '0.28': [256.0, 928.0],
118
+ '0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0],
119
+ '0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0],
120
+ '0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0],
121
+ '0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': [480.0, 512.0],
122
+ '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0],
123
+ '1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0],
124
+ '1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': [736.0, 352.0], '2.4': [768.0, 320.0],
125
+ '2.5': [800.0, 320.0], '3.0': [864.0, 288.0],
126
+ '4.0': [1024.0, 256.0]
127
+ }
128
+
129
+ ASPECT_RATIO_1024_TEST = {
130
+ '0.25': [512., 2048.], '0.28': [512., 1856.],
131
+ '0.32': [576., 1792.], '0.33': [576., 1728.], '0.35': [576., 1664.], '0.4': [640., 1600.],
132
+ '0.42': [640., 1536.], '0.48': [704., 1472.], '0.5': [704., 1408.], '0.52': [704., 1344.],
133
+ '0.57': [768., 1344.], '0.6': [768., 1280.], '0.68': [832., 1216.], '0.72': [832., 1152.],
134
+ '0.78': [896., 1152.], '0.82': [896., 1088.], '0.88': [960., 1088.], '0.94': [960., 1024.],
135
+ '1.0': [1024., 1024.], '1.07': [1024., 960.], '1.13': [1088., 960.], '1.21': [1088., 896.],
136
+ '1.29': [1152., 896.], '1.38': [1152., 832.], '1.46': [1216., 832.], '1.67': [1280., 768.],
137
+ '1.75': [1344., 768.], '2.0': [1408., 704.], '2.09': [1472., 704.], '2.4': [1536., 640.],
138
+ '2.5': [1600., 640.], '3.0': [1728., 576.],
139
+ '4.0': [2048., 512.],
140
+ }
141
+
142
+ ASPECT_RATIO_2048_TEST = {
143
+ '0.25': [1024.0, 4096.0], '0.26': [1024.0, 3968.0],
144
+ '0.32': [1152.0, 3584.0], '0.33': [1152.0, 3456.0], '0.35': [1152.0, 3328.0], '0.4': [1280.0, 3200.0],
145
+ '0.42': [1280.0, 3072.0], '0.48': [1408.0, 2944.0], '0.5': [1408.0, 2816.0], '0.52': [1408.0, 2688.0],
146
+ '0.57': [1536.0, 2688.0], '0.6': [1536.0, 2560.0], '0.68': [1664.0, 2432.0], '0.72': [1664.0, 2304.0],
147
+ '0.78': [1792.0, 2304.0], '0.82': [1792.0, 2176.0], '0.88': [1920.0, 2176.0], '0.94': [1920.0, 2048.0],
148
+ '1.0': [2048.0, 2048.0], '1.07': [2048.0, 1920.0], '1.13': [2176.0, 1920.0], '1.21': [2176.0, 1792.0],
149
+ '1.29': [2304.0, 1792.0], '1.38': [2304.0, 1664.0], '1.46': [2432.0, 1664.0], '1.67': [2560.0, 1536.0],
150
+ '1.75': [2688.0, 1536.0], '2.0': [2816.0, 1408.0], '2.09': [2944.0, 1408.0], '2.4': [3072.0, 1280.0],
151
+ '2.5': [3200.0, 1280.0], '3.0': [3456.0, 1152.0],
152
+ '4.0': [4096.0, 1024.0]
153
+ }
154
+
155
+ ASPECT_RATIO_2880_TEST = {
156
+ '0.25': [2048.0, 8192.0], '0.26': [2048.0, 7936.0],
157
+ '0.32': [2304.0, 7168.0], '0.33': [2304.0, 6912.0], '0.35': [2304.0, 6656.0], '0.4': [2560.0, 6400.0],
158
+ '0.42': [2560.0, 6144.0], '0.48': [2816.0, 5888.0], '0.5': [2816.0, 5632.0], '0.52': [2816.0, 5376.0],
159
+ '0.57': [3072.0, 5376.0], '0.6': [3072.0, 5120.0], '0.68': [3328.0, 4864.0], '0.72': [3328.0, 4608.0],
160
+ '0.78': [3584.0, 4608.0], '0.82': [3584.0, 4352.0], '0.88': [3840.0, 4352.0], '0.94': [3840.0, 4096.0],
161
+ '1.0': [4096.0, 4096.0], '1.07': [4096.0, 3840.0], '1.13': [4352.0, 3840.0], '1.21': [4352.0, 3584.0],
162
+ '1.29': [4608.0, 3584.0], '1.38': [4608.0, 3328.0], '1.46': [4864.0, 3328.0], '1.67': [5120.0, 3072.0],
163
+ '1.75': [5376.0, 3072.0], '2.0': [5632.0, 2816.0], '2.09': [5888.0, 2816.0], '2.4': [6144.0, 2560.0],
164
+ '2.5': [6400.0, 2560.0], '3.0': [6912.0, 2304.0],
165
+ '4.0': [8192.0, 2048.0],
166
+ }
167
+
168
+ def get_chunks(lst, n):
169
+ for i in range(0, len(lst), n):
170
+ yield lst[i:i + n]
171
+
172
+ def get_closest_ratio(height: float, width: float, ratios: dict):
173
+ aspect_ratio = height / width
174
+ closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - aspect_ratio))
175
+ return ratios[closest_ratio], float(closest_ratio)
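These dictionaries define the aspect-ratio buckets used for multi-resolution handling: keys are height/width ratios, values are the bucketed [height, width]. A small worked example with `get_closest_ratio` and the `ASPECT_RATIO_512` table above:

size, ratio = get_closest_ratio(height=720, width=1280, ratios=ASPECT_RATIO_512)
print(size, ratio)  # [384.0, 672.0] 0.57  (720/1280 = 0.5625, nearest bucket key is '0.57')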
onediffusion/diffusion/pipelines/image_processor.py ADDED
@@ -0,0 +1,674 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ import warnings
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+ import PIL.Image
21
+ import torch
22
+ import torch.nn.functional as F
23
+ import torchvision.transforms as T
24
+ from PIL import Image, ImageFilter, ImageOps
25
+
26
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
27
+ from diffusers.utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate
28
+
29
+ from onediffusion.dataset.transforms import CenterCropResizeImage
30
+
31
+ PipelineImageInput = Union[
32
+ PIL.Image.Image,
33
+ np.ndarray,
34
+ torch.Tensor,
35
+ List[PIL.Image.Image],
36
+ List[np.ndarray],
37
+ List[torch.Tensor],
38
+ ]
39
+
40
+ PipelineDepthInput = PipelineImageInput
41
+
42
+
43
+ def is_valid_image(image):
44
+ return isinstance(image, PIL.Image.Image) or isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim in (2, 3)
45
+
46
+
47
+ def is_valid_image_imagelist(images):
48
+ # check if the image input is one of the supported formats for image and image list:
49
+ # it can be one of the following 3 formats:
50
+ # (1) a 4d pytorch tensor or numpy array,
51
+ # (2) a valid image: PIL.Image.Image, 2-d np.ndarray or torch.Tensor (grayscale image), 3-d np.ndarray or torch.Tensor
52
+ # (3) a list of valid images
53
+ if isinstance(images, (np.ndarray, torch.Tensor)) and images.ndim == 4:
54
+ return True
55
+ elif is_valid_image(images):
56
+ return True
57
+ elif isinstance(images, list):
58
+ return all(is_valid_image(image) for image in images)
59
+ return False
60
+
61
+
62
+ class VaeImageProcessorOneDiffuser(ConfigMixin):
63
+ """
64
+ Image processor for VAE.
65
+
66
+ Args:
67
+ do_resize (`bool`, *optional*, defaults to `True`):
68
+ Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
69
+ `height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method.
70
+ vae_scale_factor (`int`, *optional*, defaults to `8`):
71
+ VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
72
+ resample (`str`, *optional*, defaults to `lanczos`):
73
+ Resampling filter to use when resizing the image.
74
+ do_normalize (`bool`, *optional*, defaults to `True`):
75
+ Whether to normalize the image to [-1,1].
76
+ do_binarize (`bool`, *optional*, defaults to `False`):
77
+ Whether to binarize the image to 0/1.
78
+ do_convert_rgb (`bool`, *optional*, defaults to be `False`):
79
+ Whether to convert the images to RGB format.
80
+ do_convert_grayscale (`bool`, *optional*, defaults to be `False`):
81
+ Whether to convert the images to grayscale format.
82
+ """
83
+
84
+ config_name = CONFIG_NAME
85
+
86
+ @register_to_config
87
+ def __init__(
88
+ self,
89
+ do_resize: bool = True,
90
+ vae_scale_factor: int = 8,
91
+ vae_latent_channels: int = 4,
92
+ resample: str = "lanczos",
93
+ do_normalize: bool = True,
94
+ do_binarize: bool = False,
95
+ do_convert_rgb: bool = False,
96
+ do_convert_grayscale: bool = False,
97
+ ):
98
+ super().__init__()
99
+ if do_convert_rgb and do_convert_grayscale:
100
+ raise ValueError(
101
+ "`do_convert_rgb` and `do_convert_grayscale` can not both be set to `True`,"
102
+ " if you intended to convert the image into RGB format, please set `do_convert_grayscale = False`.",
103
+ " if you intended to convert the image into grayscale format, please set `do_convert_rgb = False`",
104
+ )
105
+
106
+ @staticmethod
107
+ def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
108
+ """
109
+ Convert a numpy image or a batch of images to a PIL image.
110
+ """
111
+ if images.ndim == 3:
112
+ images = images[None, ...]
113
+ images = (images * 255).round().astype("uint8")
114
+ if images.shape[-1] == 1:
115
+ # special case for grayscale (single channel) images
116
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
117
+ else:
118
+ pil_images = [Image.fromarray(image) for image in images]
119
+
120
+ return pil_images
121
+
122
+ @staticmethod
123
+ def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
124
+ """
125
+ Convert a PIL image or a list of PIL images to NumPy arrays.
126
+ """
127
+ if not isinstance(images, list):
128
+ images = [images]
129
+ images = [np.array(image).astype(np.float32) / 255.0 for image in images]
130
+ images = np.stack(images, axis=0)
131
+
132
+ return images
133
+
134
+ @staticmethod
135
+ def numpy_to_pt(images: np.ndarray) -> torch.Tensor:
136
+ """
137
+ Convert a NumPy image to a PyTorch tensor.
138
+ """
139
+ if images.ndim == 3:
140
+ images = images[..., None]
141
+
142
+ images = torch.from_numpy(images.transpose(0, 3, 1, 2))
143
+ return images
144
+
145
+ @staticmethod
146
+ def pt_to_numpy(images: torch.Tensor) -> np.ndarray:
147
+ """
148
+ Convert a PyTorch tensor to a NumPy image.
149
+ """
150
+ images = images.cpu().permute(0, 2, 3, 1).float().numpy()
151
+ return images
152
+
153
+ @staticmethod
154
+ def normalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
155
+ """
156
+ Normalize an image array to [-1,1].
157
+ """
158
+ return 2.0 * images - 1.0
159
+
160
+ @staticmethod
161
+ def denormalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
162
+ """
163
+ Denormalize an image array to [0,1].
164
+ """
165
+ return (images / 2 + 0.5).clamp(0, 1)
166
+
167
+ @staticmethod
168
+ def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image:
169
+ """
170
+ Converts a PIL image to RGB format.
171
+ """
172
+ image = image.convert("RGB")
173
+
174
+ return image
175
+
176
+ @staticmethod
177
+ def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image:
178
+ """
179
+ Converts a PIL image to grayscale format.
180
+ """
181
+ image = image.convert("L")
182
+
183
+ return image
184
+
185
+ @staticmethod
186
+ def blur(image: PIL.Image.Image, blur_factor: int = 4) -> PIL.Image.Image:
187
+ """
188
+ Applies Gaussian blur to an image.
189
+ """
190
+ image = image.filter(ImageFilter.GaussianBlur(blur_factor))
191
+
192
+ return image
193
+
194
+ @staticmethod
195
+ def get_crop_region(mask_image: PIL.Image.Image, width: int, height: int, pad=0):
196
+ """
197
+ Finds a rectangular region that contains all masked ares in an image, and expands region to match the aspect
198
+ ratio of the original image; for example, if user drew mask in a 128x32 region, and the dimensions for
199
+ processing are 512x512, the region will be expanded to 128x128.
200
+
201
+ Args:
202
+ mask_image (PIL.Image.Image): Mask image.
203
+ width (int): Width of the image to be processed.
204
+ height (int): Height of the image to be processed.
205
+ pad (int, optional): Padding to be added to the crop region. Defaults to 0.
206
+
207
+ Returns:
208
+ tuple: (x1, y1, x2, y2) represent a rectangular region that contains all masked ares in an image and
209
+ matches the original aspect ratio.
210
+ """
211
+
212
+ mask_image = mask_image.convert("L")
213
+ mask = np.array(mask_image)
214
+
215
+ # 1. find a rectangular region that contains all masked ares in an image
216
+ h, w = mask.shape
217
+ crop_left = 0
218
+ for i in range(w):
219
+ if not (mask[:, i] == 0).all():
220
+ break
221
+ crop_left += 1
222
+
223
+ crop_right = 0
224
+ for i in reversed(range(w)):
225
+ if not (mask[:, i] == 0).all():
226
+ break
227
+ crop_right += 1
228
+
229
+ crop_top = 0
230
+ for i in range(h):
231
+ if not (mask[i] == 0).all():
232
+ break
233
+ crop_top += 1
234
+
235
+ crop_bottom = 0
236
+ for i in reversed(range(h)):
237
+ if not (mask[i] == 0).all():
238
+ break
239
+ crop_bottom += 1
240
+
241
+ # 2. add padding to the crop region
242
+ x1, y1, x2, y2 = (
243
+ int(max(crop_left - pad, 0)),
244
+ int(max(crop_top - pad, 0)),
245
+ int(min(w - crop_right + pad, w)),
246
+ int(min(h - crop_bottom + pad, h)),
247
+ )
248
+
249
+ # 3. expands crop region to match the aspect ratio of the image to be processed
250
+ ratio_crop_region = (x2 - x1) / (y2 - y1)
251
+ ratio_processing = width / height
252
+
253
+ if ratio_crop_region > ratio_processing:
254
+ desired_height = (x2 - x1) / ratio_processing
255
+ desired_height_diff = int(desired_height - (y2 - y1))
256
+ y1 -= desired_height_diff // 2
257
+ y2 += desired_height_diff - desired_height_diff // 2
258
+ if y2 >= mask_image.height:
259
+ diff = y2 - mask_image.height
260
+ y2 -= diff
261
+ y1 -= diff
262
+ if y1 < 0:
263
+ y2 -= y1
264
+ y1 -= y1
265
+ if y2 >= mask_image.height:
266
+ y2 = mask_image.height
267
+ else:
268
+ desired_width = (y2 - y1) * ratio_processing
269
+ desired_width_diff = int(desired_width - (x2 - x1))
270
+ x1 -= desired_width_diff // 2
271
+ x2 += desired_width_diff - desired_width_diff // 2
272
+ if x2 >= mask_image.width:
273
+ diff = x2 - mask_image.width
274
+ x2 -= diff
275
+ x1 -= diff
276
+ if x1 < 0:
277
+ x2 -= x1
278
+ x1 -= x1
279
+ if x2 >= mask_image.width:
280
+ x2 = mask_image.width
281
+
282
+ return x1, y1, x2, y2
283
+
284
+ def _resize_and_fill(
285
+ self,
286
+ image: PIL.Image.Image,
287
+ width: int,
288
+ height: int,
289
+ ) -> PIL.Image.Image:
290
+ """
291
+ Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
292
+ the image within the dimensions, filling empty with data from image.
293
+
294
+ Args:
295
+ image: The image to resize.
296
+ width: The width to resize the image to.
297
+ height: The height to resize the image to.
298
+ """
299
+
300
+ ratio = width / height
301
+ src_ratio = image.width / image.height
302
+
303
+ src_w = width if ratio < src_ratio else image.width * height // image.height
304
+ src_h = height if ratio >= src_ratio else image.height * width // image.width
305
+
306
+ resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"])
307
+ res = Image.new("RGB", (width, height))
308
+ res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
309
+
310
+ if ratio < src_ratio:
311
+ fill_height = height // 2 - src_h // 2
312
+ if fill_height > 0:
313
+ res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0))
314
+ res.paste(
315
+ resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)),
316
+ box=(0, fill_height + src_h),
317
+ )
318
+ elif ratio > src_ratio:
319
+ fill_width = width // 2 - src_w // 2
320
+ if fill_width > 0:
321
+ res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0))
322
+ res.paste(
323
+ resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)),
324
+ box=(fill_width + src_w, 0),
325
+ )
326
+
327
+ return res
328
+
329
+ def _resize_and_crop(
330
+ self,
331
+ image: PIL.Image.Image,
332
+ width: int,
333
+ height: int,
334
+ ) -> PIL.Image.Image:
335
+ """
336
+ Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
337
+ the image within the dimensions, cropping the excess.
338
+
339
+ Args:
340
+ image: The image to resize.
341
+ width: The width to resize the image to.
342
+ height: The height to resize the image to.
343
+ """
344
+ ratio = width / height
345
+ src_ratio = image.width / image.height
346
+
347
+ src_w = width if ratio > src_ratio else image.width * height // image.height
348
+ src_h = height if ratio <= src_ratio else image.height * width // image.width
349
+
350
+ resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"])
351
+ res = Image.new("RGB", (width, height))
352
+ res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
353
+ return res
354
+
355
+ def resize(
356
+ self,
357
+ image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
358
+ height: int,
359
+ width: int,
360
+ resize_mode: str = "default", # "default", "fill", "crop"
361
+ ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]:
362
+ """
363
+ Resize image.
364
+
365
+ Args:
366
+ image (`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`):
367
+ The image input, can be a PIL image, numpy array or pytorch tensor.
368
+ height (`int`):
369
+ The height to resize to.
370
+ width (`int`):
371
+ The width to resize to.
372
+ resize_mode (`str`, *optional*, defaults to `default`):
373
+ The resize mode to use, can be one of `default` or `fill`. If `default`, will resize the image to fit
374
+ within the specified width and height, and it may not maintaining the original aspect ratio. If `fill`,
375
+ will resize the image to fit within the specified width and height, maintaining the aspect ratio, and
376
+ then center the image within the dimensions, filling empty with data from image. If `crop`, will resize
377
+ the image to fit within the specified width and height, maintaining the aspect ratio, and then center
378
+ the image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
379
+ supported for PIL image input.
380
+
381
+ Returns:
382
+ `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`:
383
+ The resized image.
384
+ """
385
+ if resize_mode != "default" and not isinstance(image, PIL.Image.Image):
386
+ raise ValueError(f"Only PIL image input is supported for resize_mode {resize_mode}")
387
+ if isinstance(image, PIL.Image.Image):
388
+ if resize_mode == "default":
389
+ image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample])
390
+ elif resize_mode == "fill":
391
+ image = self._resize_and_fill(image, width, height)
392
+ elif resize_mode == "crop":
393
+ image = self._resize_and_crop(image, width, height)
394
+ else:
395
+ raise ValueError(f"resize_mode {resize_mode} is not supported")
396
+
397
+ elif isinstance(image, torch.Tensor):
398
+ image = torch.nn.functional.interpolate(
399
+ image,
400
+ size=(height, width),
401
+ )
402
+ elif isinstance(image, np.ndarray):
403
+ image = self.numpy_to_pt(image)
404
+ image = torch.nn.functional.interpolate(
405
+ image,
406
+ size=(height, width),
407
+ )
408
+ image = self.pt_to_numpy(image)
409
+ return image
410
+
411
+ def binarize(self, image: PIL.Image.Image) -> PIL.Image.Image:
412
+ """
413
+ Create a mask.
414
+
415
+ Args:
416
+ image (`PIL.Image.Image`):
417
+ The image input, should be a PIL image.
418
+
419
+ Returns:
420
+ `PIL.Image.Image`:
421
+ The binarized image. Values less than 0.5 are set to 0, values greater than 0.5 are set to 1.
422
+ """
423
+ image[image < 0.5] = 0
424
+ image[image >= 0.5] = 1
425
+
426
+ return image
427
+
428
+ def get_default_height_width(
429
+ self,
430
+ image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
431
+ height: Optional[int] = None,
432
+ width: Optional[int] = None,
433
+ ) -> Tuple[int, int]:
434
+ """
435
+ This function return the height and width that are downscaled to the next integer multiple of
436
+ `vae_scale_factor`.
437
+
438
+ Args:
439
+ image(`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`):
440
+ The image input, can be a PIL image, numpy array or pytorch tensor. if it is a numpy array, should have
441
+ shape `[batch, height, width]` or `[batch, height, width, channel]` if it is a pytorch tensor, should
442
+ have shape `[batch, channel, height, width]`.
443
+ height (`int`, *optional*, defaults to `None`):
444
+ The height in preprocessed image. If `None`, will use the height of `image` input.
445
+ width (`int`, *optional*`, defaults to `None`):
446
+ The width in preprocessed. If `None`, will use the width of the `image` input.
447
+ """
448
+
449
+ if height is None:
450
+ if isinstance(image, PIL.Image.Image):
451
+ height = image.height
452
+ elif isinstance(image, torch.Tensor):
453
+ height = image.shape[2]
454
+ else:
455
+ height = image.shape[1]
456
+
457
+ if width is None:
458
+ if isinstance(image, PIL.Image.Image):
459
+ width = image.width
460
+ elif isinstance(image, torch.Tensor):
461
+ width = image.shape[3]
462
+ else:
463
+ width = image.shape[2]
464
+
465
+ width, height = (
466
+ x - x % self.config.vae_scale_factor for x in (width, height)
467
+ ) # resize to integer multiple of vae_scale_factor
468
+
469
+ return height, width
470
+
471
+ def preprocess(
472
+ self,
473
+ image: PipelineImageInput,
474
+ height: Optional[int] = None,
475
+ width: Optional[int] = None,
476
+ resize_mode: str = "default", # "default", "fill", "crop"
477
+ crops_coords: Optional[Tuple[int, int, int, int]] = None,
478
+ do_crop: bool = True,
479
+ ) -> torch.Tensor:
480
+ """
481
+ Preprocess the image input.
482
+
483
+ Args:
484
+ image (`pipeline_image_input`):
485
+ The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of
486
+ supported formats.
487
+ height (`int`, *optional*, defaults to `None`):
488
+ The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
489
+ height.
490
+ width (`int`, *optional*`, defaults to `None`):
491
+ The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width.
492
+ resize_mode (`str`, *optional*, defaults to `default`):
493
+ The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
494
+ the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, will
495
+ resize the image to fit within the specified width and height, maintaining the aspect ratio, and then
496
+ center the image within the dimensions, filling empty with data from image. If `crop`, will resize the
497
+ image to fit within the specified width and height, maintaining the aspect ratio, and then center the
498
+ image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
499
+ supported for PIL image input.
500
+ crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
501
+ The crop coordinates for each image in the batch. If `None`, will not crop the image.
502
+ """
503
+ supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
504
+
505
+ # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image
506
+ if self.config.do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3:
507
+ if isinstance(image, torch.Tensor):
508
+ # if image is a pytorch tensor could have 2 possible shapes:
509
+ # 1. batch x height x width: we should insert the channel dimension at position 1
510
+ # 2. channel x height x width: we should insert batch dimension at position 0,
511
+ # however, since both channel and batch dimension has same size 1, it is same to insert at position 1
512
+ # for simplicity, we insert a dimension of size 1 at position 1 for both cases
513
+ image = image.unsqueeze(1)
514
+ else:
515
+ # if it is a numpy array, it could have 2 possible shapes:
516
+ # 1. batch x height x width: insert channel dimension on last position
517
+ # 2. height x width x channel: insert batch dimension on first position
518
+ if image.shape[-1] == 1:
519
+ image = np.expand_dims(image, axis=0)
520
+ else:
521
+ image = np.expand_dims(image, axis=-1)
522
+
523
+ if isinstance(image, list) and isinstance(image[0], np.ndarray) and image[0].ndim == 4:
524
+ warnings.warn(
525
+ "Passing `image` as a list of 4d np.ndarray is deprecated."
526
+ "Please concatenate the list along the batch dimension and pass it as a single 4d np.ndarray",
527
+ FutureWarning,
528
+ )
529
+ image = np.concatenate(image, axis=0)
530
+ if isinstance(image, list) and isinstance(image[0], torch.Tensor) and image[0].ndim == 4:
531
+ warnings.warn(
532
+ "Passing `image` as a list of 4d torch.Tensor is deprecated."
533
+ "Please concatenate the list along the batch dimension and pass it as a single 4d torch.Tensor",
534
+ FutureWarning,
535
+ )
536
+ image = torch.cat(image, axis=0)
537
+
538
+ if not is_valid_image_imagelist(image):
539
+ raise ValueError(
540
+ f"Input is in incorrect format. Currently, we only support {', '.join(str(x) for x in supported_formats)}"
541
+ )
542
+ if not isinstance(image, list):
543
+ image = [image]
544
+
545
+ if isinstance(image[0], PIL.Image.Image):
546
+ pass
547
+ elif isinstance(image[0], np.ndarray):
548
+ image = self.numpy_to_pil(image)
549
+ elif isinstance(image[0], torch.Tensor):
550
+ image = self.pt_to_numpy(image)
551
+ image = self.numpy_to_pil(image)
552
+
553
+ if do_crop:
554
+ transforms = T.Compose([
555
+ T.Lambda(lambda image: image.convert('RGB')),
556
+ T.ToTensor(),
557
+ CenterCropResizeImage((height, width)),
558
+ T.Normalize([.5], [.5]),
559
+ ])
560
+ else:
561
+ transforms = T.Compose([
562
+ T.Lambda(lambda image: image.convert('RGB')),
563
+ T.ToTensor(),
564
+ T.Resize((height, width)),
565
+ T.Normalize([.5], [.5]),
566
+ ])
567
+ image = torch.stack([transforms(i) for i in image])
568
+
569
+ # expected range [0,1], normalize to [-1,1]
570
+ do_normalize = self.config.do_normalize
571
+ if do_normalize and image.min() < 0:
572
+ warnings.warn(
573
+ "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
574
+ f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]",
575
+ FutureWarning,
576
+ )
577
+ do_normalize = False
578
+ if do_normalize:
579
+ image = self.normalize(image)
580
+
581
+ if self.config.do_binarize:
582
+ image = self.binarize(image)
583
+
584
+ return image
585
+
586
+ def postprocess(
587
+ self,
588
+ image: torch.Tensor,
589
+ output_type: str = "pil",
590
+ do_denormalize: Optional[List[bool]] = None,
591
+ ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]:
592
+ """
593
+ Postprocess the image output from tensor to `output_type`.
594
+
595
+ Args:
596
+ image (`torch.Tensor`):
597
+ The image input, should be a pytorch tensor with shape `B x C x H x W`.
598
+ output_type (`str`, *optional*, defaults to `pil`):
599
+ The output type of the image, can be one of `pil`, `np`, `pt`, `latent`.
600
+ do_denormalize (`List[bool]`, *optional*, defaults to `None`):
601
+ Whether to denormalize the image to [0,1]. If `None`, will use the value of `do_normalize` in the
602
+ `VaeImageProcessor` config.
603
+
604
+ Returns:
605
+ `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`:
606
+ The postprocessed image.
607
+ """
608
+ if not isinstance(image, torch.Tensor):
609
+ raise ValueError(
610
+ f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
611
+ )
612
+ if output_type not in ["latent", "pt", "np", "pil"]:
613
+ deprecation_message = (
614
+ f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
615
+ "`pil`, `np`, `pt`, `latent`"
616
+ )
617
+ deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
618
+ output_type = "np"
619
+
620
+ if output_type == "latent":
621
+ return image
622
+
623
+ if do_denormalize is None:
624
+ do_denormalize = [self.config.do_normalize] * image.shape[0]
625
+
626
+ image = torch.stack(
627
+ [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])]
628
+ )
629
+
630
+ if output_type == "pt":
631
+ return image
632
+
633
+ image = self.pt_to_numpy(image)
634
+
635
+ if output_type == "np":
636
+ return image
637
+
638
+ if output_type == "pil":
639
+ return self.numpy_to_pil(image)
640
+
641
+ def apply_overlay(
642
+ self,
643
+ mask: PIL.Image.Image,
644
+ init_image: PIL.Image.Image,
645
+ image: PIL.Image.Image,
646
+ crop_coords: Optional[Tuple[int, int, int, int]] = None,
647
+ ) -> PIL.Image.Image:
648
+ """
649
+ overlay the inpaint output to the original image
650
+ """
651
+
652
+ width, height = image.width, image.height
653
+
654
+ init_image = self.resize(init_image, width=width, height=height)
655
+ mask = self.resize(mask, width=width, height=height)
656
+
657
+ init_image_masked = PIL.Image.new("RGBa", (width, height))
658
+ init_image_masked.paste(init_image.convert("RGBA").convert("RGBa"), mask=ImageOps.invert(mask.convert("L")))
659
+ init_image_masked = init_image_masked.convert("RGBA")
660
+
661
+ if crop_coords is not None:
662
+ x, y, x2, y2 = crop_coords
663
+ w = x2 - x
664
+ h = y2 - y
665
+ base_image = PIL.Image.new("RGBA", (width, height))
666
+ image = self.resize(image, height=h, width=w, resize_mode="crop")
667
+ base_image.paste(image, (x, y))
668
+ image = base_image.convert("RGB")
669
+
670
+ image = image.convert("RGBA")
671
+ image.alpha_composite(init_image_masked)
672
+ image = image.convert("RGB")
673
+
674
+ return image
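This processor mirrors diffusers' `VaeImageProcessor`; the main departure is the `do_crop` branch of `preprocess`, which routes inputs through `CenterCropResizeImage` so the aspect ratio is preserved before resizing. A hedged usage sketch (the sizes and colors below are illustrative, not taken from the repo):

from PIL import Image

proc = VaeImageProcessorOneDiffuser(vae_scale_factor=8)
img = Image.new("RGB", (640, 480), color=(128, 64, 32))  # any RGB PIL image
batch = proc.preprocess(img, height=512, width=512)      # tensor of shape (1, 3, 512, 512), values in [-1, 1]
pil_out = proc.postprocess(batch, output_type="pil")     # back to a list of PIL images
print(batch.shape, pil_out[0].size)                      # torch.Size([1, 3, 512, 512]) (512, 512)

Note that the transform stack already maps pixels to [-1, 1], so the trailing normalize step in `preprocess` is skipped (with a warning) rather than applied twice.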
onediffusion/diffusion/pipelines/onediffusion.py ADDED
@@ -0,0 +1,1080 @@
1
+ import einops
2
+ import inspect
3
+ import torch
4
+ import numpy as np
5
+ import PIL
6
+ import os
7
+
8
+ from dataclasses import dataclass
9
+ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler
10
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
11
+ from diffusers.utils import (
12
+ CONFIG_NAME,
13
+ DEPRECATED_REVISION_ARGS,
14
+ BaseOutput,
15
+ PushToHubMixin,
16
+ deprecate,
17
+ is_accelerate_available,
18
+ is_accelerate_version,
19
+ is_torch_npu_available,
20
+ is_torch_version,
21
+ logging,
22
+ numpy_to_pil,
23
+ replace_example_docstring,
24
+ )
25
+ from diffusers.models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, ModelMixin
26
+ from diffusers.utils.torch_utils import randn_tensor
27
+ from diffusers.utils import BaseOutput
28
+ # from diffusers.image_processor import VaeImageProcessor
29
+ from transformers import T5EncoderModel, T5Tokenizer
30
+ from typing import Any, Callable, Dict, List, Optional, Union
31
+ from PIL import Image
32
+
33
+ from onediffusion.models.denoiser.nextdit import NextDiT
34
+ from onediffusion.dataset.utils import *
35
+ from onediffusion.dataset.multitask.multiview import calculate_rays
36
+ from onediffusion.diffusion.pipelines.image_processor import VaeImageProcessorOneDiffuser
37
+
38
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
39
+
40
+ SUPPORTED_DEVICE_MAP = ["balanced"]
41
+
42
+ EXAMPLE_DOC_STRING = """
43
+ Examples:
44
+ ```py
45
+ >>> import torch
46
+ >>> from one_diffusion import OneDiffusionPipeline
47
+
48
+ >>> pipe = OneDiffusionPipeline.from_pretrained("path_to_one_diffuser_model")
49
+ >>> pipe = pipe.to("cuda")
50
+
51
+ >>> prompt = "A beautiful sunset over the ocean"
52
+ >>> image = pipe(prompt).images[0]
53
+ >>> image.save("beautiful_sunset.png")
54
+ ```
55
+ """
56
+
57
+ def create_c2w_matrix(azimuth_deg, elevation_deg, distance=1.0, target=np.array([0, 0, 0])):
58
+ """
59
+ Create a Camera-to-World (C2W) matrix from azimuth and elevation angles.
60
+
61
+ Parameters:
62
+ - azimuth_deg: Azimuth angle in degrees.
63
+ - elevation_deg: Elevation angle in degrees.
64
+ - distance: Distance from the target point.
65
+ - target: The point the camera is looking at in world coordinates.
66
+
67
+ Returns:
68
+ - C2W: A 4x4 NumPy array representing the Camera-to-World transformation matrix.
69
+ """
70
+ # Convert angles from degrees to radians
71
+ azimuth = np.deg2rad(azimuth_deg)
72
+ elevation = np.deg2rad(elevation_deg)
73
+
74
+ # Spherical to Cartesian conversion for camera position
75
+ x = distance * np.cos(elevation) * np.cos(azimuth)
76
+ y = distance * np.cos(elevation) * np.sin(azimuth)
77
+ z = distance * np.sin(elevation)
78
+ camera_position = np.array([x, y, z])
79
+
80
+ # Define the forward vector (from camera to target)
81
+ target = 2*camera_position - target
82
+ forward = target - camera_position
83
+ forward /= np.linalg.norm(forward)
84
+
85
+ # Define the world up vector
86
+ world_up = np.array([0, 0, 1])
87
+
88
+ # Compute the right vector
89
+ right = np.cross(world_up, forward)
90
+ if np.linalg.norm(right) < 1e-6:
91
+ # Handle the singularity when forward is parallel to world_up
92
+ world_up = np.array([0, 1, 0])
93
+ right = np.cross(world_up, forward)
94
+ right /= np.linalg.norm(right)
95
+
96
+ # Recompute the orthogonal up vector
97
+ up = np.cross(forward, right)
98
+
99
+ # Construct the rotation matrix
100
+ rotation = np.vstack([right, up, forward]).T # 3x3
101
+
102
+ # Construct the full C2W matrix
103
+ C2W = np.eye(4)
104
+ C2W[:3, :3] = rotation
105
+ C2W[:3, 3] = camera_position
106
+
107
+ return C2W
108
+
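A quick numerical check of `create_c2w_matrix` (assuming the helper is importable): at zero azimuth and elevation with unit distance, the camera sits at (1, 0, 0) and its forward axis, the third rotation column, points along +x.

import numpy as np

C2W = create_c2w_matrix(azimuth_deg=0.0, elevation_deg=0.0, distance=1.0)
print(np.round(C2W[:3, 3], 3))  # camera position: [1. 0. 0.]
print(np.round(C2W[:3, 2], 3))  # forward direction: [1. 0. 0.]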
109
+ @dataclass
110
+ class OneDiffusionPipelineOutput(BaseOutput):
111
+ """
112
+ Output class for OneDiffusion pipelines.
113
+
114
+ Args:
115
+ images (`List[PIL.Image.Image]` or `np.ndarray`)
116
+ List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
117
+ num_channels)`. PIL images or numpy array representing the denoised images of the diffusion pipeline.
118
+ """
119
+
120
+ images: Union[List[Image.Image], np.ndarray]
121
+ latents: Optional[torch.Tensor] = None
122
+
123
+
124
+ def retrieve_latents(
125
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
126
+ ):
127
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
128
+ return encoder_output.latent_dist.sample(generator)
129
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
130
+ return encoder_output.latent_dist.mode()
131
+ elif hasattr(encoder_output, "latents"):
132
+ return encoder_output.latents
133
+ else:
134
+ raise AttributeError("Could not access latents of provided encoder_output")
135
+
136
+
137
+ def calculate_shift(
138
+ image_seq_len,
139
+ base_seq_len: int = 256,
140
+ max_seq_len: int = 4096,
141
+ base_shift: float = 0.5,
142
+ max_shift: float = 1.16,
143
+ # max_clip: float = 1.5,
144
+ ):
145
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)  # 0.66 / 3840 ≈ 0.000171875 with the defaults
146
+ b = base_shift - m * base_seq_len  # 0.5 - 0.000171875 * 256 = 0.456 with the defaults
147
+ mu = image_seq_len * m + b
148
+ # mu = min(mu, max_clip)
149
+ return mu
150
+
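With the defaults above, `calculate_shift` is a linear interpolation of the time-shift parameter in the sequence length: m = (1.16 - 0.5) / (4096 - 256) ≈ 0.000171875 and b ≈ 0.456, so an image sequence length of 1024 gives mu ≈ 0.000171875 * 1024 + 0.456 ≈ 0.632, and the maximum length 4096 gives mu = max_shift = 1.16.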
151
+
152
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
153
+ def retrieve_timesteps(
154
+ scheduler,
155
+ num_inference_steps: Optional[int] = None,
156
+ device: Optional[Union[str, torch.device]] = None,
157
+ timesteps: Optional[List[int]] = None,
158
+ sigmas: Optional[List[float]] = None,
159
+ **kwargs,
160
+ ):
161
+ """
162
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
163
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
164
+
165
+ Args:
166
+ scheduler (`SchedulerMixin`):
167
+ The scheduler to get timesteps from.
168
+ num_inference_steps (`int`):
169
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
170
+ must be `None`.
171
+ device (`str` or `torch.device`, *optional*):
172
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
173
+ timesteps (`List[int]`, *optional*):
174
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
175
+ `num_inference_steps` and `sigmas` must be `None`.
176
+ sigmas (`List[float]`, *optional*):
177
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
178
+ `num_inference_steps` and `timesteps` must be `None`.
179
+
180
+ Returns:
181
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
182
+ second element is the number of inference steps.
183
+ """
184
+ if timesteps is not None and sigmas is not None:
185
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
186
+ if timesteps is not None:
187
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
188
+ if not accepts_timesteps:
189
+ raise ValueError(
190
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
191
+ f" timestep schedules. Please check whether you are using the correct scheduler."
192
+ )
193
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
194
+ timesteps = scheduler.timesteps
195
+ num_inference_steps = len(timesteps)
196
+ elif sigmas is not None:
197
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
198
+ if not accept_sigmas:
199
+ raise ValueError(
200
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
201
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
202
+ )
203
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
204
+ timesteps = scheduler.timesteps
205
+ num_inference_steps = len(timesteps)
206
+ else:
207
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
208
+ timesteps = scheduler.timesteps
209
+ return timesteps, num_inference_steps
210
+
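A hedged usage sketch of the helper above (with `retrieve_timesteps` in scope), assuming a recent `diffusers` release whose `FlowMatchEulerDiscreteScheduler.set_timesteps` accepts `sigmas` and `mu`; the numeric values are placeholders:

```python
import numpy as np
from diffusers import FlowMatchEulerDiscreteScheduler

# Dynamic shifting must be enabled for the `mu` kwarg to take effect.
scheduler = FlowMatchEulerDiscreteScheduler(use_dynamic_shifting=True)

num_inference_steps = 30
sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
mu = 0.632  # e.g. the output of calculate_shift for a 1024-token latent sequence

timesteps, num_inference_steps = retrieve_timesteps(
    scheduler, num_inference_steps, device="cpu", sigmas=sigmas, mu=mu
)
print(num_inference_steps, timesteps[:3])
```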
211
+
212
+
213
+ class OneDiffusionPipeline(DiffusionPipeline):
214
+ r"""
215
+ Pipeline for text-to-image generation using OneDiffuser.
216
+
217
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
218
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
219
+
220
+ Args:
221
+ transformer ([`NextDiT`]):
222
+ Conditional transformer (NextDiT) architecture to denoise the encoded image latents.
223
+ vae ([`AutoencoderKL`]):
224
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
225
+ text_encoder ([`T5EncoderModel`]):
226
+ Frozen text-encoder. OneDiffuser uses the T5 model as text encoder.
227
+ tokenizer (`T5Tokenizer`):
228
+ Tokenizer of class T5Tokenizer.
229
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
230
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
231
+ """
232
+
233
+ def __init__(
234
+ self,
235
+ transformer: NextDiT,
236
+ vae: AutoencoderKL,
237
+ text_encoder: T5EncoderModel,
238
+ tokenizer: T5Tokenizer,
239
+ scheduler: FlowMatchEulerDiscreteScheduler,
240
+ ):
241
+ super().__init__()
242
+ self.register_modules(
243
+ transformer=transformer,
244
+ vae=vae,
245
+ text_encoder=text_encoder,
246
+ tokenizer=tokenizer,
247
+ scheduler=scheduler,
248
+ )
249
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
250
+ self.image_processor = VaeImageProcessorOneDiffuser(vae_scale_factor=self.vae_scale_factor)
251
+
252
+ def enable_vae_slicing(self):
253
+ self.vae.enable_slicing()
254
+
255
+ def disable_vae_slicing(self):
256
+ self.vae.disable_slicing()
257
+
258
+ def enable_sequential_cpu_offload(self, gpu_id=0):
259
+ if is_accelerate_available():
260
+ from accelerate import cpu_offload
261
+ else:
262
+ raise ImportError("Please install accelerate via `pip install accelerate`")
263
+
264
+ device = torch.device(f"cuda:{gpu_id}")
265
+
266
+ for cpu_offloaded_model in [self.transformer, self.text_encoder, self.vae]:
267
+ if cpu_offloaded_model is not None:
268
+ cpu_offload(cpu_offloaded_model, device)
269
+
270
+ @property
271
+ def _execution_device(self):
272
+ if self.device != torch.device("meta") or not hasattr(self.transformer, "_hf_hook"):
273
+ return self.device
274
+ for module in self.transformer.modules():
275
+ if (
276
+ hasattr(module, "_hf_hook")
277
+ and hasattr(module._hf_hook, "execution_device")
278
+ and module._hf_hook.execution_device is not None
279
+ ):
280
+ return torch.device(module._hf_hook.execution_device)
281
+ return self.device
282
+
283
+ def encode_prompt(
284
+ self,
285
+ prompt,
286
+ device,
287
+ num_images_per_prompt,
288
+ do_classifier_free_guidance,
289
+ negative_prompt=None,
290
+ max_length=300,
291
+ ):
292
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
293
+
294
+ text_inputs = self.tokenizer(
295
+ prompt,
296
+ padding="max_length",
297
+ max_length=max_length,
298
+ truncation=True,
299
+ add_special_tokens=True,
300
+ return_tensors="pt",
301
+ )
302
+ text_input_ids = text_inputs.input_ids
303
+ attention_mask = text_inputs.attention_mask
304
+
305
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
306
+
307
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
308
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
309
+ logger.warning(
310
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
311
+ f" {max_length} tokens: {removed_text}"
312
+ )
313
+
314
+ text_encoder_output = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask.to(device))
315
+ prompt_embeds = text_encoder_output[0].to(torch.float32)
316
+
317
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
318
+ bs_embed, seq_len, _ = prompt_embeds.shape
319
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
320
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
321
+
322
+ # duplicate attention mask for each generation per prompt
323
+ attention_mask = attention_mask.repeat(1, num_images_per_prompt)
324
+ attention_mask = attention_mask.view(bs_embed * num_images_per_prompt, -1)
325
+
326
+ # get unconditional embeddings for classifier free guidance
327
+ if do_classifier_free_guidance:
328
+ uncond_tokens: List[str]
329
+ if negative_prompt is None:
330
+ uncond_tokens = [""] * batch_size
331
+ elif type(prompt) is not type(negative_prompt):
332
+ raise TypeError(
333
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
334
+ f" {type(prompt)}."
335
+ )
336
+ elif isinstance(negative_prompt, str):
337
+ uncond_tokens = [negative_prompt]
338
+ elif batch_size != len(negative_prompt):
339
+ raise ValueError(
340
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
341
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
342
+ " the batch size of `prompt`."
343
+ )
344
+ else:
345
+ uncond_tokens = negative_prompt
346
+
347
+ max_length = text_input_ids.shape[-1]
348
+ uncond_input = self.tokenizer(
349
+ uncond_tokens,
350
+ padding="max_length",
351
+ max_length=max_length,
352
+ truncation=True,
353
+ return_tensors="pt",
354
+ )
355
+
356
+ uncond_encoder_output = self.text_encoder(uncond_input.input_ids.to(device), attention_mask=uncond_input.attention_mask.to(device))
357
+ negative_prompt_embeds = uncond_encoder_output[0].to(torch.float32)
358
+
359
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
360
+ seq_len = negative_prompt_embeds.shape[1]
361
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
362
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
363
+
364
+ # duplicate unconditional attention mask for each generation per prompt
365
+ uncond_attention_mask = uncond_input.attention_mask.repeat(1, num_images_per_prompt)
366
+ uncond_attention_mask = uncond_attention_mask.view(batch_size * num_images_per_prompt, -1)
367
+
368
+ # For classifier free guidance, we need to do two forward passes.
369
+ # Here we concatenate the unconditional and text embeddings into a single batch
370
+ # to avoid doing two forward passes
371
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
372
+ attention_mask = torch.cat([uncond_attention_mask, attention_mask])
373
+
374
+ return prompt_embeds.to(device), attention_mask.to(device)
375
+
376
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
377
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
378
+ if isinstance(generator, list) and len(generator) != batch_size:
379
+ raise ValueError(
380
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
381
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
382
+ )
383
+
384
+ if latents is None:
385
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
386
+ else:
387
+ latents = latents.to(device)
388
+
389
+ # scale the initial noise by the standard deviation required by the scheduler
390
+ latents = latents * self.scheduler.init_noise_sigma
391
+ return latents
392
+
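The shape bookkeeping above is the usual diffusers pattern: one latent tensor per requested image, with the spatial dimensions divided by the VAE scale factor. A small standalone sketch; the channel count and scale factor mirror this repo's defaults but are assumptions here:

```python
import torch

batch_size, num_images_per_prompt = 1, 2
num_channels_latents = 16      # NextDiT in_channels default in this repo
vae_scale_factor = 8           # 2 ** (len(vae.config.block_out_channels) - 1) for the usual 4-block VAE
height, width = 1024, 768

shape = (
    batch_size * num_images_per_prompt,
    num_channels_latents,
    height // vae_scale_factor,  # 128
    width // vae_scale_factor,   # 96
)
latents = torch.randn(shape)
print(latents.shape)             # torch.Size([2, 16, 128, 96])
```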
393
+ @torch.no_grad()
394
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
395
+ def __call__(
396
+ self,
397
+ prompt: Union[str, List[str]] = None,
398
+ height: Optional[int] = None,
399
+ width: Optional[int] = None,
400
+ num_inference_steps: int = 50,
401
+ guidance_scale: float = 5.0,
402
+ negative_prompt: Optional[Union[str, List[str]]] = None,
403
+ num_images_per_prompt: Optional[int] = 1,
404
+ eta: float = 0.0,
405
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
406
+ latents: Optional[torch.FloatTensor] = None,
407
+ output_type: Optional[str] = "pil",
408
+ return_dict: bool = True,
409
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
410
+ callback_steps: int = 1,
411
+ forward_kwargs: Optional[Dict[str, Any]] = {},
412
+ **kwargs,
413
+ ):
414
+ r"""
415
+ Function invoked when calling the pipeline for generation.
416
+
417
+ Args:
418
+ prompt (`str` or `List[str]`, *optional*):
419
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
420
+ height (`int`, *optional*, defaults to `self.transformer.config.input_size[-2] * 8`):
421
+ The height in pixels of the generated image.
422
+ width (`int`, *optional*, defaults to `self.transformer.config.input_size[-1] * 8`):
423
+ The width in pixels of the generated image.
424
+ num_inference_steps (`int`, *optional*, defaults to 50):
425
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
426
+ expense of slower inference.
427
+ guidance_scale (`float`, *optional*, defaults to 5.0):
428
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
429
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
430
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
431
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
432
+ usually at the expense of lower image quality.
433
+ negative_prompt (`str` or `List[str]`, *optional*):
434
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
435
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
436
+ less than `1`).
437
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
438
+ The number of images to generate per prompt.
439
+ eta (`float`, *optional*, defaults to 0.0):
440
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
441
+ [`schedulers.DDIMScheduler`], will be ignored for others.
442
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
443
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
444
+ to make generation deterministic.
445
+ latents (`torch.FloatTensor`, *optional*):
446
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
447
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
448
+ tensor will be generated by sampling using the supplied random `generator`.
449
+ output_type (`str`, *optional*, defaults to `"pil"`):
450
+ The output format of the generated image. Choose between
451
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
452
+ return_dict (`bool`, *optional*, defaults to `True`):
453
+ Whether or not to return an [`OneDiffusionPipelineOutput`] instead of a
454
+ plain tuple.
455
+ callback (`Callable`, *optional*):
456
+ A function that will be called every `callback_steps` steps during inference. The function will be
457
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
458
+ callback_steps (`int`, *optional*, defaults to 1):
459
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
460
+ called at every step.
461
+
462
+ Examples:
463
+
464
+ Returns:
465
+ [`OneDiffusionPipelineOutput`] or `tuple`:
466
+ [`OneDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple` whose first element is a list
467
+ with the generated images and whose second element is `None`.
470
+ """
471
+ height = height or self.transformer.config.input_size[-2] * 8 # TODO: Hardcoded downscale factor of vae
472
+ width = width or self.transformer.config.input_size[-1] * 8
473
+
474
+ # check inputs. Raise error if not correct
475
+ self.check_inputs(prompt, height, width, callback_steps)
476
+
477
+ # define call parameters
478
+ batch_size = 1 if isinstance(prompt, str) else len(prompt)
479
+ device = self._execution_device
480
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
481
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf
482
+ do_classifier_free_guidance = guidance_scale > 1.0
483
+
484
+ encoder_hidden_states, encoder_attention_mask = self.encode_prompt(
485
+ prompt,
486
+ device,
487
+ num_images_per_prompt,
488
+ do_classifier_free_guidance,
489
+ negative_prompt,
490
+ )
491
+
492
+ # set timesteps
493
+ # # self.scheduler.set_timesteps(num_inference_steps, device=device)
494
+ # timesteps = self.scheduler.timesteps
495
+ timesteps = None
496
+
497
+ # prepare latent variables
498
+ num_channels_latents = self.transformer.config.in_channels
499
+ latents = self.prepare_latents(
500
+ batch_size * num_images_per_prompt,
501
+ num_channels_latents,
502
+ height,
503
+ width,
504
+ self.dtype,
505
+ device,
506
+ generator,
507
+ latents,
508
+ )
509
+
510
+ # prepare extra step kwargs
511
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
512
+
513
+ # 5. Prepare timesteps
514
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
515
+ image_seq_len = latents.shape[-1] * latents.shape[-2] / self.transformer.config.patch_size[-1] / self.transformer.config.patch_size[-2]
516
+ mu = calculate_shift(
517
+ image_seq_len,
518
+ self.scheduler.config.base_image_seq_len,
519
+ self.scheduler.config.max_image_seq_len,
520
+ self.scheduler.config.base_shift,
521
+ self.scheduler.config.max_shift,
522
+ )
523
+ timesteps, num_inference_steps = retrieve_timesteps(
524
+ self.scheduler,
525
+ num_inference_steps,
526
+ device,
527
+ timesteps,
528
+ sigmas,
529
+ mu=mu,
530
+ )
531
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
532
+ self._num_timesteps = len(timesteps)
533
+
534
+ # denoising loop
536
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
537
+ for i, t in enumerate(timesteps):
538
+ # expand the latents if we are doing classifier free guidance
539
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
540
+ # latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
541
+
542
+ # predict the noise residual
543
+ noise_pred = self.transformer(
544
+ samples=latent_model_input.to(self.dtype),
545
+ timesteps=torch.tensor([t] * latent_model_input.shape[0], device=device),
546
+ encoder_hidden_states=encoder_hidden_states.to(self.dtype),
547
+ encoder_attention_mask=encoder_attention_mask,
548
+ **forward_kwargs
549
+ )
550
+
551
+ # perform guidance
552
+ if do_classifier_free_guidance:
553
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
554
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
555
+
556
+ # compute the previous noisy sample x_t -> x_t-1
557
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
558
+
559
+ # call the callback, if provided
560
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
561
+ progress_bar.update()
562
+ if callback is not None and i % callback_steps == 0:
563
+ callback(i, t, latents)
564
+
565
+ # scale and decode the image latents with vae
566
+ latents = 1 / self.vae.config.scaling_factor * latents
567
+ if latents.ndim == 5:
568
+ latents = latents.squeeze(1)
569
+ image = self.vae.decode(latents.to(self.vae.dtype)).sample
570
+
571
+ image = (image / 2 + 0.5).clamp(0, 1)
572
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
573
+
574
+ if output_type == "pil":
575
+ image = self.numpy_to_pil(image)
576
+
577
+ if not return_dict:
578
+ return (image, None)
579
+
580
+ return OneDiffusionPipelineOutput(images=image)
581
+
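A hedged end-to-end sketch of the text-to-image path above. The checkpoint path is a placeholder, and any task-specific prompt formatting the released weights may expect is not shown here:

```python
import torch

pipe = OneDiffusionPipeline.from_pretrained("path/to/onediffusion-checkpoint")  # placeholder path
pipe = pipe.to("cuda")

result = pipe(
    prompt="a photo of a red vintage car parked by the sea",
    height=512,
    width=512,
    num_inference_steps=50,
    guidance_scale=5.0,
    num_images_per_prompt=1,
)
result.images[0].save("sample.png")
```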
582
+ @torch.no_grad()
583
+ def img2img(
584
+ self,
585
+ prompt: Union[str, List[str]] = None,
586
+ image: Union[PIL.Image.Image, List[PIL.Image.Image]] = None,
587
+ height: Optional[int] = None,
588
+ width: Optional[int] = None,
589
+ num_inference_steps: int = 50,
590
+ guidance_scale: float = 5.0,
591
+ denoise_mask: Optional[List[int]] = [1, 0],
592
+ negative_prompt: Optional[Union[str, List[str]]] = None,
593
+ num_images_per_prompt: Optional[int] = 1,
594
+ eta: float = 0.0,
595
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
596
+ latents: Optional[torch.FloatTensor] = None,
597
+ output_type: Optional[str] = "pil",
598
+ return_dict: bool = True,
599
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
600
+ callback_steps: int = 1,
601
+ do_crop: bool = True,
602
+ is_multiview: bool = False,
603
+ multiview_azimuths: Optional[List[int]] = [0, 30, 60, 90],
604
+ multiview_elevations: Optional[List[int]] = [0, 0, 0, 0],
605
+ multiview_distances: float = 1.7,
606
+ multiview_c2ws: Optional[List[torch.Tensor]] = None,
607
+ multiview_intrinsics: Optional[torch.Tensor] = None,
608
+ multiview_focal_length: float = 1.3887,
609
+ forward_kwargs: Optional[Dict[str, Any]] = {},
610
+ noise_scale: float = 1.0,
611
+ **kwargs,
612
+ ):
613
+ # Convert single image to list for consistent handling
614
+ if isinstance(image, PIL.Image.Image):
615
+ image = [image]
616
+
617
+ if height is None or width is None:
618
+ closest_ar = get_closest_ratio(height=image[0].size[1], width=image[0].size[0], ratios=ASPECT_RATIO_512)
619
+ height, width = int(closest_ar[0][0]), int(closest_ar[0][1])
620
+
621
+ if not isinstance(multiview_distances, list) and not isinstance(multiview_distances, tuple):
622
+ multiview_distances = [multiview_distances] * len(multiview_azimuths)
623
+
624
+ # height = height or self.transformer.config.input_size[-2] * 8 # TODO: Hardcoded downscale factor of vae
625
+ # width = width or self.transformer.config.input_size[-1] * 8
626
+
627
+ # 1. check inputs. Raise error if not correct
628
+ self.check_inputs(prompt, height, width, callback_steps)
629
+
630
+ # Additional input validation for image list
631
+ if not all(isinstance(img, PIL.Image.Image) for img in image):
632
+ raise ValueError("All elements in image list must be PIL.Image objects")
633
+
634
+ # 2. define call parameters
635
+ batch_size = 1 if isinstance(prompt, str) else len(prompt)
636
+ device = self._execution_device
637
+ do_classifier_free_guidance = guidance_scale > 1.0
638
+
639
+ # 3. Encode input prompt
640
+ encoder_hidden_states, encoder_attention_mask = self.encode_prompt(
641
+ prompt,
642
+ device,
643
+ num_images_per_prompt,
644
+ do_classifier_free_guidance,
645
+ negative_prompt,
646
+ )
647
+
648
+ # 4. Preprocess all images
649
+ if image is not None and len(image) > 0:
650
+ processed_image = self.image_processor.preprocess(image, height=height, width=width, do_crop=do_crop)
651
+ else:
652
+ processed_image = None
653
+
654
+ # # Stack processed images along the sequence dimension
655
+ # if len(processed_images) > 1:
656
+ # processed_image = torch.cat(processed_images, dim=0)
657
+ # else:
658
+ # processed_image = processed_images[0]
659
+
660
+ timesteps = None
661
+
662
+ # 6. prepare latent variables
663
+ num_channels_latents = self.transformer.config.in_channels
664
+ if processed_image is not None:
665
+ cond_latents = self.prepare_latents(
666
+ batch_size * num_images_per_prompt,
667
+ num_channels_latents,
668
+ height,
669
+ width,
670
+ self.dtype,
671
+ device,
672
+ generator,
673
+ latents,
674
+ image=processed_image,
675
+ )
676
+ else:
677
+ cond_latents = None
678
+
679
+ # 7. prepare extra step kwargs
680
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
681
+ denoise_mask = torch.tensor(denoise_mask, device=device)
682
+ denoise_indices = torch.where(denoise_mask == 1)[0]
683
+ cond_indices = torch.where(denoise_mask == 0)[0]
684
+ seq_length = denoise_mask.shape[0]
685
+
686
+ latents = self.prepare_init_latents(
687
+ batch_size * num_images_per_prompt,
688
+ seq_length,
689
+ num_channels_latents,
690
+ height,
691
+ width,
692
+ self.dtype,
693
+ device,
694
+ generator,
695
+ )
696
+
697
+ # 5. Prepare timesteps
698
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
699
+ # image_seq_len = latents.shape[1] * latents.shape[-1] * latents.shape[-2] / self.transformer.config.patch_size[-1] / self.transformer.config.patch_size[-2]
700
+ image_seq_len = noise_scale * sum(denoise_mask) * latents.shape[-1] * latents.shape[-2] / self.transformer.config.patch_size[-1] / self.transformer.config.patch_size[-2]
701
+ # image_seq_len = 256
702
+ mu = calculate_shift(
703
+ image_seq_len,
704
+ self.scheduler.config.base_image_seq_len,
705
+ self.scheduler.config.max_image_seq_len,
706
+ self.scheduler.config.base_shift,
707
+ self.scheduler.config.max_shift,
708
+ )
709
+ timesteps, num_inference_steps = retrieve_timesteps(
710
+ self.scheduler,
711
+ num_inference_steps,
712
+ device,
713
+ timesteps,
714
+ sigmas,
715
+ mu=mu,
716
+ )
717
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
718
+ self._num_timesteps = len(timesteps)
719
+
720
+ if is_multiview:
721
+ cond_indices_images = [index // 2 for index in cond_indices if index % 2 == 0]
722
+ cond_indices_rays = [index // 2 for index in cond_indices if index % 2 == 1]
723
+
724
+ multiview_elevations = [element for element in multiview_elevations if element is not None]
725
+ multiview_azimuths = [element for element in multiview_azimuths if element is not None]
726
+ multiview_distances = [element for element in multiview_distances if element is not None]
727
+
728
+ if multiview_c2ws is None:
729
+ multiview_c2ws = [
730
+ torch.tensor(create_c2w_matrix(azimuth, elevation, distance)) for azimuth, elevation, distance in zip(multiview_azimuths, multiview_elevations, multiview_distances)
731
+ ]
732
+ c2ws = torch.stack(multiview_c2ws).float()
733
+ else:
734
+ c2ws = torch.Tensor(multiview_c2ws).float()
735
+
736
+ c2ws[:, 0:3, 1:3] *= -1
737
+ c2ws = c2ws[:, [1, 0, 2, 3], :]
738
+ c2ws[:, 2, :] *= -1
739
+
740
+ w2cs = torch.inverse(c2ws)
741
+ if multiview_intrinsics is None:
742
+ multiview_intrinsics = torch.Tensor([[[multiview_focal_length, 0, 0.5], [0, multiview_focal_length, 0.5], [0, 0, 1]]]).repeat(c2ws.shape[0], 1, 1)
743
+ K = multiview_intrinsics
744
+ Rs = w2cs[:, :3, :3]
745
+ Ts = w2cs[:, :3, 3]
746
+ sizes = torch.Tensor([[1, 1]]).repeat(c2ws.shape[0], 1)
747
+
748
+ assert height == width
749
+ cond_rays = calculate_rays(K, sizes, Rs, Ts, height // 8)
750
+ cond_rays = cond_rays.reshape(-1, height // 8, width // 8, 6)
751
+ # padding = (0, 10)
752
+ # cond_rays = torch.nn.functional.pad(cond_rays, padding, "constant", 0)
753
+ cond_rays = torch.cat([cond_rays, cond_rays, cond_rays[..., :4]], dim=-1) * 1.658
754
+ cond_rays = cond_rays[None].repeat(batch_size * num_images_per_prompt, 1, 1, 1, 1)
755
+ cond_rays = cond_rays.permute(0, 1, 4, 2, 3)
756
+ cond_rays = cond_rays.to(device, dtype=self.dtype)
757
+
758
+ latents = einops.rearrange(latents, "b (f n) c h w -> b f n c h w", n=2)
759
+ if cond_latents is not None:
760
+ latents[:, cond_indices_images, 0] = cond_latents
761
+ latents[:, cond_indices_rays, 1] = cond_rays
762
+ latents = einops.rearrange(latents, "b f n c h w -> b (f n) c h w")
763
+ else:
764
+ if cond_latents is not None:
765
+ latents[:, cond_indices] = cond_latents
766
+
767
+ # denoising loop
769
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
770
+ for i, t in enumerate(timesteps):
771
+ # expand the latents if we are doing classifier free guidance
772
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
773
+ input_t = torch.broadcast_to(einops.repeat(torch.Tensor([t]).to(device), "1 -> 1 f 1 1 1", f=latent_model_input.shape[1]), latent_model_input.shape).clone()
774
+
775
+ if is_multiview:
776
+ input_t = einops.rearrange(input_t, "b (f n) c h w -> b f n c h w", n=2)
777
+ input_t[:, cond_indices_images, 0] = self.scheduler.timesteps[-1]
778
+ input_t[:, cond_indices_rays, 1] = self.scheduler.timesteps[-1]
779
+ input_t = einops.rearrange(input_t, "b f n c h w -> b (f n) c h w")
780
+ else:
781
+ input_t[:, cond_indices] = self.scheduler.timesteps[-1]
782
+
783
+ # predict the noise residual
784
+ noise_pred = self.transformer(
785
+ samples=latent_model_input.to(self.dtype),
786
+ timesteps=input_t,
787
+ encoder_hidden_states=encoder_hidden_states.to(self.dtype),
788
+ encoder_attention_mask=encoder_attention_mask,
789
+ **forward_kwargs
790
+ )
791
+
792
+ # perform guidance
793
+ if do_classifier_free_guidance:
794
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
795
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
796
+
797
+ # compute the previous noisy sample x_t -> x_t-1
798
+ bs, n_frame = noise_pred.shape[:2]
799
+ noise_pred = einops.rearrange(noise_pred, "b f c h w -> (b f) c h w")
800
+ latents = einops.rearrange(latents, "b f c h w -> (b f) c h w")
801
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
802
+ latents = einops.rearrange(latents, "(b f) c h w -> b f c h w", b=bs, f=n_frame)
803
+ if is_multiview:
804
+ latents = einops.rearrange(latents, "b (f n) c h w -> b f n c h w", n=2)
805
+ if cond_latents is not None:
806
+ latents[:, cond_indices_images, 0] = cond_latents
807
+ latents[:, cond_indices_rays, 1] = cond_rays
808
+ latents = einops.rearrange(latents, "b f n c h w -> b (f n) c h w")
809
+ else:
810
+ if cond_latents is not None:
811
+ latents[:, cond_indices] = cond_latents
812
+
813
+ # call the callback, if provided
814
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
815
+ progress_bar.update()
816
+ if callback is not None and i % callback_steps == 0:
817
+ callback(i, t, latents)
818
+
819
+ decoded_latents = latents / 1.658
820
+ # scale and decode the image latents with vae
821
+ latents = 1 / self.vae.config.scaling_factor * latents
822
+ if latents.ndim == 5:
823
+ latents = latents[:, denoise_indices]
824
+ latents = einops.rearrange(latents, "b f c h w -> (b f) c h w")
825
+ image = self.vae.decode(latents.to(self.vae.dtype)).sample
826
+
827
+ image = (image / 2 + 0.5).clamp(0, 1)
828
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
829
+
830
+ if output_type == "pil":
831
+ image = self.numpy_to_pil(image)
832
+
833
+ if not return_dict:
834
+ return (image, None)
835
+
836
+ return OneDiffusionPipelineOutput(images=image, latents=decoded_latents)
837
+
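A hedged sketch of the image-conditioned path, reusing the `pipe` object from the previous snippet. `denoise_mask` has one entry per latent slot: `1` marks a slot to be denoised, `0` marks a slot held fixed to the encoded conditioning image (the file path and prompt are placeholders):

```python
import PIL.Image

cond_image = PIL.Image.open("input.jpg").convert("RGB")  # placeholder input image

result = pipe.img2img(
    prompt="a watercolor painting of the same scene",
    image=[cond_image],
    denoise_mask=[1, 0],      # slot 0: generate, slot 1: conditioning image
    num_inference_steps=50,
    guidance_scale=5.0,
)
result.images[0].save("edited.png")
```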
838
+ def prepare_extra_step_kwargs(self, generator, eta):
839
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
840
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
841
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
842
+ # and should be between [0, 1]
843
+
844
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
845
+ extra_step_kwargs = {}
846
+ if accepts_eta:
847
+ extra_step_kwargs["eta"] = eta
848
+
849
+ # check if the scheduler accepts generator
850
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
851
+ if accepts_generator:
852
+ extra_step_kwargs["generator"] = generator
853
+ return extra_step_kwargs
854
+
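The signature introspection above can be sanity-checked in isolation; a toy sketch (the scheduler class below is made up for illustration):

```python
import inspect

class ToyScheduler:
    # Accepts `generator` but not `eta`, roughly mirroring a flow-matching scheduler's `step`.
    def step(self, model_output, timestep, sample, generator=None):
        return sample

params = inspect.signature(ToyScheduler.step).parameters
print("eta" in params, "generator" in params)  # False True
```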
855
+ def check_inputs(self, prompt, height, width, callback_steps):
856
+ if not isinstance(prompt, str) and not isinstance(prompt, list):
857
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
858
+
859
+ if height % 16 != 0 or width % 16 != 0:
860
+ raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
861
+
862
+ if (callback_steps is None) or (
863
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
864
+ ):
865
+ raise ValueError(
866
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
867
+ f" {type(callback_steps)}."
868
+ )
869
+
870
+ def get_timesteps(self, num_inference_steps, strength, device):
871
+ # get the original timestep using init_timestep
872
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
873
+
874
+ t_start = max(num_inference_steps - init_timestep, 0)
875
+ timesteps = self.scheduler.timesteps[t_start:]
876
+
877
+ return timesteps, num_inference_steps - t_start
878
+
879
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None, image=None):
880
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
881
+ if isinstance(generator, list) and len(generator) != batch_size:
882
+ raise ValueError(
883
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
884
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
885
+ )
886
+
887
+ if latents is None:
888
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
889
+ else:
890
+ latents = latents.to(device)
891
+
892
+ if image is None:
893
+ # scale the initial noise by the standard deviation required by the scheduler
894
+ # latents = latents * self.scheduler.init_noise_sigma
895
+ return latents
896
+
897
+ image = image.to(device=device, dtype=dtype)
898
+
899
+ if isinstance(generator, list) and len(generator) != batch_size:
900
+ raise ValueError(
901
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
902
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
903
+ )
904
+ elif isinstance(generator, list):
905
+ if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
906
+ image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
907
+ elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
908
+ raise ValueError(
909
+ f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
910
+ )
911
+ init_latents = [
912
+ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
913
+ for i in range(batch_size)
914
+ ]
915
+ init_latents = torch.cat(init_latents, dim=0)
916
+ else:
917
+ init_latents = retrieve_latents(self.vae.encode(image.to(self.vae.dtype)), generator=generator)
918
+
919
+ init_latents = self.vae.config.scaling_factor * init_latents
920
+ init_latents = init_latents.to(device=device, dtype=dtype)
921
+
922
+ init_latents = einops.rearrange(init_latents, "(bs views) c h w -> bs views c h w", bs=batch_size, views=init_latents.shape[0]//batch_size)
923
+ # latents = einops.rearrange(latents, "b c h w -> b 1 c h w")
924
+ # latents = torch.concat([latents, init_latents], dim=1)
925
+ return init_latents
926
+
927
+ def prepare_init_latents(self, batch_size, seq_length, num_channels_latents, height, width, dtype, device, generator, latents=None):
928
+ shape = (batch_size, seq_length, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
929
+ if isinstance(generator, list) and len(generator) != batch_size:
930
+ raise ValueError(
931
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
932
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
933
+ )
934
+
935
+ if latents is None:
936
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
937
+ else:
938
+ latents = latents.to(device)
939
+
940
+ return latents
941
+
942
+ @torch.no_grad()
943
+ def generate(
944
+ self,
945
+ prompt: Union[str, List[str]],
946
+ num_inference_steps: int = 50,
947
+ guidance_scale: float = 5.0,
948
+ negative_prompt: Optional[Union[str, List[str]]] = None,
949
+ num_images_per_prompt: Optional[int] = 1,
950
+ height: Optional[int] = None,
951
+ width: Optional[int] = None,
952
+ eta: float = 0.0,
953
+ generator: Optional[torch.Generator] = None,
954
+ latents: Optional[torch.FloatTensor] = None,
955
+ output_type: Optional[str] = "pil",
956
+ return_dict: bool = True,
957
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
958
+ callback_steps: Optional[int] = 1,
959
+ ):
960
+ """
961
+ Function for image generation using the OneDiffusionPipeline.
962
+ """
963
+ return self(
964
+ prompt=prompt,
965
+ num_inference_steps=num_inference_steps,
966
+ guidance_scale=guidance_scale,
967
+ negative_prompt=negative_prompt,
968
+ num_images_per_prompt=num_images_per_prompt,
969
+ height=height,
970
+ width=width,
971
+ eta=eta,
972
+ generator=generator,
973
+ latents=latents,
974
+ output_type=output_type,
975
+ return_dict=return_dict,
976
+ callback=callback,
977
+ callback_steps=callback_steps,
978
+ )
979
+
980
+ @staticmethod
981
+ def numpy_to_pil(images):
982
+ """
983
+ Convert a numpy image or a batch of images to a PIL image.
984
+ """
985
+ if images.ndim == 3:
986
+ images = images[None, ...]
987
+ images = (images * 255).round().astype("uint8")
988
+ if images.shape[-1] == 1:
989
+ # special case for grayscale (single channel) images
990
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
991
+ else:
992
+ pil_images = [Image.fromarray(image) for image in images]
993
+
994
+ return pil_images
995
+
996
+ @classmethod
997
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
998
+ model_path = pretrained_model_name_or_path
999
+ cache_dir = kwargs.pop("cache_dir", None)
1000
+ force_download = kwargs.pop("force_download", False)
1001
+ proxies = kwargs.pop("proxies", None)
1002
+ local_files_only = kwargs.pop("local_files_only", None)
1003
+ token = kwargs.pop("token", None)
1004
+ revision = kwargs.pop("revision", None)
1005
+ from_flax = kwargs.pop("from_flax", False)
1006
+ torch_dtype = kwargs.pop("torch_dtype", None)
1007
+ custom_pipeline = kwargs.pop("custom_pipeline", None)
1008
+ custom_revision = kwargs.pop("custom_revision", None)
1009
+ provider = kwargs.pop("provider", None)
1010
+ sess_options = kwargs.pop("sess_options", None)
1011
+ device_map = kwargs.pop("device_map", None)
1012
+ max_memory = kwargs.pop("max_memory", None)
1013
+ offload_folder = kwargs.pop("offload_folder", None)
1014
+ offload_state_dict = kwargs.pop("offload_state_dict", False)
1015
+ low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
1016
+ variant = kwargs.pop("variant", None)
1017
+ use_safetensors = kwargs.pop("use_safetensors", None)
1018
+ use_onnx = kwargs.pop("use_onnx", None)
1019
+ load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)
1020
+
1021
+ if low_cpu_mem_usage and not is_accelerate_available():
1022
+ low_cpu_mem_usage = False
1023
+ logger.warning(
1024
+ "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
1025
+ " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
1026
+ " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
1027
+ " install accelerate\n```\n."
1028
+ )
1029
+
1030
+ if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
1031
+ raise NotImplementedError(
1032
+ "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
1033
+ " `low_cpu_mem_usage=False`."
1034
+ )
1035
+
1036
+ if device_map is not None and not is_torch_version(">=", "1.9.0"):
1037
+ raise NotImplementedError(
1038
+ "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
1039
+ " `device_map=None`."
1040
+ )
1041
+
1042
+ if device_map is not None and not is_accelerate_available():
1043
+ raise NotImplementedError(
1044
+ "Using `device_map` requires the `accelerate` library. Please install it using: `pip install accelerate`."
1045
+ )
1046
+
1047
+ if device_map is not None and not isinstance(device_map, str):
1048
+ raise ValueError("`device_map` must be a string.")
1049
+
1050
+ if device_map is not None and device_map not in SUPPORTED_DEVICE_MAP:
1051
+ raise NotImplementedError(
1052
+ f"{device_map} not supported. Supported strategies are: {', '.join(SUPPORTED_DEVICE_MAP)}"
1053
+ )
1054
+
1055
+ if device_map is not None and device_map in SUPPORTED_DEVICE_MAP:
1056
+ if is_accelerate_version("<", "0.28.0"):
1057
+ raise NotImplementedError("Device placement requires `accelerate` version `0.28.0` or later.")
1058
+
1059
+ if low_cpu_mem_usage is False and device_map is not None:
1060
+ raise ValueError(
1061
+ f"You cannot set `low_cpu_mem_usage` to False while using device_map={device_map} for loading and"
1062
+ " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
1063
+ )
1064
+
1065
+ transformer = NextDiT.from_pretrained(f"{model_path}", subfolder="transformer", torch_dtype=torch.float32, cache_dir=cache_dir)
1066
+ vae = AutoencoderKL.from_pretrained(f"{model_path}", subfolder="vae", cache_dir=cache_dir)
1067
+ text_encoder = T5EncoderModel.from_pretrained(f"{model_path}", subfolder="text_encoder", torch_dtype=torch.float16, cache_dir=cache_dir)
1068
+ tokenizer = T5Tokenizer.from_pretrained(model_path, subfolder="tokenizer", cache_dir=cache_dir)
1069
+ scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler", cache_dir=cache_dir)
1070
+
1071
+ pipeline = cls(
1072
+ transformer=transformer,
1073
+ vae=vae,
1074
+ text_encoder=text_encoder,
1075
+ tokenizer=tokenizer,
1076
+ scheduler=scheduler,
1077
+ **kwargs
1078
+ )
1079
+
1080
+ return pipeline
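The custom `from_pretrained` above pulls each component from a fixed subfolder, so a compatible checkpoint directory would look roughly like the sketch below (layout inferred from the `subfolder` arguments; the top-level directory name is an assumption):

```python
# Expected diffusers-style layout for OneDiffusionPipeline.from_pretrained (sketch):
#
#   onediffusion-checkpoint/
#   |-- transformer/    # NextDiT weights + config (loaded in float32)
#   |-- vae/            # AutoencoderKL
#   |-- text_encoder/   # T5EncoderModel (loaded in float16)
#   |-- tokenizer/      # T5Tokenizer files
#   `-- scheduler/      # FlowMatchEulerDiscreteScheduler config
#
pipe = OneDiffusionPipeline.from_pretrained("onediffusion-checkpoint")
pipe.enable_vae_slicing()  # optional: lower VRAM use when decoding
```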
onediffusion/models/denoiser/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from . import (
2
+ nextdit
3
+ )
onediffusion/models/denoiser/nextdit/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .modeling_nextdit import NextDiT
onediffusion/models/denoiser/nextdit/layers.py ADDED
@@ -0,0 +1,132 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from typing import Callable, Optional
6
+
7
+ import warnings
8
+
11
+
12
+ try:
13
+ from apex.normalization import FusedRMSNorm as RMSNorm
14
+ except ImportError:
15
+ warnings.warn("Cannot import apex RMSNorm, switch to vanilla implementation")
16
+
17
+
18
+ class RMSNorm(torch.nn.Module):
19
+ def __init__(self, dim: int, eps: float = 1e-6):
20
+ """
21
+ Initialize the RMSNorm normalization layer.
22
+ Args:
23
+ dim (int): The dimension of the input tensor.
24
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
25
+ Attributes:
26
+ eps (float): A small value added to the denominator for numerical stability.
27
+ weight (nn.Parameter): Learnable scaling parameter.
28
+ """
29
+ super().__init__()
30
+ self.eps = eps
31
+ self.weight = nn.Parameter(torch.ones(dim))
32
+
33
+ def _norm(self, x):
34
+ """
35
+ Apply the RMSNorm normalization to the input tensor.
36
+ Args:
37
+ x (torch.Tensor): The input tensor.
38
+ Returns:
39
+ torch.Tensor: The normalized tensor.
40
+ """
41
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
42
+
43
+ def forward(self, x):
44
+ """
45
+ Forward pass through the RMSNorm layer.
46
+ Args:
47
+ x (torch.Tensor): The input tensor.
48
+ Returns:
49
+ torch.Tensor: The output tensor after applying RMSNorm.
50
+ """
51
+ output = self._norm(x.float()).type_as(x)
52
+ return output * self.weight
53
+
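A standalone sketch checking the vanilla normalization above against a hand-rolled computation (with the default all-ones weight the two should match):

```python
import torch

x = torch.randn(2, 5, 8)                              # (batch, seq, dim)
norm = RMSNorm(dim=8)                                 # the vanilla class defined above

manual = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
print(torch.allclose(norm(x), manual, atol=1e-6))     # True (weight is initialised to ones)
```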
54
+
55
+ def modulate(x, scale):
56
+ return x * (1 + scale.unsqueeze(1))
57
+
58
+ class LLamaFeedForward(nn.Module):
59
+ """
60
+ Corresponds to the FeedForward layer in Next DiT.
61
+ """
62
+ def __init__(
63
+ self,
64
+ dim: int,
65
+ hidden_dim: int,
66
+ multiple_of: int,
67
+ ffn_dim_multiplier: Optional[float] = None,
68
+ zeros_initialize: bool = True,
69
+ dtype: torch.dtype = torch.float32,
70
+ ):
71
+ super().__init__()
72
+ self.dim = dim
73
+ self.hidden_dim = hidden_dim
74
+ self.multiple_of = multiple_of
75
+ self.ffn_dim_multiplier = ffn_dim_multiplier
76
+ self.zeros_initialize = zeros_initialize
77
+ self.dtype = dtype
78
+
79
+ # Compute hidden_dim based on the given formula
80
+ hidden_dim_calculated = int(2 * self.hidden_dim / 3)
81
+ if self.ffn_dim_multiplier is not None:
82
+ hidden_dim_calculated = int(self.ffn_dim_multiplier * hidden_dim_calculated)
83
+ hidden_dim_calculated = self.multiple_of * ((hidden_dim_calculated + self.multiple_of - 1) // self.multiple_of)
84
+
85
+ # Define linear layers
86
+ self.w1 = nn.Linear(self.dim, hidden_dim_calculated, bias=False)
87
+ self.w2 = nn.Linear(hidden_dim_calculated, self.dim, bias=False)
88
+ self.w3 = nn.Linear(self.dim, hidden_dim_calculated, bias=False)
89
+
90
+ # Initialize weights
91
+ if self.zeros_initialize:
92
+ nn.init.zeros_(self.w2.weight)
93
+ else:
94
+ nn.init.xavier_uniform_(self.w2.weight)
95
+ nn.init.xavier_uniform_(self.w1.weight)
96
+ nn.init.xavier_uniform_(self.w3.weight)
97
+
98
+ def _forward_silu_gating(self, x1, x3):
99
+ return F.silu(x1) * x3
100
+
101
+ def forward(self, x):
102
+ return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
103
+
104
+ class FinalLayer(nn.Module):
105
+ """
106
+ The final layer of Next-DiT.
107
+ """
108
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
109
+ super().__init__()
110
+ self.hidden_size = hidden_size
111
+ self.patch_size = patch_size
112
+ self.out_channels = out_channels
113
+
114
+ # LayerNorm without learnable parameters (elementwise_affine=False)
115
+ self.norm_final = nn.LayerNorm(self.hidden_size, eps=1e-6, elementwise_affine=False)
116
+ self.linear = nn.Linear(self.hidden_size, np.prod(self.patch_size) * self.out_channels, bias=True)
117
+ nn.init.zeros_(self.linear.weight)
118
+ nn.init.zeros_(self.linear.bias)
119
+
120
+ self.adaLN_modulation = nn.Sequential(
121
+ nn.SiLU(),
122
+ nn.Linear(self.hidden_size, self.hidden_size),
123
+ )
124
+ # Initialize the last layer with zeros
125
+ nn.init.zeros_(self.adaLN_modulation[1].weight)
126
+ nn.init.zeros_(self.adaLN_modulation[1].bias)
127
+
128
+ def forward(self, x, c):
129
+ scale = self.adaLN_modulation(c)
130
+ x = modulate(self.norm_final(x), scale)
131
+ x = self.linear(x)
132
+ return x
onediffusion/models/denoiser/nextdit/modeling_nextdit.py ADDED
@@ -0,0 +1,571 @@
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import numpy as np
6
+ import einops
7
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
8
+ from diffusers.models.modeling_utils import ModelMixin
9
+ from typing import Any, Tuple, Optional
10
+ from flash_attn import flash_attn_varlen_func
11
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
12
+
13
+ from .layers import LLamaFeedForward, RMSNorm
14
+
15
+ # import frasch
16
+
17
+
18
+ def modulate(x, scale):
19
+ return x * (1 + scale)
20
+
21
+ class TimestepEmbedder(nn.Module):
22
+ """
23
+ Embeds scalar timesteps into vector representations.
24
+ """
25
+ def __init__(self, hidden_size, frequency_embedding_size=256):
26
+ super().__init__()
27
+ self.hidden_size = hidden_size
28
+ self.frequency_embedding_size = frequency_embedding_size
29
+ self.mlp = nn.Sequential(
30
+ nn.Linear(self.frequency_embedding_size, self.hidden_size),
31
+ nn.SiLU(),
32
+ nn.Linear(self.hidden_size, self.hidden_size),
33
+ )
34
+
35
+ @staticmethod
36
+ def timestep_embedding(t, dim, max_period=10000):
37
+ """
38
+ Create sinusoidal timestep embeddings.
39
+ :param t: a 1-D Tensor of N indices, one per batch element.
40
+ :param dim: the dimension of the output.
41
+ :param max_period: controls the minimum frequency of the embeddings.
42
+ :return: an (N, D) Tensor of positional embeddings.
43
+ """
44
+ half = dim // 2
45
+ freqs = torch.exp(
46
+ -np.log(max_period) * torch.arange(0, half, dtype=t.dtype) / half
47
+ ).to(t.device)
48
+ args = t[:, :, None] * freqs[None, :]
49
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
50
+ if dim % 2:
51
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :, :1])], dim=-1)
52
+ return embedding
53
+
54
+ def forward(self, t):
55
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
56
+ t_freq = t_freq.to(self.mlp[0].weight.dtype)
57
+ return self.mlp(t_freq)
58
+
59
+ class FinalLayer(nn.Module):
60
+ def __init__(self, hidden_size, num_patches, out_channels):
61
+ super().__init__()
62
+ self.norm_final = nn.LayerNorm(hidden_size, eps=1e-6, elementwise_affine=False)
63
+ self.linear = nn.Linear(hidden_size, num_patches * out_channels)
64
+ self.adaLN_modulation = nn.Sequential(
65
+ nn.SiLU(),
66
+ nn.Linear(min(hidden_size, 1024), hidden_size),
67
+ )
68
+
69
+ def forward(self, x, c):
70
+ scale = self.adaLN_modulation(c)
71
+ x = modulate(self.norm_final(x), scale)
72
+ x = self.linear(x)
73
+ return x
74
+
75
+ class Attention(nn.Module):
76
+ def __init__(
77
+ self,
78
+ dim,
79
+ n_heads,
80
+ n_kv_heads=None,
81
+ qk_norm=False,
82
+ y_dim=0,
83
+ base_seqlen=None,
84
+ proportional_attn=False,
85
+ attention_dropout=0.0,
86
+ max_position_embeddings=384,
87
+ ):
88
+ super().__init__()
89
+ self.dim = dim
90
+ self.n_heads = n_heads
91
+ self.n_kv_heads = n_kv_heads or n_heads
92
+ self.qk_norm = qk_norm
93
+ self.y_dim = y_dim
94
+ self.base_seqlen = base_seqlen
95
+ self.proportional_attn = proportional_attn
96
+ self.attention_dropout = attention_dropout
97
+ self.max_position_embeddings = max_position_embeddings
98
+
99
+ self.head_dim = dim // n_heads
100
+
101
+ self.wq = nn.Linear(dim, n_heads * self.head_dim, bias=False)
102
+ self.wk = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias=False)
103
+ self.wv = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias=False)
104
+
105
+ if y_dim > 0:
106
+ self.wk_y = nn.Linear(y_dim, self.n_kv_heads * self.head_dim, bias=False)
107
+ self.wv_y = nn.Linear(y_dim, self.n_kv_heads * self.head_dim, bias=False)
108
+ self.gate = nn.Parameter(torch.zeros(n_heads))
109
+
110
+ self.wo = nn.Linear(n_heads * self.head_dim, dim, bias=False)
111
+
112
+ if qk_norm:
113
+ self.q_norm = nn.LayerNorm(self.n_heads * self.head_dim)
114
+ self.k_norm = nn.LayerNorm(self.n_kv_heads * self.head_dim)
115
+ if y_dim > 0:
116
+ self.ky_norm = nn.LayerNorm(self.n_kv_heads * self.head_dim, eps=1e-6)
117
+ else:
118
+ self.ky_norm = nn.Identity()
119
+ else:
120
+ self.q_norm = nn.Identity()
121
+ self.k_norm = nn.Identity()
122
+ self.ky_norm = nn.Identity()
123
+
124
+
125
+ @staticmethod
126
+ def apply_rotary_emb(xq, xk, freqs_cis):
127
+ # xq, xk: [batch_size, seq_len, n_heads, head_dim]
128
+ # freqs_cis: [1, seq_len, 1, head_dim]
129
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 2)
130
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 2)
131
+
132
+ xq_complex = torch.view_as_complex(xq_)
133
+ xk_complex = torch.view_as_complex(xk_)
134
+
135
+ freqs_cis = freqs_cis.unsqueeze(2)
136
+
137
+ # Apply freqs_cis
138
+ xq_out = xq_complex * freqs_cis
139
+ xk_out = xk_complex * freqs_cis
140
+
141
+ # Convert back to real numbers
142
+ xq_out = torch.view_as_real(xq_out).flatten(-2)
143
+ xk_out = torch.view_as_real(xk_out).flatten(-2)
144
+
145
+ return xq_out.type_as(xq), xk_out.type_as(xk)
146
+
147
+ # copied from huggingface modeling_llama.py
148
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
149
+ def _get_unpad_data(attention_mask):
150
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
151
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
152
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
153
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
154
+ return (
155
+ indices,
156
+ cu_seqlens,
157
+ max_seqlen_in_batch,
158
+ )
159
+
160
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
161
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
162
+
163
+ key_layer = index_first_axis(
164
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
165
+ indices_k,
166
+ )
167
+ value_layer = index_first_axis(
168
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
169
+ indices_k,
170
+ )
171
+ if query_length == kv_seq_len:
172
+ query_layer = index_first_axis(
173
+ query_layer.reshape(batch_size * kv_seq_len, self.n_heads, head_dim),
174
+ indices_k,
175
+ )
176
+ cu_seqlens_q = cu_seqlens_k
177
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
178
+ indices_q = indices_k
179
+ elif query_length == 1:
180
+ max_seqlen_in_batch_q = 1
181
+ cu_seqlens_q = torch.arange(
182
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
183
+ ) # There is a memcpy here, that is very bad.
184
+ indices_q = cu_seqlens_q[:-1]
185
+ query_layer = query_layer.squeeze(1)
186
+ else:
187
+ # The -q_len: slice assumes left padding.
188
+ attention_mask = attention_mask[:, -query_length:]
189
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
190
+
191
+ return (
192
+ query_layer,
193
+ key_layer,
194
+ value_layer,
195
+ indices_q,
196
+ (cu_seqlens_q, cu_seqlens_k),
197
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
198
+ )
199
+
200
+ def forward(
201
+ self,
202
+ x,
203
+ x_mask,
204
+ freqs_cis,
205
+ y=None,
206
+ y_mask=None,
207
+ init_cache=False,
208
+ ):
209
+ bsz, seqlen, _ = x.size()
210
+ xq = self.wq(x)
211
+ xk = self.wk(x)
212
+ xv = self.wv(x)
213
+
214
+ if x_mask is None:
215
+ x_mask = torch.ones(bsz, seqlen, dtype=torch.bool, device=x.device)
216
+ inp_dtype = xq.dtype
217
+
218
+ xq = self.q_norm(xq)
219
+ xk = self.k_norm(xk)
220
+
221
+ xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim)
222
+ xk = xk.view(bsz, seqlen, self.n_kv_heads, self.head_dim)
223
+ xv = xv.view(bsz, seqlen, self.n_kv_heads, self.head_dim)
224
+
225
+ if self.n_kv_heads != self.n_heads:
226
+ n_rep = self.n_heads // self.n_kv_heads
227
+ xk = xk.repeat_interleave(n_rep, dim=2)
228
+ xv = xv.repeat_interleave(n_rep, dim=2)
229
+
230
+ freqs_cis = freqs_cis.to(xq.device)
231
+ xq, xk = self.apply_rotary_emb(xq, xk, freqs_cis)
232
+
233
+ if inp_dtype in [torch.float16, torch.bfloat16]:
234
+ # begin var_len flash attn
235
+ (
236
+ query_states,
237
+ key_states,
238
+ value_states,
239
+ indices_q,
240
+ cu_seq_lens,
241
+ max_seq_lens,
242
+ ) = self._upad_input(xq, xk, xv, x_mask, seqlen)
243
+
244
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
245
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
246
+
247
+ attn_output_unpad = flash_attn_varlen_func(
248
+ query_states.to(inp_dtype),
249
+ key_states.to(inp_dtype),
250
+ value_states.to(inp_dtype),
251
+ cu_seqlens_q=cu_seqlens_q,
252
+ cu_seqlens_k=cu_seqlens_k,
253
+ max_seqlen_q=max_seqlen_in_batch_q,
254
+ max_seqlen_k=max_seqlen_in_batch_k,
255
+ dropout_p=0.0,
256
+ causal=False,
257
+ softmax_scale=None,
258
+ softcap=30,
259
+ )
260
+ output = pad_input(attn_output_unpad, indices_q, bsz, seqlen)
261
+ else:
262
+ output = (
263
+ F.scaled_dot_product_attention(
264
+ xq.permute(0, 2, 1, 3),
265
+ xk.permute(0, 2, 1, 3),
266
+ xv.permute(0, 2, 1, 3),
267
+ attn_mask=x_mask.bool().view(bsz, 1, 1, seqlen).expand(-1, self.n_heads, seqlen, -1),
268
+ scale=None,
269
+ )
270
+ .permute(0, 2, 1, 3)
271
+ .to(inp_dtype)
272
+ ) #ok
273
+
274
+
275
+ if hasattr(self, "wk_y"):
276
+ yk = self.ky_norm(self.wk_y(y)).view(bsz, -1, self.n_kv_heads, self.head_dim)
277
+ yv = self.wv_y(y).view(bsz, -1, self.n_kv_heads, self.head_dim)
278
+ n_rep = self.n_heads // self.n_kv_heads
279
+ # if n_rep >= 1:
280
+ # yk = yk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
281
+ # yv = yv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
282
+ if n_rep >= 1:
283
+ yk = einops.repeat(yk, "b l h d -> b l (repeat h) d", repeat=n_rep)
284
+ yv = einops.repeat(yv, "b l h d -> b l (repeat h) d", repeat=n_rep)
285
+ output_y = F.scaled_dot_product_attention(
286
+ xq.permute(0, 2, 1, 3),
287
+ yk.permute(0, 2, 1, 3),
288
+ yv.permute(0, 2, 1, 3),
289
+ y_mask.view(bsz, 1, 1, -1).expand(bsz, self.n_heads, seqlen, -1).to(torch.bool),
290
+ ).permute(0, 2, 1, 3)
291
+ output_y = output_y * self.gate.tanh().view(1, 1, -1, 1)
292
+ output = output + output_y
293
+
294
+ output = output.flatten(-2)
295
+ output = self.wo(output)
296
+
297
+ return output.to(inp_dtype)
298
+
299
+ class TransformerBlock(nn.Module):
+ """
+ Corresponds to the Transformer block in the JAX code.
+ """
+ def __init__(
+ self,
+ dim,
+ n_heads,
+ n_kv_heads,
+ multiple_of,
+ ffn_dim_multiplier,
+ norm_eps,
+ qk_norm,
+ y_dim,
+ max_position_embeddings,
+ ):
+ super().__init__()
+ self.attention = Attention(dim, n_heads, n_kv_heads, qk_norm, y_dim=y_dim, max_position_embeddings=max_position_embeddings)
+ self.feed_forward = LLamaFeedForward(
+ dim=dim,
+ hidden_dim=4 * dim,
+ multiple_of=multiple_of,
+ ffn_dim_multiplier=ffn_dim_multiplier,
+ )
+ self.attention_norm1 = RMSNorm(dim, eps=norm_eps)
+ self.attention_norm2 = RMSNorm(dim, eps=norm_eps)
+ self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
+ self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
+ self.adaLN_modulation = nn.Sequential(
+ nn.SiLU(),
+ nn.Linear(min(dim, 1024), 4 * dim),
+ )
+ self.attention_y_norm = RMSNorm(y_dim, eps=norm_eps)
+
+ def forward(
+ self,
+ x,
+ x_mask,
+ freqs_cis,
+ y,
+ y_mask,
+ adaln_input=None,
+ ):
+ if adaln_input is not None:
+ scales_gates = self.adaLN_modulation(adaln_input)
+ # TODO: Duong - check the dimension of chunking
+ scale_msa, gate_msa, scale_mlp, gate_mlp = scales_gates.chunk(4, dim=-1)
+ x = x + torch.tanh(gate_msa) * self.attention_norm2(
+ self.attention(
+ modulate(self.attention_norm1(x), scale_msa),
+ x_mask,
+ freqs_cis,
+ self.attention_y_norm(y),
+ y_mask,
+ )
+ )
+ x = x + torch.tanh(gate_mlp) * self.ffn_norm2(
+ self.feed_forward(
+ modulate(self.ffn_norm1(x), scale_mlp),
+ )
+ )
+ else:
+ x = x + self.attention_norm2(
+ self.attention(
+ self.attention_norm1(x),
+ x_mask,
+ freqs_cis,
+ self.attention_y_norm(y),
+ y_mask,
+ )
+ )
+ x = x + self.ffn_norm2(self.feed_forward(self.ffn_norm1(x)))
+ return x
+
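For reference, the adaLN path in TransformerBlock.forward splits one conditioning projection into four per-branch vectors and applies tanh-gated residual updates. A toy shape check of the chunking step (this sketch uses dim directly instead of min(dim, 1024) for brevity; all sizes are illustrative):

    import torch
    import torch.nn as nn

    dim = 8
    adaln_input = torch.randn(2, dim)  # per-sample conditioning (timestep + pooled caption)
    adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(dim, 4 * dim))
    scale_msa, gate_msa, scale_mlp, gate_mlp = adaLN_modulation(adaln_input).chunk(4, dim=-1)
    print(scale_msa.shape)             # torch.Size([2, 8]): one scale/gate vector per branch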
+
+ class NextDiT(ModelMixin, ConfigMixin):
+ """
+ Diffusion model with a Transformer backbone for joint image-video training.
+ """
+ @register_to_config
+ def __init__(
+ self,
+ input_size=(1, 32, 32),
+ patch_size=(1, 2, 2),
+ in_channels=16,
+ hidden_size=4096,
+ depth=32,
+ num_heads=32,
+ num_kv_heads=None,
+ multiple_of=256,
+ ffn_dim_multiplier=None,
+ norm_eps=1e-5,
+ pred_sigma=False,
+ caption_channels=4096,
+ qk_norm=False,
+ norm_type="rms",
+ model_max_length=120,
+ rotary_max_length=384,
+ rotary_max_length_t=None
+ ):
+ super().__init__()
+ self.input_size = input_size
+ self.patch_size = patch_size
+ self.in_channels = in_channels
+ self.hidden_size = hidden_size
+ self.depth = depth
+ self.num_heads = num_heads
+ self.num_kv_heads = num_kv_heads or num_heads
+ self.multiple_of = multiple_of
+ self.ffn_dim_multiplier = ffn_dim_multiplier
+ self.norm_eps = norm_eps
+ self.pred_sigma = pred_sigma
+ self.caption_channels = caption_channels
+ self.qk_norm = qk_norm
+ self.norm_type = norm_type
+ self.model_max_length = model_max_length
+ self.rotary_max_length = rotary_max_length
+ self.rotary_max_length_t = rotary_max_length_t
+ self.out_channels = in_channels * 2 if pred_sigma else in_channels
+
+ self.x_embedder = nn.Linear(np.prod(self.patch_size) * in_channels, hidden_size)
+
+ self.t_embedder = TimestepEmbedder(min(hidden_size, 1024))
+ self.y_embedder = nn.Sequential(
+ nn.LayerNorm(caption_channels, eps=1e-6),
+ nn.Linear(caption_channels, min(hidden_size, 1024)),
+ )
+
+ self.layers = nn.ModuleList([
+ TransformerBlock(
+ dim=hidden_size,
+ n_heads=num_heads,
+ n_kv_heads=self.num_kv_heads,
+ multiple_of=multiple_of,
+ ffn_dim_multiplier=ffn_dim_multiplier,
+ norm_eps=norm_eps,
+ qk_norm=qk_norm,
+ y_dim=caption_channels,
+ max_position_embeddings=rotary_max_length,
+ )
+ for _ in range(depth)
+ ])
+
+ self.final_layer = FinalLayer(
+ hidden_size=hidden_size,
+ num_patches=np.prod(patch_size),
+ out_channels=self.out_channels,
+ )
+
+ assert (hidden_size // num_heads) % 6 == 0, "3d rope needs head dim to be divisible by 6"
+
+ self.freqs_cis = self.precompute_freqs_cis(
+ hidden_size // num_heads,
+ self.rotary_max_length,
+ end_t=self.rotary_max_length_t
+ )
+
+ def to(self, *args, **kwargs):
+ self = super().to(*args, **kwargs)
+ # self.freqs_cis = self.freqs_cis.to(*args, **kwargs)
+ return self
+
+ @staticmethod
+ def precompute_freqs_cis(
+ dim: int,
+ end: int,
+ end_t: int = None,
+ theta: float = 10000.0,
+ scale_factor: float = 1.0,
+ scale_watershed: float = 1.0,
+ timestep: float = 1.0,
+ ):
+ if timestep < scale_watershed:
+ linear_factor = scale_factor
+ ntk_factor = 1.0
+ else:
+ linear_factor = 1.0
+ ntk_factor = scale_factor
+
+ theta = theta * ntk_factor
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 6)[: (dim // 6)] / dim)) / linear_factor
+
+ timestep = torch.arange(end, dtype=torch.float32)
+ freqs = torch.outer(timestep, freqs).float()
+ freqs_cis = torch.exp(1j * freqs)
+
+ if end_t is not None:
+ freqs_t = 1.0 / (theta ** (torch.arange(0, dim, 6)[: (dim // 6)] / dim)) / linear_factor
+ timestep_t = torch.arange(end_t, dtype=torch.float32)
+ freqs_t = torch.outer(timestep_t, freqs_t).float()
+ freqs_cis_t = torch.exp(1j * freqs_t)
+ freqs_cis_t = freqs_cis_t.view(end_t, 1, 1, dim // 6).repeat(1, end, end, 1)
+ else:
+ end_t = end
+ freqs_cis_t = freqs_cis.view(end_t, 1, 1, dim // 6).repeat(1, end, end, 1)
+
+ freqs_cis_h = freqs_cis.view(1, end, 1, dim // 6).repeat(end_t, 1, end, 1)
+ freqs_cis_w = freqs_cis.view(1, 1, end, dim // 6).repeat(end_t, end, 1, 1)
+ freqs_cis = torch.cat([freqs_cis_t, freqs_cis_h, freqs_cis_w], dim=-1).view(end_t, end, end, -1)
+ return freqs_cis
+
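Because precompute_freqs_cis is a @staticmethod, the layout of the 3D rotary table can be probed in isolation, assuming the NextDiT class above is defined or importable. Each of the t/h/w axes contributes dim // 6 complex frequencies, so the last dimension ends up at dim // 2. A quick shape check with small toy sizes (the numbers below are illustrative only):

    # head_dim = 12 (divisible by 6), spatial extent 4, temporal extent 2
    freqs = NextDiT.precompute_freqs_cis(dim=12, end=4, end_t=2)
    print(freqs.shape, freqs.dtype)  # torch.Size([2, 4, 4, 6]) torch.complex64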
+ def forward(
+ self,
+ samples,
+ timesteps,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ scale_factor: float = 1.0, # scale_factor for rotary embedding
+ scale_watershed: float = 1.0, # scale_watershed for rotary embedding
+ ):
+ if samples.ndim == 4: # B C H W
+ samples = samples[:, None, ...] # B F C H W
+
+ precomputed_freqs_cis = None
+ if scale_factor != 1 or scale_watershed != 1:
+ precomputed_freqs_cis = self.precompute_freqs_cis(
+ self.hidden_size // self.num_heads,
+ self.rotary_max_length,
+ end_t=self.rotary_max_length_t,
+ scale_factor=scale_factor,
+ scale_watershed=scale_watershed,
+ timestep=torch.max(timesteps.cpu()).item()
+ )
+
+ if len(timesteps.shape) == 5:
+ t, *_ = self.patchify(timesteps, precomputed_freqs_cis)
+ timesteps = t.mean(dim=-1)
+ elif len(timesteps.shape) == 1:
+ timesteps = timesteps[:, None, None, None, None].expand_as(samples)
+ t, *_ = self.patchify(timesteps, precomputed_freqs_cis)
+ timesteps = t.mean(dim=-1)
+ samples, T, H, W, freqs_cis = self.patchify(samples, precomputed_freqs_cis)
+ samples = self.x_embedder(samples)
+ t = self.t_embedder(timesteps)
+
+ encoder_attention_mask_float = encoder_attention_mask[..., None].float()
+ encoder_hidden_states_pool = (encoder_hidden_states * encoder_attention_mask_float).sum(dim=1) / (encoder_attention_mask_float.sum(dim=1) + 1e-8)
+ encoder_hidden_states_pool = encoder_hidden_states_pool.to(samples.dtype)
+ y = self.y_embedder(encoder_hidden_states_pool)
+ y = y.unsqueeze(1).expand(-1, samples.size(1), -1)
+
+ adaln_input = t + y
+
+ for block in self.layers:
+ samples = block(samples, None, freqs_cis, encoder_hidden_states, encoder_attention_mask, adaln_input)
+
+ samples = self.final_layer(samples, adaln_input)
+ samples = self.unpatchify(samples, T, H, W)
+
+ return samples
+
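The caption pooling in forward above is a masked mean over the valid caption tokens. A small standalone illustration of that step with toy shapes (purely illustrative, not tied to the model config):

    import torch

    encoder_hidden_states = torch.randn(2, 5, 4)               # (batch, tokens, caption_channels)
    encoder_attention_mask = torch.tensor([[1, 1, 1, 1, 1],
                                           [1, 1, 1, 0, 0]])   # second sample has 2 padded tokens
    mask = encoder_attention_mask[..., None].float()
    pooled = (encoder_hidden_states * mask).sum(dim=1) / (mask.sum(dim=1) + 1e-8)
    print(pooled.shape)                                        # torch.Size([2, 4]): one pooled caption vector per sample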
+ def patchify(self, x, precompute_freqs_cis=None):
+ # PyTorch tensors are channel-first: each frame is (C, H, W)
+ B, T, C, H, W = x.size()
+ pT, pH, pW = self.patch_size
+ x = x.view(B, T // pT, pT, C, H // pH, pH, W // pW, pW)
+ x = x.permute(0, 1, 4, 6, 2, 5, 7, 3)
+ x = x.reshape(B, -1, pT * pH * pW * C)
+ if precompute_freqs_cis is None:
+ freqs_cis = self.freqs_cis[: T // pT, :H // pH, :W // pW].reshape(-1, * self.freqs_cis.shape[3:])[None].to(x.device)
+ else:
+ freqs_cis = precompute_freqs_cis[: T // pT, :H // pH, :W // pW].reshape(-1, * precompute_freqs_cis.shape[3:])[None].to(x.device)
+ return x, T // pT, H // pH, W // pW, freqs_cis
+
+ def unpatchify(self, x, T, H, W):
+ B = x.size(0)
+ C = self.out_channels
+ pT, pH, pW = self.patch_size
+ x = x.view(B, T, H, W, pT, pH, pW, C)
+ x = x.permute(0, 1, 4, 7, 2, 5, 3, 6)
+ x = x.reshape(B, T * pT, C, H * pH, W * pW)
+ return x
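To make the patch bookkeeping concrete: patchify turns a (B, T, C, H, W) latent into a token sequence of length (T/pT)·(H/pH)·(W/pW) with pT·pH·pW·C features per token, and unpatchify inverts it. A standalone sketch of the same reshapes with the default (1, 2, 2) patch size (toy tensors only; this does not call the class):

    import torch

    B, T, C, H, W = 2, 1, 16, 32, 32
    pT, pH, pW = 1, 2, 2
    x = torch.randn(B, T, C, H, W)
    tokens = (x.view(B, T // pT, pT, C, H // pH, pH, W // pW, pW)
               .permute(0, 1, 4, 6, 2, 5, 7, 3)
               .reshape(B, -1, pT * pH * pW * C))
    print(tokens.shape)  # torch.Size([2, 256, 64]): 16 x 16 patches, each flattened to 2*2*16 values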
requirements.txt CHANGED
@@ -1,6 +1,27 @@
- transformers
- diffusers
- peft
- opencv-python
- protobuf
- sentencepiece
+ pytest
+ matplotlib
+ scikit-learn
+ scipy
+ spacy
+ numpy
+ einops
+ einsum
+ fvcore
+ h5py
+ twine
+ transformers==4.45.2
+ huggingface_hub==0.24
+ accelerate==0.34.2
+ diffusers==0.30.3
+ pillow==10.2.0
+ torch==2.3.1
+ torchvision==0.18.1
+ torchaudio==2.3.1
+ flash-attn==2.6.3
+ git+https://github.com/Dao-AILab/flash-attention@v2.6.3#subdirectory=csrc/fused_dense_lib
+ jaxtyping
+ mediapipe
+ gradio
+ git+https://github.com/facebookresearch/pytorch3d.git
+ opencv-python==4.5.5.64
+ opencv-python-headless==4.5.5.64