charbel-malo committed on
Commit 58c9024 · verified · 1 Parent(s): 25f2611

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .editorconfig +12 -0
  2. .gitattributes +3 -0
  3. .gitignore +217 -0
  4. Installation.md +170 -0
  5. LICENSE +21 -0
  6. README.md +132 -7
  7. README_jp.md +126 -0
  8. README_zh.md +62 -0
  9. app/__init__.py +0 -0
  10. app/all_models.py +22 -0
  11. app/custom_models/image2mvimage.yaml +63 -0
  12. app/custom_models/image2normal.yaml +61 -0
  13. app/custom_models/mvimg_prediction.py +57 -0
  14. app/custom_models/normal_prediction.py +26 -0
  15. app/custom_models/utils.py +75 -0
  16. app/examples/Groot.png +0 -0
  17. app/examples/aaa.png +0 -0
  18. app/examples/abma.png +0 -0
  19. app/examples/akun.png +0 -0
  20. app/examples/anya.png +0 -0
  21. app/examples/bag.png +3 -0
  22. app/examples/ex1.png +3 -0
  23. app/examples/ex2.png +0 -0
  24. app/examples/ex3.jpg +0 -0
  25. app/examples/ex4.png +0 -0
  26. app/examples/generated_1715761545_frame0.png +0 -0
  27. app/examples/generated_1715762357_frame0.png +0 -0
  28. app/examples/generated_1715763329_frame0.png +0 -0
  29. app/examples/hatsune_miku.png +0 -0
  30. app/examples/princess-large.png +0 -0
  31. app/gradio_3dgen.py +71 -0
  32. app/gradio_3dgen_steps.py +87 -0
  33. app/gradio_local.py +76 -0
  34. app/utils.py +112 -0
  35. assets/teaser.jpg +0 -0
  36. assets/teaser_safe.jpg +3 -0
  37. custum_3d_diffusion/custum_modules/attention_processors.py +385 -0
  38. custum_3d_diffusion/custum_modules/unifield_processor.py +460 -0
  39. custum_3d_diffusion/custum_pipeline/unifield_pipeline_img2img.py +298 -0
  40. custum_3d_diffusion/custum_pipeline/unifield_pipeline_img2mvimg.py +296 -0
  41. custum_3d_diffusion/modules.py +14 -0
  42. custum_3d_diffusion/trainings/__init__.py +0 -0
  43. custum_3d_diffusion/trainings/base.py +208 -0
  44. custum_3d_diffusion/trainings/config_classes.py +35 -0
  45. custum_3d_diffusion/trainings/image2image_trainer.py +86 -0
  46. custum_3d_diffusion/trainings/image2mvimage_trainer.py +139 -0
  47. custum_3d_diffusion/trainings/utils.py +25 -0
  48. docker/Dockerfile +54 -0
  49. docker/README.md +35 -0
  50. gradio_app.py +41 -0
.editorconfig ADDED
@@ -0,0 +1,12 @@
+ root = true
+
+ [*.py]
+ charset = utf-8
+ trim_trailing_whitespace = true
+ end_of_line = lf
+ insert_final_newline = true
+ indent_style = space
+ indent_size = 4
+
+ [*.md]
+ trim_trailing_whitespace = false
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ app/examples/bag.png filter=lfs diff=lfs merge=lfs -text
+ app/examples/ex1.png filter=lfs diff=lfs merge=lfs -text
+ assets/teaser_safe.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,217 @@
+ # Created by https://www.toptal.com/developers/gitignore/api/python
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python
+
+ ### Python ###
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ .idea/
+
+ ### Python Patch ###
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+ poetry.toml
+
+ # ruff
+ .ruff_cache/
+
+ # LSP config files
+ pyrightconfig.json
+
+ # End of https://www.toptal.com/developers/gitignore/api/python
+
+ .vscode/
+ .threestudio_cache/
+ outputs
+ outputs/
+ outputs-gradio
+ outputs-gradio/
+ lightning_logs/
+
+ # pretrained model weights
+ *.ckpt
+ *.pt
+ *.pth
+ *.bin
+ *.param
+
+ # wandb
+ wandb/
+
+ # obj results
+ *.obj
+ *.glb
+ *.ply
+
+ # ckpts
+ ckpt/*
+ *.pth
+ *.pt
+
+ # tensorrt
+ *.engine
+ *.profile
+
+ # zipfiles
+ *.zip
+ *.tar
+ *.tar.gz
+
+ # others
+ run_30.sh
+ ckpt
Installation.md ADDED
@@ -0,0 +1,170 @@
+ # 官方安装指南
+
+ * 在 requirements-detail.txt 里,我们提供了详细的各个库的版本,这个对应的环境是 `python3.10 + cuda12.2`。
+ * 本项目依赖于几个重要的 pypi 包,这几个包安装起来会有一些困难。
+
+ ### nvdiffrast 安装
+
+ * nvdiffrast 会在第一次运行时,编译对应的 torch 插件,这一步需要 ninja 及 cudatoolkit 的支持。
+ * 因此需要先确保正确安装了 ninja 以及 cudatoolkit,并正确配置了 CUDA_HOME 环境变量。
+ * cudatoolkit 安装可以参考 [linux-cuda-installation-guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html), [windows-cuda-installation-guide](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
+ * ninja 则可以直接 `pip install ninja`
+ * 然后设置 CUDA_HOME 变量为 cudatoolkit 的安装目录,如 `/usr/local/cuda`。
+ * 最后 `pip install nvdiffrast` 即可。
+ * 如果无法在目标服务器上安装 cudatoolkit(如权限不够),可以使用我修改的[预编译版本 nvdiffrast](https://github.com/wukailu/nvdiffrast-torch),在另一台拥有 cudatoolkit 且环境相似(python, torch, cuda 版本相同)的服务器上预编译后安装。
+
+ ### onnxruntime-gpu 安装
+
+ * 注意,同时安装 `onnxruntime` 与 `onnxruntime-gpu` 可能导致最终程序无法运行在 GPU 上而运行在 CPU 上,导致极慢的推理速度。
+ * [onnxruntime 官方安装指南](https://onnxruntime.ai/docs/install/#python-installs)
+ * TLDR: For cuda11.x, `pip install onnxruntime-gpu`. For cuda12.x, `pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/`.
+ * 进一步地,可以安装基于 tensorrt 的 onnxruntime,进一步加快推理速度。
+ * 注意:如果没有安装基于 tensorrt 的 onnxruntime,建议将 `https://github.com/AiuniAI/Unique3D/blob/4e1174c3896fee992ffc780d0ea813500401fae9/scripts/load_onnx.py#L4` 中的 `TensorrtExecutionProvider` 删除。
+ * 对于 cuda12.x,可以使用如下命令快速安装带有 tensorrt 的 onnxruntime(注意将 `/root/miniconda3/lib/python3.10/site-packages` 修改为你的 python 对应路径,将 `/root/.bashrc` 改为你的用户目录下的 `.bashrc` 路径)
+ ```bash
+ pip install ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/
+ pip install onnxruntime-gpu==1.17.0 --index-url=https://pkgs.dev.azure.com/onnxruntime/onnxruntime/_packaging/onnxruntime-cuda-12/pypi/simple/
+ pip install tensorrt==8.6.0
+ echo -e "export LD_LIBRARY_PATH=/usr/local/cuda/targets/x86_64-linux/lib/:/root/miniconda3/lib/python3.10/site-packages/tensorrt:${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> /root/.bashrc
+ ```
+
+ ### pytorch3d 安装
+
+ * 根据 [pytorch3d 官方的安装建议](https://github.com/facebookresearch/pytorch3d/blob/main/INSTALL.md#2-install-wheels-for-linux),建议使用预编译版本
+ ```python
+ import sys
+ import torch
+ pyt_version_str=torch.__version__.split("+")[0].replace(".", "")
+ version_str="".join([
+     f"py3{sys.version_info.minor}_cu",
+     torch.version.cuda.replace(".",""),
+     f"_pyt{pyt_version_str}"
+ ])
+ !pip install fvcore iopath
+ !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html
+ ```
+
+ ### torch_scatter 安装
+
+ * 按照 [torch_scatter 官方安装指南](https://github.com/rusty1s/pytorch_scatter?tab=readme-ov-file#installation),使用预编译的安装包快速安装。
+ * 或者直接编译安装:`pip install git+https://github.com/rusty1s/pytorch_scatter.git`
+
+ ### 其他安装
+
+ * 其他依赖 `pip install -r requirements.txt` 即可。
+
+ -----
+
+ # Detailed Installation Guide
+
+ * In `requirements-detail.txt`, we pin the exact version of every package; these versions correspond to a `python3.10 + cuda12.2` environment.
+ * This project relies on several PyPI packages that can be difficult to install.
+
+ ### Installation of nvdiffrast
+
+ * nvdiffrast compiles its torch plugin the first time it runs; this step requires ninja and cudatoolkit.
+ * Therefore, make sure that ninja and cudatoolkit are correctly installed and that the CUDA_HOME environment variable is properly configured.
+ * For installing cudatoolkit, refer to the [Linux CUDA Installation Guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) and the [Windows CUDA Installation Guide](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html).
+ * ninja can be installed directly with `pip install ninja`.
+ * Then set the CUDA_HOME variable to the installation directory of cudatoolkit, such as `/usr/local/cuda`.
+ * Finally, run `pip install nvdiffrast` (a quick prerequisite check is sketched below).
+ * If you cannot install cudatoolkit on the target machine (e.g., insufficient permissions), you can use my modified [pre-compiled version of nvdiffrast](https://github.com/wukailu/nvdiffrast-torch): pre-compile it on another machine that has cudatoolkit and a similar environment (same python, torch, and cuda versions), then install the resulting `.whl`.
+
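As a quick sanity check before installing, the minimal sketch below (not part of the official guide) verifies the prerequisites mentioned above: a CUDA-enabled torch build, a configured `CUDA_HOME`, and `ninja` on the PATH.

```python
# Minimal prerequisite check for nvdiffrast's JIT build (illustrative only).
import os
import shutil

import torch

print("torch CUDA available:", torch.cuda.is_available())
print("CUDA_HOME:", os.environ.get("CUDA_HOME", "<not set>"))
print("ninja on PATH:", shutil.which("ninja") is not None)
```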
+ ### Installation of onnxruntime-gpu
+
+ * Note that installing both `onnxruntime` and `onnxruntime-gpu` may cause the program to run on the CPU instead of the GPU, leading to extremely slow inference.
+ * [Official ONNX Runtime Installation Guide](https://onnxruntime.ai/docs/install/#python-installs)
+ * TLDR: For cuda11.x, `pip install onnxruntime-gpu`. For cuda12.x, `pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/`.
+ * Furthermore, you can install a TensorRT-based onnxruntime to speed up inference even more.
+ * Note: If you have not correctly installed the TensorRT-based onnxruntime, it is recommended to remove `TensorrtExecutionProvider` from `https://github.com/AiuniAI/Unique3D/blob/4e1174c3896fee992ffc780d0ea813500401fae9/scripts/load_onnx.py#L4` (see the provider check sketched below).
+ * For cuda12.x, you can quickly install onnxruntime with TensorRT using the following commands (change `/root/miniconda3/lib/python3.10/site-packages` to the site-packages path of your python, and `/root/.bashrc` to the `.bashrc` in your user directory):
+ ```bash
+ pip install ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/
+ pip install onnxruntime-gpu==1.17.0 --index-url=https://pkgs.dev.azure.com/onnxruntime/onnxruntime/_packaging/onnxruntime-cuda-12/pypi/simple/
+ pip install tensorrt==8.6.0
+ echo -e "export LD_LIBRARY_PATH=/usr/local/cuda/targets/x86_64-linux/lib/:/root/miniconda3/lib/python3.10/site-packages/tensorrt:${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> /root/.bashrc
88
+
89
+ ### Installation of pytorch3d
90
+
91
+ * According to the [official installation recommendations of pytorch3d](https://github.com/facebookresearch/pytorch3d/blob/main/INSTALL.md#2-install-wheels-for-linux), it is recommended to use the pre-compiled version:
92
+ ```
93
+ import sys
94
+ import torch
95
+ pyt_version_str=torch.__version__.split("+")[0].replace(".", "")
96
+ version_str="".join([
97
+ f"py3{sys.version_info.minor}_cu",
98
+ torch.version.cuda.replace(".",""),
99
+ f"_pyt{pyt_version_str}"
100
+ ])
101
+ !pip install fvcore iopath
102
+ !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html
103
+ ```
104
+
105
+ ### Installation of torch_scatter
106
+
107
+ * Use the pre-compiled installation package according to the [official installation guide of torch_scatter](https://github.com/rusty1s/pytorch_scatter?tab=readme-ov-file#installation) for a quick installation.
108
+ * Alternatively, you can directly compile and install with `pip install git+https://github.com/rusty1s/pytorch_scatter.git`.
109
+
110
+ ### Other Installations
111
+
112
+ * For other packages, simply `pip install -r requirements.txt`.
113
+
114
+ -----
115
+
116
+ # 官方インストールガイド
117
+
118
+ * `requirements-detail.txt` には、各ライブラリのバージョンが詳細に提供されており、これは Python 3.10 + CUDA 12.2 に対応する環境です。
119
+ * このプロジェクトは、いくつかの重要な PyPI パッケージに依存しており、これらのパッケージのインストールにはいくつかの困難が伴います。
120
+
121
+ ### nvdiffrast のインストール
122
+
123
+ * nvdiffrast は、最初に実行するときに、torch プラグインの対応バージョンをコンパイルします。このステップには、ninja および cudatoolkit のサポートが必要です。
124
+ * したがって、ninja および cudatoolkit の正確なインストールと、CUDA_HOME 環境変数の正確な設定を確保する必要があります。
125
+ * cudatoolkit のインストールについては、[Linux CUDA インストールガイド](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)、[Windows CUDA インストールガイド](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) を参照してください。
126
+ * ninja は、直接 `pip install ninja` でインストールできます。
127
+ * 次に、CUDA_HOME 変数を cudatoolkit のインストールディレクトリに設定します。例えば、`/usr/local/cuda` のように。
128
+ * 最後に、`pip install nvdiffrast` を実行します。
129
+ * 目標サーバーで cudatoolkit をインストールできない場合(例えば、権限が不足している場合)、私の修正した[事前コンパイル済みバージョンの nvdiffrast](https://github.com/wukailu/nvdiffrast-torch)を使用できます。これは、cudatoolkit があり、環境が似ている(Python、torch、cudaのバージョンが同じ)別のサーバーで事前コンパイルしてからインストールすることができます。
130
+
131
+ ### onnxruntime-gpu のインストール
132
+
133
+ * 注意:`onnxruntime` と `onnxruntime-gpu` を同時にインストールすると、最終的なプログラムが GPU 上で実行されず、CPU 上で実行される可能性があり、推論速度���非常に遅くなることがあります。
134
+ * [onnxruntime 公式インストールガイド](https://onnxruntime.ai/docs/install/#python-installs)
135
+ * TLDR: cuda11.x 用には、`pip install onnxruntime-gpu` を使用します。cuda12.x 用には、`pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/` を使用します。
136
+ * さらに、TensorRT ベースの onnxruntime をインストールして、推論速度をさらに向上させることができます。
137
+ * 注意:TensorRT ベースの onnxruntime がインストールされていない場合は、`https://github.com/AiuniAI/Unique3D/blob/4e1174c3896fee992ffc780d0ea813500401fae9/scripts/load_onnx.py#L4` の `TensorrtExecutionProvider` を削除することをお勧めします。
138
+ * cuda12.x の場合、次のコマンドを使用して迅速に TensorRT を備えた onnxruntime をインストールできます(`/root/miniconda3/lib/python3.10/site-packages` をあなたの Python に対応するパスに、`/root/.bashrc` をあなたのユーザーのパスの下の `.bashrc` に変更してください)。
139
+ ```bash
140
+ pip install ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/
141
+ pip install onnxruntime-gpu==1.17.0 --index-url=https://pkgs.dev.azure.com/onnxruntime/onnxruntime/_packaging/onnxruntime-cuda-12/pypi/simple/
142
+ pip install tensorrt==8.6.0
143
+ echo -e "export LD_LIBRARY_PATH=/usr/local/cuda/targets/x86_64-linux/lib/:/root/miniconda3/lib/python3.10/site-packages/tensorrt:${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> /root/.bashrc
144
+ ```
145
+
146
+ ### pytorch3d のインストール
147
+
148
+ * [pytorch3d 公式のインストール提案](https://github.com/facebookresearch/pytorch3d/blob/main/INSTALL.md#2-install-wheels-for-linux)に従い、事前コンパイル済みバージョンを使用することをお勧めします。
149
+ ```python
150
+ import sys
151
+ import torch
152
+ pyt_version_str=torch.__version__.split("+")[0].replace(".", "")
153
+ version_str="".join([
154
+ f"py3{sys.version_info.minor}_cu",
155
+ torch.version.cuda.replace(".",""),
156
+ f"_pyt{pyt_version_str}"
157
+ ])
158
+ !pip install fvcore iopath
159
+ !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html
160
+ ```
161
+
162
+ ### torch_scatter のインストール
163
+
164
+ * [torch_scatter 公式インストールガイド](https://github.com/rusty1s/pytorch_scatter?tab=readme-ov-file#installation)に従い、事前コンパイル済みのインストールパッケージを使用して迅速インストールします。
165
+ * または、直接コンパイルしてインストールする `pip install git+https://github.com/rusty1s/pytorch_scatter.git` も可能です。
166
+
167
+ ### その他のインストール
168
+
169
+ * その他のファイルについては、`pip install -r requirements.txt` を実行するだけです。
170
+
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 AiuniAI
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,137 @@
  ---
- title: 3D Genesis
- emoji: 🏆
- colorFrom: indigo
- colorTo: yellow
+ title: 3D-Genesis
+ app_file: gradio_app.py
  sdk: gradio
  sdk_version: 5.5.0
- app_file: app.py
- pinned: false
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ **[中文版本](README_zh.md)**
+
+ **[日本語版](README_jp.md)**
+
+ # Unique3D
+ Official implementation of Unique3D: High-Quality and Efficient 3D Mesh Generation from a Single Image.
+
+ [Kailu Wu](https://scholar.google.com/citations?user=VTU0gysAAAAJ&hl=zh-CN&oi=ao), [Fangfu Liu](https://liuff19.github.io/), Zhihan Cai, Runjie Yan, Hanyang Wang, Yating Hu, [Yueqi Duan](https://duanyueqi.github.io/), [Kaisheng Ma](https://group.iiis.tsinghua.edu.cn/~maks/)
+
+ ## [Paper](https://arxiv.org/abs/2405.20343) | [Project page](https://wukailu.github.io/Unique3D/) | [Huggingface Demo](https://huggingface.co/spaces/Wuvin/Unique3D) | [Gradio Demo](http://unique3d.demo.avar.cn/) | [Online Demo](https://www.aiuni.ai/)
+
+ * Demo inference speed: Gradio Demo > Huggingface Demo > Huggingface Demo2 > Online Demo
+
+ **If the Gradio Demo is overcrowded or fails to produce stable results, you can use the Online Demo [aiuni.ai](https://www.aiuni.ai/), which is free to try (to get a registration invitation code, join Discord: https://discord.gg/aiuni). Note that the Online Demo differs slightly from the Gradio Demo: inference is slower, but generation is much more stable.**
+
+ <p align="center">
+ <img src="assets/teaser_safe.jpg">
+ </p>
+
+ High-fidelity and diverse textured meshes generated by Unique3D from single-view wild images in 30 seconds.
+
+ ## More features
+
+ The repo is still under construction; thanks for your patience.
+ - [x] Upload weights.
+ - [x] Local gradio demo.
+ - [x] Detailed tutorial.
+ - [x] Huggingface demo.
+ - [ ] Detailed local demo.
+ - [x] Comfyui support.
+ - [x] Windows support.
+ - [x] Docker support.
+ - [ ] More stable reconstruction with normal.
+ - [ ] Training code release.
+
+ ## Preparation for inference
+
+ * [Detailed linux installation guide](Installation.md).
+
+ ### Linux System Setup.
+
+ Adapted for Ubuntu 22.04.4 LTS and CUDA 12.1.
+ ```bash
+ conda create -n unique3d python=3.11
+ conda activate unique3d
+
+ pip install ninja
+ pip install diffusers==0.27.2
+
+ pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu121/torch2.3.1/index.html
+
+ pip install -r requirements.txt
+ ```
+
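As a quick check that the resulting environment really targets CUDA 12.1 (an illustrative sketch, not part of the setup script):

```python
import torch

print(torch.__version__)         # e.g. a "+cu121" build
print(torch.version.cuda)        # expect "12.1"
print(torch.cuda.is_available())
```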
+ [oak-barry](https://github.com/oak-barry) provides another setup script for torch210+cu121 [here](https://github.com/oak-barry/Unique3D).
+
+ ### Windows Setup.
+
+ * Many thanks to `jtydhr88` for the Windows installation method! See [issues/15](https://github.com/AiuniAI/Unique3D/issues/15).
+
+ Following [issues/15](https://github.com/AiuniAI/Unique3D/issues/15), a bat script was implemented to run the commands, so you can:
+ 1. You might still need Visual Studio Build Tools; you can find them at [Visual Studio Build Tools](https://visualstudio.microsoft.com/downloads/?q=build+tools).
+ 2. Create a conda env and activate it:
+    1. `conda create -n unique3d-py311 python=3.11`
+    2. `conda activate unique3d-py311`
+ 3. Download the [triton whl](https://huggingface.co/madbuda/triton-windows-builds/resolve/main/triton-2.1.0-cp311-cp311-win_amd64.whl) for py311, and put it into this project.
+ 4. Run **install_windows_win_py311_cu121.bat**.
+ 5. Answer `y` when asked to uninstall onnxruntime and onnxruntime-gpu.
+ 6. Create the output folder **tmp\gradio** under the drive root, e.g. F:\tmp\gradio.
+ 7. Run `python app/gradio_local.py --port 7860`.
+
+ For more details, refer to [issues/15](https://github.com/AiuniAI/Unique3D/issues/15).
+
+ ### Interactive inference: run your local gradio demo.
+
+ 1. Download the weights from [huggingface spaces](https://huggingface.co/spaces/Wuvin/Unique3D/tree/main/ckpt) or the [Tsinghua Cloud Drive](https://cloud.tsinghua.edu.cn/d/319762ec478d46c8bdf7/), and extract them to `ckpt/*` (a layout-check sketch follows this list).
+ ```
+ Unique3D
+ ├──ckpt
+     ├── controlnet-tile/
+     ├── image2normal/
+     ├── img2mvimg/
+     ├── realesrgan-x4.onnx
+     └── v1-inference.yaml
+ ```
+
+ 2. Run the interactive inference locally.
+ ```bash
+ python app/gradio_local.py --port 7860
97
+
98
+ ## ComfyUI Support
99
+
100
+ Thanks for the [ComfyUI-Unique3D](https://github.com/jtydhr88/ComfyUI-Unique3D) implementation from [jtydhr88](https://github.com/jtydhr88)!
101
+
102
+ ## Tips to get better results
103
+
104
+ **Important: Because the mesh is normalized by the longest edge of xyz during training, it is desirable that the input image needs to contain the longest edge of the object during inference, or else you may get erroneously squashed results.**
105
+ 1. Unique3D is sensitive to the facing direction of input images. Due to the distribution of the training data, orthographic front-facing images with a rest pose always lead to good reconstructions.
106
+ 2. Images with occlusions will cause worse reconstructions, since four views cannot cover the complete object. Images with fewer occlusions lead to better results.
107
+ 3. Pass an image with as high a resolution as possible to the input when resolution is a factor.
108
+
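To illustrate the normalization mentioned in the bold note above (a sketch of the general idea, not the repo's exact code): the mesh is rescaled so that its longest bounding-box edge has unit length, which is why that edge should be visible in the input view.

```python
import torch

def normalize_by_longest_edge(verts: torch.Tensor) -> torch.Tensor:
    """Scale an (N, 3) vertex tensor so the longest xyz bounding-box edge is 1."""
    extent = verts.max(dim=0).values - verts.min(dim=0).values
    return verts / extent.max()

# Example: a 2 x 1 x 0.5 bounding box becomes a 1 x 0.5 x 0.25 box.
box = torch.tensor([[0., 0., 0.], [2., 1., 0.5]])
print(normalize_by_longest_edge(box))
```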
+ ## Acknowledgement
+
+ We have intensively borrowed code from the following repositories. Many thanks to the authors for sharing their code.
+ - [Stable Diffusion](https://github.com/CompVis/stable-diffusion)
+ - [Wonder3d](https://github.com/xxlong0/Wonder3D)
+ - [Zero123Plus](https://github.com/SUDO-AI-3D/zero123plus)
+ - [Continuous Remeshing](https://github.com/Profactor/continuous-remeshing)
+ - [Depth from Normals](https://github.com/YertleTurtleGit/depth-from-normals)
+
+ ## Collaborations
+ Our mission is to create a 4D generative model with 3D concepts. This is just our first step, and the road ahead is still long, but we are confident. We warmly invite you to join the discussion and explore potential collaborations in any capacity. <span style="color:red">**If you're interested in connecting or partnering with us, please don't hesitate to reach out via email (wkl22@mails.tsinghua.edu.cn)**</span>.
+
+ - Follow us on twitter for the latest updates: https://x.com/aiuni_ai
+ - Join the AIGC 3D/4D generation community on discord: https://discord.gg/aiuni
+ - For research collaboration, please contact: ai@aiuni.ai
+
+ ## Citation
+
+ If you found Unique3D helpful, please cite our report:
+ ```bibtex
+ @misc{wu2024unique3d,
+     title={Unique3D: High-Quality and Efficient 3D Mesh Generation from a Single Image},
+     author={Kailu Wu and Fangfu Liu and Zhihan Cai and Runjie Yan and Hanyang Wang and Yating Hu and Yueqi Duan and Kaisheng Ma},
+     year={2024},
+     eprint={2405.20343},
+     archivePrefix={arXiv},
+     primaryClass={cs.CV}
+ }
+ ```
README_jp.md ADDED
@@ -0,0 +1,126 @@
+ **他の言語のバージョン [英語](README.md) [中国語](README_zh.md)**
+
+ # Unique3D
+ Unique3D: 単一画像からの高品質かつ効率的な3Dメッシュ生成の公式実装。
+
+ [Kailu Wu](https://scholar.google.com/citations?user=VTU0gysAAAAJ&hl=zh-CN&oi=ao), [Fangfu Liu](https://liuff19.github.io/), Zhihan Cai, Runjie Yan, Hanyang Wang, Yating Hu, [Yueqi Duan](https://duanyueqi.github.io/), [Kaisheng Ma](https://group.iiis.tsinghua.edu.cn/~maks/)
+
+ ## [論文](https://arxiv.org/abs/2405.20343) | [プロジェクトページ](https://wukailu.github.io/Unique3D/) | [Huggingfaceデモ](https://huggingface.co/spaces/Wuvin/Unique3D) | [Gradioデモ](http://unique3d.demo.avar.cn/) | [オンラインデモ](https://www.aiuni.ai/)
+
+ * デモ推論速度: Gradioデモ > Huggingfaceデモ > Huggingfaceデモ2 > オンラインデモ
+
+ **Gradioデモが残念ながらハングアップしたり、非常に混雑している場合は、[aiuni.ai](https://www.aiuni.ai/)のオンラインデモを使用できます。これは無料で試すことができます(登録招待コードを取得するには、Discordに参加してください: https://discord.gg/aiuni)。ただし、オンラインデモはGradioデモとは少し異なり、推論速度が遅く、生成結果が安定していない可能性がありますが、素材の品質は良いです。**
+
+ <p align="center">
+ <img src="assets/teaser_safe.jpg">
+ </p>
+
+ Unique3Dは、野生の単一画像から高忠実度および多様なテクスチャメッシュを30秒で生成します。
+
+ ## より多くの機能
+
+ リポジトリはまだ構築中です。ご理解いただきありがとうございます。
+ - [x] 重みのアップロード。
+ - [x] ローカルGradioデモ。
+ - [ ] 詳細なチュートリアル。
+ - [x] Huggingfaceデモ。
+ - [ ] 詳細なローカルデモ。
+ - [x] Comfyuiサポート。
+ - [x] Windowsサポート。
+ - [ ] Dockerサポート。
+ - [ ] ノーマルでより安定した再構築。
+ - [ ] トレーニングコードのリリース。
+
+ ## 推論の準備
+
+ ### Linuxシステムセットアップ
+
+ Ubuntu 22.04.4 LTSおよびCUDA 12.1に適応。
+ ```bash
+ conda create -n unique3d python=3.11
+ conda activate unique3d
+
+ pip install ninja
+ pip install diffusers==0.27.2
+
+ pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu121/torch2.3.1/index.html
+
+ pip install -r requirements.txt
+ ```
+
+ [oak-barry](https://github.com/oak-barry)は、[こちら](https://github.com/oak-barry/Unique3D)でtorch210+cu121の別のセットアップスクリプトを提供しています。
+
+ ### Windowsセットアップ
+
+ * `jtydhr88`によるWindowsインストール方法に非常に感謝します![issues/15](https://github.com/AiuniAI/Unique3D/issues/15)を参照してください。
+
+ [issues/15](https://github.com/AiuniAI/Unique3D/issues/15)によると、コマンドを実行するバッチスクリプトを実装したので、以下の手順に従ってください。
+ 1. [Visual Studio Build Tools](https://visualstudio.microsoft.com/downloads/?q=build+tools)からVisual Studio Build Toolsが必要になる場合があります。
+ 2. conda envを作成し、アクティブにします。
+    1. `conda create -n unique3d-py311 python=3.11`
+    2. `conda activate unique3d-py311`
+ 3. [triton whl](https://huggingface.co/madbuda/triton-windows-builds/resolve/main/triton-2.1.0-cp311-cp311-win_amd64.whl)をダウンロードし、このプロジェクトに配置します。
+ 4. **install_windows_win_py311_cu121.bat**を実行します。
+ 5. onnxruntimeおよびonnxruntime-gpuのアンインストールを求められた場合は、yと回答します。
+ 6. ドライブルートの下に**tmp\gradio**フォルダを作成します(例:F:\tmp\gradio)。
+ 7. python app/gradio_local.py --port 7860
+
+ 詳細は[issues/15](https://github.com/AiuniAI/Unique3D/issues/15)を参照してください。
+
+ ### インタラクティブ推論:ローカルGradioデモを実行する
+
+ 1. [huggingface spaces](https://huggingface.co/spaces/Wuvin/Unique3D/tree/main/ckpt)または[Tsinghua Cloud Drive](https://cloud.tsinghua.edu.cn/d/319762ec478d46c8bdf7/)から重みをダウンロードし、`ckpt/*`に抽出します。
+ ```
+ Unique3D
+ ├──ckpt
+     ├── controlnet-tile/
+     ├── image2normal/
+     ├── img2mvimg/
+     ├── realesrgan-x4.onnx
+     └── v1-inference.yaml
+ ```
+
+ 2. インタラクティブ推論をローカルで実行します。
+ ```bash
+ python app/gradio_local.py --port 7860
+ ```
+
+ ## ComfyUIサポート
+
+ [jtydhr88](https://github.com/jtydhr88)からの[ComfyUI-Unique3D](https://github.com/jtydhr88/ComfyUI-Unique3D)の実装に感謝します!
+
+ ## より良い結果を得るためのヒント
+
+ 1. Unique3Dは入力画像の向きに敏感です。トレーニングデータの分布により、正面を向いた直交画像は常に良い再構築につながります。
+ 2. 遮蔽のある画像は、4つのビューがオブジェクトを完全にカバーできないため、再構築が悪化します。遮蔽の少ない画像は、より良い結果につながります。
+ 3. 可能な限り高解像度の画像を入力として使用してください。
+
+ ## 謝辞
+
+ 以下のリポジトリからコードを大量に借用しました。コードを共有してくれた著者に感謝します。
+ - [Stable Diffusion](https://github.com/CompVis/stable-diffusion)
+ - [Wonder3d](https://github.com/xxlong0/Wonder3D)
+ - [Zero123Plus](https://github.com/SUDO-AI-3D/zero123plus)
+ - [Continuous Remeshing](https://github.com/Profactor/continuous-remeshing)
+ - [Depth from Normals](https://github.com/YertleTurtleGit/depth-from-normals)
+
+ ## コラボレーション
+ 私たちの使命は、3Dの概念を持つ4D生成モデルを作成することです。これは私たちの最初のステップであり、前途はまだ長いですが、私たちは自信を持っています。あらゆる形態の潜在的なコラボレーションを探求し、議論に参加することを心から歓迎します。<span style="color:red">**私たちと連絡を取りたい、またはパートナーシップを結びたい方は、メールでお気軽にお問い合わせください (wkl22@mails.tsinghua.edu.cn)**</span>。
+
+ - 最新情報を入手するには、Twitterをフォローしてください: https://x.com/aiuni_ai
+ - DiscordでAIGC 3D/4D生成コミュニティに参加してください: https://discord.gg/aiuni
+ - 研究協力については、ai@aiuni.aiまでご連絡ください。
+
+ ## 引用
+
+ Unique3Dが役立つと思われる場合は、私たちのレポートを引用してください:
+ ```bibtex
+ @misc{wu2024unique3d,
+     title={Unique3D: High-Quality and Efficient 3D Mesh Generation from a Single Image},
+     author={Kailu Wu and Fangfu Liu and Zhihan Cai and Runjie Yan and Hanyang Wang and Yating Hu and Yueqi Duan and Kaisheng Ma},
+     year={2024},
+     eprint={2405.20343},
+     archivePrefix={arXiv},
+     primaryClass={cs.CV}
+ }
+ ```
README_zh.md ADDED
@@ -0,0 +1,62 @@
+ **其他语言版本 [English](README.md)**
+
+ # Unique3D
+ High-Quality and Efficient 3D Mesh Generation from a Single Image
+
+ [Kailu Wu](https://scholar.google.com/citations?user=VTU0gysAAAAJ&hl=zh-CN&oi=ao), [Fangfu Liu](https://liuff19.github.io/), Zhihan Cai, Runjie Yan, Hanyang Wang, Yating Hu, [Yueqi Duan](https://duanyueqi.github.io/), [Kaisheng Ma](https://group.iiis.tsinghua.edu.cn/~maks/)
+
+ ## [论文](https://arxiv.org/abs/2405.20343) | [项目页面](https://wukailu.github.io/Unique3D/) | [Huggingface Demo](https://huggingface.co/spaces/Wuvin/Unique3D) | [Gradio Demo](http://unique3d.demo.avar.cn/) | [在线演示](https://www.aiuni.ai/)
+
+
+
+ <p align="center">
+ <img src="assets/teaser_safe.jpg">
+ </p>
+
+ Unique3D从单视图图像生成高保真度和多样化纹理的网格,在4090上大约需要30秒。
+
+ ### 推理准备
+
+ #### Linux系统设置
+ ```bash
+ conda create -n unique3d
+ conda activate unique3d
+ pip install -r requirements.txt
+ ```
+
+ #### 交互式推理:运行您的本地gradio演示
+
+ 1. 从 [huggingface spaces](https://huggingface.co/spaces/Wuvin/Unique3D/tree/main/ckpt) 下载或者从[清华云盘](https://cloud.tsinghua.edu.cn/d/319762ec478d46c8bdf7/)下载权重,并将其解压到`ckpt/*`。
+ ```
+ Unique3D
+ ├──ckpt
+     ├── controlnet-tile/
+     ├── image2normal/
+     ├── img2mvimg/
+     ├── realesrgan-x4.onnx
+     └── v1-inference.yaml
+ ```
+
+ 2. 在本地运行交互式推理。
+ ```bash
+ python app/gradio_local.py --port 7860
+ ```
+
+ ## 获取更好结果的提示
+
+ 1. Unique3D对输入图像的朝向非常敏感。由于训练数据的分布,**正交正视图像**通常总是能带来良好的重建。对于人物而言,最好是 A-pose 或者 T-pose,因为目前训练数据很少含有其他类型姿态。
+ 2. 有遮挡的图像会导致更差的重建,因为4个视图无法覆盖完整的对象。遮挡较少的图像会带来更好的结果。
+ 3. 尽可能将高分辨率的图像用作输入。
+
+ ## 致谢
+
+ 我们借用了以下代码库的代码。非常感谢作者们分享他们的代码。
+ - [Stable Diffusion](https://github.com/CompVis/stable-diffusion)
+ - [Wonder3d](https://github.com/xxlong0/Wonder3D)
+ - [Zero123Plus](https://github.com/SUDO-AI-3D/zero123plus)
+ - [Continuous Remeshing](https://github.com/Profactor/continuous-remeshing)
+ - [Depth from Normals](https://github.com/YertleTurtleGit/depth-from-normals)
+
+ ## 合作
+
+ 我们的使命是创建一个具有3D概念的4D生成模型。这只是我们的第一步,前方的道路仍然很长,但我们有信心。我们热情邀请您加入讨论,并探索任何形式的潜在合作。<span style="color:red">**如果您有兴趣联系或与我们合作,欢迎通过电子邮件(wkl22@mails.tsinghua.edu.cn)与我们联系**</span>。
app/__init__.py ADDED
File without changes
app/all_models.py ADDED
@@ -0,0 +1,22 @@
+ import torch
+ from scripts.sd_model_zoo import load_common_sd15_pipe
+ from diffusers import StableDiffusionControlNetImg2ImgPipeline, StableDiffusionPipeline
+
+
+ class MyModelZoo:
+     _pipe_disney_controlnet_lineart_ipadapter_i2i: StableDiffusionControlNetImg2ImgPipeline = None
+
+     base_model = "runwayml/stable-diffusion-v1-5"
+
+     def __init__(self, base_model=None) -> None:
+         if base_model is not None:
+             self.base_model = base_model
+
+     @property
+     def pipe_disney_controlnet_tile_ipadapter_i2i(self):
+         return self._pipe_disney_controlnet_lineart_ipadapter_i2i
+
+     def init_models(self):
+         self._pipe_disney_controlnet_lineart_ipadapter_i2i = load_common_sd15_pipe(base_model=self.base_model, ip_adapter=True, plus_model=False, controlnet="./ckpt/controlnet-tile", pipeline_class=StableDiffusionControlNetImg2ImgPipeline)
+
+ model_zoo = MyModelZoo()
app/custom_models/image2mvimage.yaml ADDED
@@ -0,0 +1,63 @@
+ pretrained_model_name_or_path: "./ckpt/img2mvimg"
+ mixed_precision: "bf16"
+
+ init_config:
+   # enable controls
+   enable_cross_attn_lora: False
+   enable_cross_attn_ip: False
+   enable_self_attn_lora: False
+   enable_self_attn_ref: False
+   enable_multiview_attn: True
+
+   # for cross attention
+   init_cross_attn_lora: False
+   init_cross_attn_ip: False
+   cross_attn_lora_rank: 256 # 0 for not enabled
+   cross_attn_lora_only_kv: False
+   ipadapter_pretrained_name: "h94/IP-Adapter"
+   ipadapter_subfolder_name: "models"
+   ipadapter_weight_name: "ip-adapter_sd15.safetensors"
+   ipadapter_effect_on: "all" # all, first
+
+   # for self attention
+   init_self_attn_lora: False
+   self_attn_lora_rank: 256
+   self_attn_lora_only_kv: False
+
+   # for self attention ref
+   init_self_attn_ref: False
+   self_attn_ref_position: "attn1"
+   self_attn_ref_other_model_name: "lambdalabs/sd-image-variations-diffusers"
+   self_attn_ref_pixel_wise_crosspond: False
+   self_attn_ref_effect_on: "all"
+
+   # for multiview attention
+   init_multiview_attn: True
+   multiview_attn_position: "attn1"
+   use_mv_joint_attn: True
+   num_modalities: 1
+
+   # for unet
+   init_unet_path: "${pretrained_model_name_or_path}"
+   cat_condition: True # cat condition to input
+
+   # for cls embedding
+   init_num_cls_label: 8 # for initialize
+   cls_labels: [0, 1, 2, 3] # for current task
+
+ trainers:
+   - trainer_type: "image2mvimage_trainer"
+     trainer:
+       pretrained_model_name_or_path: "${pretrained_model_name_or_path}"
+       attn_config:
+         cls_labels: [0, 1, 2, 3] # for current task
+         enable_cross_attn_lora: False
+         enable_cross_attn_ip: False
+         enable_self_attn_lora: False
+         enable_self_attn_ref: False
+         enable_multiview_attn: True
+       resolution: "256"
+       condition_image_resolution: "256"
+       normal_cls_offset: 4
+       condition_image_column_name: "conditioning_image"
+       image_column_name: "image"
app/custom_models/image2normal.yaml ADDED
@@ -0,0 +1,61 @@
+ pretrained_model_name_or_path: "lambdalabs/sd-image-variations-diffusers"
+ mixed_precision: "bf16"
+
+ init_config:
+   # enable controls
+   enable_cross_attn_lora: False
+   enable_cross_attn_ip: False
+   enable_self_attn_lora: False
+   enable_self_attn_ref: True
+   enable_multiview_attn: False
+
+   # for cross attention
+   init_cross_attn_lora: False
+   init_cross_attn_ip: False
+   cross_attn_lora_rank: 512 # 0 for not enabled
+   cross_attn_lora_only_kv: False
+   ipadapter_pretrained_name: "h94/IP-Adapter"
+   ipadapter_subfolder_name: "models"
+   ipadapter_weight_name: "ip-adapter_sd15.safetensors"
+   ipadapter_effect_on: "all" # all, first
+
+   # for self attention
+   init_self_attn_lora: False
+   self_attn_lora_rank: 512
+   self_attn_lora_only_kv: False
+
+   # for self attention ref
+   init_self_attn_ref: True
+   self_attn_ref_position: "attn1"
+   self_attn_ref_other_model_name: "lambdalabs/sd-image-variations-diffusers"
+   self_attn_ref_pixel_wise_crosspond: True
+   self_attn_ref_effect_on: "all"
+
+   # for multiview attention
+   init_multiview_attn: False
+   multiview_attn_position: "attn1"
+   num_modalities: 1
+
+   # for unet
+   init_unet_path: "${pretrained_model_name_or_path}"
+   init_num_cls_label: 0 # for initialize
+   cls_labels: [] # for current task
+
+ trainers:
+   - trainer_type: "image2image_trainer"
+     trainer:
+       pretrained_model_name_or_path: "${pretrained_model_name_or_path}"
+       attn_config:
+         cls_labels: [] # for current task
+         enable_cross_attn_lora: False
+         enable_cross_attn_ip: False
+         enable_self_attn_lora: False
+         enable_self_attn_ref: True
+         enable_multiview_attn: False
+       resolution: "512"
+       condition_image_resolution: "512"
+       condition_image_column_name: "conditioning_image"
+       image_column_name: "image"
+
+
+
app/custom_models/mvimg_prediction.py ADDED
@@ -0,0 +1,57 @@
+ import sys
+ import torch
+ import gradio as gr
+ from PIL import Image
+ import numpy as np
+ from rembg import remove
+ from app.utils import change_rgba_bg, rgba_to_rgb
+ from app.custom_models.utils import load_pipeline
+ from scripts.all_typing import *
+ from scripts.utils import session, simple_preprocess
+
+ training_config = "app/custom_models/image2mvimage.yaml"
+ checkpoint_path = "ckpt/img2mvimg/unet_state_dict.pth"
+ trainer, pipeline = load_pipeline(training_config, checkpoint_path)
+ # pipeline.enable_model_cpu_offload()
+
+ def predict(img_list: List[Image.Image], guidance_scale=2., **kwargs):
+     if isinstance(img_list, Image.Image):
+         img_list = [img_list]
+     img_list = [rgba_to_rgb(i) if i.mode == 'RGBA' else i for i in img_list]
+     ret = []
+     for img in img_list:
+         images = trainer.pipeline_forward(
+             pipeline=pipeline,
+             image=img,
+             guidance_scale=guidance_scale,
+             **kwargs
+         ).images
+         ret.extend(images)
+     return ret
+
+
+ def run_mvprediction(input_image: Image.Image, remove_bg=True, guidance_scale=1.5, seed=1145):
+     if input_image.mode == 'RGB' or np.array(input_image)[..., -1].mean() == 255.:
+         # still do remove using rembg, since simple_preprocess requires RGBA image
+         print("RGB image not RGBA! still remove bg!")
+         remove_bg = True
+
+     if remove_bg:
+         input_image = remove(input_image, session=session)
+
+     # make front_pil RGBA with white bg
+     input_image = change_rgba_bg(input_image, "white")
+     single_image = simple_preprocess(input_image)
+
+     generator = torch.Generator(device="cuda").manual_seed(int(seed)) if seed >= 0 else None
+
+     rgb_pils = predict(
+         single_image,
+         generator=generator,
+         guidance_scale=guidance_scale,
+         width=256,
+         height=256,
+         num_inference_steps=30,
+     )
+
+     return rgb_pils, single_image
app/custom_models/normal_prediction.py ADDED
@@ -0,0 +1,26 @@
+ import sys
+ from PIL import Image
+ from app.utils import rgba_to_rgb, simple_remove
+ from app.custom_models.utils import load_pipeline
+ from scripts.utils import rotate_normals_torch
+ from scripts.all_typing import *
+
+ training_config = "app/custom_models/image2normal.yaml"
+ checkpoint_path = "ckpt/image2normal/unet_state_dict.pth"
+ trainer, pipeline = load_pipeline(training_config, checkpoint_path)
+ # pipeline.enable_model_cpu_offload()
+
+ def predict_normals(image: List[Image.Image], guidance_scale=2., do_rotate=True, num_inference_steps=30, **kwargs):
+     img_list = image if isinstance(image, list) else [image]
+     img_list = [rgba_to_rgb(i) if i.mode == 'RGBA' else i for i in img_list]
+     images = trainer.pipeline_forward(
+         pipeline=pipeline,
+         image=img_list,
+         num_inference_steps=num_inference_steps,
+         guidance_scale=guidance_scale,
+         **kwargs
+     ).images
+     images = simple_remove(images)
+     if do_rotate and len(images) > 1:
+         images = rotate_normals_torch(images, return_types='pil')
+     return images
app/custom_models/utils.py ADDED
@@ -0,0 +1,75 @@
+ import torch
+ from typing import List
+ from dataclasses import dataclass
+ from app.utils import rgba_to_rgb
+ from custum_3d_diffusion.trainings.config_classes import ExprimentConfig, TrainerSubConfig
+ from custum_3d_diffusion import modules
+ from custum_3d_diffusion.custum_modules.unifield_processor import AttnConfig, ConfigurableUNet2DConditionModel
+ from custum_3d_diffusion.trainings.base import BasicTrainer
+ from custum_3d_diffusion.trainings.utils import load_config
+
+
+ @dataclass
+ class FakeAccelerator:
+     device: torch.device = torch.device("cuda")
+
+
+ def init_trainers(cfg_path: str, weight_dtype: torch.dtype, extras: dict):
+     accelerator = FakeAccelerator()
+     cfg: ExprimentConfig = load_config(ExprimentConfig, cfg_path, extras)
+     init_config: AttnConfig = load_config(AttnConfig, cfg.init_config)
+     configurable_unet = ConfigurableUNet2DConditionModel(init_config, weight_dtype)
+     configurable_unet.enable_xformers_memory_efficient_attention()
+     trainer_cfgs: List[TrainerSubConfig] = [load_config(TrainerSubConfig, trainer) for trainer in cfg.trainers]
+     trainers: List[BasicTrainer] = [modules.find(trainer.trainer_type)(accelerator, None, configurable_unet, trainer.trainer, weight_dtype, i) for i, trainer in enumerate(trainer_cfgs)]
+     return trainers, configurable_unet
+
+ from app.utils import make_image_grid, split_image
+ def process_image(function, img, guidance_scale=2., merged_image=False, remove_bg=True):
+     from rembg import remove
+     if remove_bg:
+         img = remove(img)
+     img = rgba_to_rgb(img)
+     if merged_image:
+         img = split_image(img, rows=2)
+     images = function(
+         image=img,
+         guidance_scale=guidance_scale,
+     )
+     if len(images) > 1:
+         return make_image_grid(images, rows=2)
+     else:
+         return images[0]
+
+
+ def process_text(trainer, pipeline, img, guidance_scale=2.):
+     pipeline.cfg.validation_prompts = [img]
+     titles, images = trainer.batched_validation_forward(pipeline, guidance_scale=[guidance_scale])
+     return images[0]
+
+
+ def load_pipeline(config_path, ckpt_path, pipeline_filter=lambda x: True, weight_dtype = torch.bfloat16):
+     training_config = config_path
+     load_from_checkpoint = ckpt_path
+     extras = []
+     device = "cuda"
+     trainers, configurable_unet = init_trainers(training_config, weight_dtype, extras)
+     shared_modules = dict()
+     for trainer in trainers:
+         shared_modules = trainer.init_shared_modules(shared_modules)
+
+     if load_from_checkpoint is not None:
+         state_dict = torch.load(load_from_checkpoint)
+         configurable_unet.unet.load_state_dict(state_dict, strict=False)
+     # Move unet, vae and text_encoder to device and cast to weight_dtype
+     configurable_unet.unet.to(device, dtype=weight_dtype)
+
+     pipeline = None
+     trainer_out = None
+     for trainer in trainers:
+         if pipeline_filter(trainer.cfg.trainer_name):
+             pipeline = trainer.construct_pipeline(shared_modules, configurable_unet.unet)
+             pipeline.set_progress_bar_config(disable=False)
+             trainer_out = trainer
+     pipeline = pipeline.to(device)
+     return trainer_out, pipeline
app/examples/Groot.png ADDED
app/examples/aaa.png ADDED
app/examples/abma.png ADDED
app/examples/akun.png ADDED
app/examples/anya.png ADDED
app/examples/bag.png ADDED

Git LFS Details

  • SHA256: ac798ea1f112091c04f5bdfa47c490806fb433a02fe17758aa1f8c55cd64b66e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.54 MB
app/examples/ex1.png ADDED

Git LFS Details

  • SHA256: d49ccccd40fe0317c2886b0d36a11667003d17a49cc49d9244208d250de9fe31
  • Pointer size: 132 Bytes
  • Size of remote file: 1.17 MB
app/examples/ex2.png ADDED
app/examples/ex3.jpg ADDED
app/examples/ex4.png ADDED
app/examples/generated_1715761545_frame0.png ADDED
app/examples/generated_1715762357_frame0.png ADDED
app/examples/generated_1715763329_frame0.png ADDED
app/examples/hatsune_miku.png ADDED
app/examples/princess-large.png ADDED
app/gradio_3dgen.py ADDED
@@ -0,0 +1,71 @@
+ import os
+ import gradio as gr
+ from PIL import Image
+ from pytorch3d.structures import Meshes
+ from app.utils import clean_up
+ from app.custom_models.mvimg_prediction import run_mvprediction
+ from app.custom_models.normal_prediction import predict_normals
+ from scripts.refine_lr_to_sr import run_sr_fast
+ from scripts.utils import save_glb_and_video
+ from scripts.multiview_inference import geo_reconstruct
+
+ def generate3dv2(preview_img, input_processing, seed, render_video=True, do_refine=True, expansion_weight=0.1, init_type="std"):
+     if preview_img is None:
+         raise gr.Error("preview_img is none")
+     if isinstance(preview_img, str):
+         preview_img = Image.open(preview_img)
+
+     if preview_img.size[0] <= 512:
+         preview_img = run_sr_fast([preview_img])[0]
+     rgb_pils, front_pil = run_mvprediction(preview_img, remove_bg=input_processing, seed=int(seed)) # 6s
+     new_meshes = geo_reconstruct(rgb_pils, None, front_pil, do_refine=do_refine, predict_normal=True, expansion_weight=expansion_weight, init_type=init_type)
+     vertices = new_meshes.verts_packed()
+     vertices = vertices / 2 * 1.35
+     vertices[..., [0, 2]] = - vertices[..., [0, 2]]
+     new_meshes = Meshes(verts=[vertices], faces=new_meshes.faces_list(), textures=new_meshes.textures)
+
+     ret_mesh, video = save_glb_and_video("/tmp/gradio/generated", new_meshes, with_timestamp=True, dist=3.5, fov_in_degrees=2 / 1.35, cam_type="ortho", export_video=render_video)
+     return ret_mesh, video
+
+ #######################################
+ def create_ui(concurrency_id="wkl"):
+     with gr.Row():
+         with gr.Column(scale=2):
+             input_image = gr.Image(type='pil', image_mode='RGBA', label='Frontview')
+
+             example_folder = os.path.join(os.path.dirname(__file__), "./examples")
+             example_fns = sorted([os.path.join(example_folder, example) for example in os.listdir(example_folder)])
+             gr.Examples(
+                 examples=example_fns,
+                 inputs=[input_image],
+                 cache_examples=False,
+                 label='Examples (click one of the images below to start)',
+                 examples_per_page=12
+             )
+
+
+         with gr.Column(scale=3):
+             # export mesh display
+             output_mesh = gr.Model3D(value=None, label="Mesh Model", show_label=True, height=320)
+             output_video = gr.Video(label="Preview", show_label=True, show_share_button=True, height=320, visible=False)
+
+             input_processing = gr.Checkbox(
+                 value=True,
+                 label='Remove Background',
+                 visible=True,
+             )
+             do_refine = gr.Checkbox(value=True, label="Refine Multiview Details", visible=False)
+             expansion_weight = gr.Slider(minimum=-1., maximum=1.0, value=0.1, step=0.1, label="Expansion Weight", visible=False)
+             init_type = gr.Dropdown(choices=["std", "thin"], label="Mesh Initialization", value="std", visible=False)
+             setable_seed = gr.Slider(-1, 1000000000, -1, step=1, visible=True, label="Seed")
+             render_video = gr.Checkbox(value=False, visible=False, label="generate video")
+             fullrunv2_btn = gr.Button('Generate 3D', interactive=True)
+
+             fullrunv2_btn.click(
+                 fn = generate3dv2,
+                 inputs=[input_image, input_processing, setable_seed, render_video, do_refine, expansion_weight, init_type],
+                 outputs=[output_mesh, output_video],
+                 concurrency_id=concurrency_id,
+                 api_name="generate3dv2",
+             ).success(clean_up, api_name=False)
+     return input_image
app/gradio_3dgen_steps.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from PIL import Image
3
+
4
+ from app.custom_models.mvimg_prediction import run_mvprediction
5
+ from app.utils import make_image_grid, split_image
6
+ from scripts.utils import save_glb_and_video
7
+
8
+ def concept_to_multiview(preview_img, input_processing, seed, guidance=1.):
9
+ seed = int(seed)
10
+ if preview_img is None:
11
+ raise gr.Error("preview_img is none.")
12
+ if isinstance(preview_img, str):
13
+ preview_img = Image.open(preview_img)
14
+
15
+ rgb_pils, front_pil = run_mvprediction(preview_img, remove_bg=input_processing, seed=seed, guidance_scale=guidance)
16
+ rgb_pil = make_image_grid(rgb_pils, rows=2)
17
+ return rgb_pil, front_pil
18
+
19
+ def concept_to_multiview_ui(concurrency_id="wkl"):
20
+ with gr.Row():
21
+ with gr.Column(scale=2):
22
+ preview_img = gr.Image(type='pil', image_mode='RGBA', label='Frontview')
23
+ input_processing = gr.Checkbox(
24
+ value=True,
25
+ label='Remove Background',
26
+ )
27
+ seed = gr.Slider(minimum=-1, maximum=1000000000, value=-1, step=1.0, label="seed")
28
+ guidance = gr.Slider(minimum=1.0, maximum=5.0, value=1.0, label="Guidance Scale", step=0.5)
29
+ run_btn = gr.Button('Generate Multiview', interactive=True)
30
+ with gr.Column(scale=3):
31
+ # export mesh display
32
+ output_rgb = gr.Image(type='pil', label="RGB", show_label=True)
33
+ output_front = gr.Image(type='pil', image_mode='RGBA', label="Frontview", show_label=True)
34
+ run_btn.click(
35
+ fn = concept_to_multiview,
36
+ inputs=[preview_img, input_processing, seed, guidance],
37
+ outputs=[output_rgb, output_front],
38
+ concurrency_id=concurrency_id,
39
+ api_name=False,
40
+ )
41
+ return output_rgb, output_front
42
+
43
+ from app.custom_models.normal_prediction import predict_normals
44
+ from scripts.multiview_inference import geo_reconstruct
45
+ def multiview_to_mesh_v2(rgb_pil, normal_pil, front_pil, do_refine=False, expansion_weight=0.1, init_type="std"):
46
+ rgb_pils = split_image(rgb_pil, rows=2)
47
+ if normal_pil is not None:
48
+ normal_pil = split_image(normal_pil, rows=2)
49
+ if front_pil is None:
50
+ front_pil = rgb_pils[0]
51
+ new_meshes = geo_reconstruct(rgb_pils, normal_pil, front_pil, do_refine=do_refine, predict_normal=normal_pil is None, expansion_weight=expansion_weight, init_type=init_type)
52
+ ret_mesh, video = save_glb_and_video("/tmp/gradio/generated", new_meshes, with_timestamp=True, dist=3.5, fov_in_degrees=2 / 1.35, cam_type="ortho", export_video=False)
53
+ return ret_mesh
54
+
55
+ def new_multiview_to_mesh_ui(concurrency_id="wkl"):
56
+ with gr.Row():
57
+ with gr.Column(scale=2):
58
+ rgb_pil = gr.Image(type='pil', image_mode='RGB', label='RGB')
59
+ front_pil = gr.Image(type='pil', image_mode='RGBA', label='Frontview(Optinal)')
60
+ normal_pil = gr.Image(type='pil', image_mode='RGBA', label='Normal(Optinal)')
61
+ do_refine = gr.Checkbox(
62
+ value=False,
63
+ label='Refine rgb',
64
+ visible=False,
65
+ )
66
+ expansion_weight = gr.Slider(minimum=-1.0, maximum=1.0, value=0.1, step=0.1, label="Expansion Weight", visible=False)
67
+ init_type = gr.Dropdown(choices=["std", "thin"], label="Mesh initialization", value="std", visible=False)
68
+ run_btn = gr.Button('Generate 3D', interactive=True)
69
+ with gr.Column(scale=3):
70
+ # export mesh display
71
+ output_mesh = gr.Model3D(value=None, label="mesh model", show_label=True)
72
+ run_btn.click(
73
+ fn = multiview_to_mesh_v2,
74
+ inputs=[rgb_pil, normal_pil, front_pil, do_refine, expansion_weight, init_type],
75
+ outputs=[output_mesh],
76
+ concurrency_id=concurrency_id,
77
+ api_name="multiview_to_mesh",
78
+ )
79
+ return rgb_pil, front_pil, output_mesh
80
+
81
+
82
+ #######################################
83
+ def create_step_ui(concurrency_id="wkl"):
84
+ with gr.Tab(label="3D:concept_to_multiview"):
85
+ concept_to_multiview_ui(concurrency_id)
86
+ with gr.Tab(label="3D:new_multiview_to_mesh"):
87
+ new_multiview_to_mesh_ui(concurrency_id)
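For reference, these step-by-step tabs can also be mounted on their own, following the same pattern as `app/gradio_local.py` below. This is a minimal sketch assuming the repository root is on `sys.path` and that `model_zoo.init_models()` must run before any prediction, as in the main app.
```python
# Minimal sketch: serve only the step-by-step tabs defined in this file.
import gradio as gr

from app.all_models import model_zoo
from app.gradio_3dgen_steps import create_step_ui

if __name__ == "__main__":
    model_zoo.init_models()                   # load the multiview/normal models once
    with gr.Blocks(title="Unique3D step-by-step") as demo:
        create_step_ui(concurrency_id="wkl")  # concept->multiview and multiview->mesh tabs
    demo.queue(default_concurrency_limit=1).launch()
```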
app/gradio_local.py ADDED
@@ -0,0 +1,76 @@
1
+ if __name__ == "__main__":
2
+ import os
3
+ import sys
4
+ sys.path.append(os.curdir)
5
+ if 'CUDA_VISIBLE_DEVICES' not in os.environ:
6
+ os.environ['CUDA_VISIBLE_DEVICES'] = '0'
7
+ os.environ['TRANSFORMERS_OFFLINE']='0'
8
+ os.environ['DIFFUSERS_OFFLINE']='0'
9
+ os.environ['HF_HUB_OFFLINE']='0'
10
+ os.environ['GRADIO_ANALYTICS_ENABLED']='False'
11
+ os.environ['HF_ENDPOINT']='https://hf-mirror.com'
12
+ import torch
13
+ torch.set_float32_matmul_precision('medium')
14
+ torch.backends.cuda.matmul.allow_tf32 = True
15
+ torch.set_grad_enabled(False)
16
+
17
+ import gradio as gr
18
+ import argparse
19
+
20
+ from app.gradio_3dgen import create_ui as create_3d_ui
21
+ # from app.gradio_3dgen_steps import create_step_ui
22
+ from app.all_models import model_zoo
23
+
24
+
25
+ _TITLE = '''Unique3D: High-Quality and Efficient 3D Mesh Generation from a Single Image'''
26
+ _DESCRIPTION = '''
27
+ [Project page](https://wukailu.github.io/Unique3D/)
28
+
29
+ * High-fidelity and diverse textured meshes generated by Unique3D from single-view images.
30
+
31
+ **If the Gradio demo is overcrowded or fails to produce stable results, you can use the online demo [aiuni.ai](https://www.aiuni.ai/), which is free to try (to get a registration invitation code, join the Discord: https://discord.gg/aiuni). Note that the online demo differs slightly from the Gradio demo: inference is slower, but generation is much more stable.**
32
+ '''
33
+
34
+ def launch(
35
+ port,
36
+ listen=False,
37
+ share=False,
38
+ gradio_root="",
39
+ ):
40
+ model_zoo.init_models()
41
+
42
+ with gr.Blocks(
43
+ title=_TITLE,
44
+ theme=gr.themes.Monochrome(),
45
+ ) as demo:
46
+ with gr.Row():
47
+ with gr.Column(scale=1):
48
+ gr.Markdown('# ' + _TITLE)
49
+ gr.Markdown(_DESCRIPTION)
50
+ create_3d_ui("wkl")
51
+
52
+ launch_args = {}
53
+ if listen:
54
+ launch_args["server_name"] = "0.0.0.0"
55
+
56
+ demo.queue(default_concurrency_limit=1).launch(
57
+ server_port=None if port == 0 else port,
58
+ share=share,
59
+ root_path=gradio_root if gradio_root != "" else None, # "/myapp"
60
+ **launch_args,
61
+ )
62
+
63
+ if __name__ == "__main__":
64
+ parser = argparse.ArgumentParser()
65
+ args, extra = parser.parse_known_args()
66
+ parser.add_argument("--listen", action="store_true")
67
+ parser.add_argument("--port", type=int, default=0)
68
+ parser.add_argument("--share", action="store_true")
69
+ parser.add_argument("--gradio_root", default="")
70
+ args = parser.parse_args()
71
+ launch(
72
+ args.port,
73
+ listen=args.listen,
74
+ share=args.share,
75
+ gradio_root=args.gradio_root,
76
+ )
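The script only understands the flags parsed above (`--listen`, `--port`, `--share`, `--gradio_root`), so it is normally launched directly. A small sketch of starting it from Python is shown below, with an arbitrary port.
```python
# Sketch: launch the local Gradio app in a subprocess using the flags defined above.
import subprocess
import sys

subprocess.run(
    [sys.executable, "app/gradio_local.py", "--listen", "--port", "7860"],
    check=True,  # surface a non-zero exit code as an exception
)
```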
app/utils.py ADDED
@@ -0,0 +1,112 @@
1
+ import torch
2
+ import numpy as np
3
+ from PIL import Image
4
+ import gc
8
+ from scripts.refine_lr_to_sr import run_sr_fast
9
+
10
+ GRADIO_CACHE = "/tmp/gradio/"
11
+
12
+ def clean_up():
13
+ torch.cuda.empty_cache()
14
+ gc.collect()
15
+
16
+ def remove_color(arr):
17
+ if arr.shape[-1] == 4:
18
+ arr = arr[..., :3]
19
+ # calc diffs
20
+ base = arr[0, 0]
21
+ diffs = np.abs(arr.astype(np.int32) - base.astype(np.int32)).sum(axis=-1)
22
+ alpha = (diffs <= 80)
23
+
24
+ arr[alpha] = 255
25
+ alpha = ~alpha
26
+ arr = np.concatenate([arr, alpha[..., None].astype(np.int32) * 255], axis=-1)
27
+ return arr
28
+
29
+ def simple_remove(imgs, run_sr=True):
30
+ """Only works for normal"""
31
+ if not isinstance(imgs, list):
32
+ imgs = [imgs]
33
+ single_input = True
34
+ else:
35
+ single_input = False
36
+ if run_sr:
37
+ imgs = run_sr_fast(imgs)
38
+ rets = []
39
+ for img in imgs:
40
+ arr = np.array(img)
41
+ arr = remove_color(arr)
42
+ rets.append(Image.fromarray(arr.astype(np.uint8)))
43
+ if single_input:
44
+ return rets[0]
45
+ return rets
46
+
47
+ def rgba_to_rgb(rgba: Image.Image, bkgd="WHITE"):
48
+ new_image = Image.new("RGBA", rgba.size, bkgd)
49
+ new_image.paste(rgba, (0, 0), rgba)
50
+ new_image = new_image.convert('RGB')
51
+ return new_image
52
+
53
+ def change_rgba_bg(rgba: Image.Image, bkgd="WHITE"):
54
+ rgb_white = rgba_to_rgb(rgba, bkgd)
55
+ new_rgba = Image.fromarray(np.concatenate([np.array(rgb_white), np.array(rgba)[:, :, 3:4]], axis=-1))
56
+ return new_rgba
57
+
58
+ def split_image(image, rows=None, cols=None):
59
+ """
60
+ inverse function of make_image_grid
61
+ """
62
+ # assume each sub-image is square
63
+ if rows is None and cols is None:
64
+ # image.size [W, H]
65
+ rows = 1
66
+ cols = image.size[0] // image.size[1]
67
+ assert cols * image.size[1] == image.size[0]
68
+ subimg_size = image.size[1]
69
+ elif rows is None:
70
+ subimg_size = image.size[0] // cols
71
+ rows = image.size[1] // subimg_size
72
+ assert rows * subimg_size == image.size[1]
73
+ elif cols is None:
74
+ subimg_size = image.size[1] // rows
75
+ cols = image.size[0] // subimg_size
76
+ assert cols * subimg_size == image.size[0]
77
+ else:
78
+ subimg_size = image.size[1] // rows
79
+ assert cols * subimg_size == image.size[0]
80
+ subimgs = []
81
+ for i in range(rows):
82
+ for j in range(cols):
83
+ subimg = image.crop((j*subimg_size, i*subimg_size, (j+1)*subimg_size, (i+1)*subimg_size))
84
+ subimgs.append(subimg)
85
+ return subimgs
86
+
87
+ def make_image_grid(images, rows=None, cols=None, resize=None):
88
+ if rows is None and cols is None:
89
+ rows = 1
90
+ cols = len(images)
91
+ if rows is None:
92
+ rows = len(images) // cols
93
+ if len(images) % cols != 0:
94
+ rows += 1
95
+ if cols is None:
96
+ cols = len(images) // rows
97
+ if len(images) % rows != 0:
98
+ cols += 1
99
+ total_imgs = rows * cols
100
+ if total_imgs > len(images):
101
+ images += [Image.new(images[0].mode, images[0].size) for _ in range(total_imgs - len(images))]
102
+
103
+ if resize is not None:
104
+ images = [img.resize((resize, resize)) for img in images]
105
+
106
+ w, h = images[0].size
107
+ grid = Image.new(images[0].mode, size=(cols * w, rows * h))
108
+
109
+ for i, img in enumerate(images):
110
+ grid.paste(img, box=(i % cols * w, i // cols * h))
111
+ return grid
112
+
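Since `split_image` is documented as the inverse of `make_image_grid`, a quick round-trip check with arbitrary 64x64 tiles looks like this:
```python
# Round-trip sketch for make_image_grid / split_image with four dummy tiles.
from PIL import Image

from app.utils import make_image_grid, split_image

tiles = [Image.new("RGB", (64, 64), color) for color in ("red", "green", "blue", "white")]
grid = make_image_grid(tiles, rows=2)   # 2x2 grid, 128x128 pixels
parts = split_image(grid, rows=2)       # back to four 64x64 tiles
assert len(parts) == 4 and parts[0].size == (64, 64)
```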
assets/teaser.jpg ADDED
assets/teaser_safe.jpg ADDED
Git LFS Details
  • SHA256: 5eb9060bc45c1d334f988e8053f1de40cf60df907750dfef89d81cdbe86ffc79
  • Pointer size: 132 Bytes
  • Size of remote file: 2.82 MB
custum_3d_diffusion/custum_modules/attention_processors.py ADDED
@@ -0,0 +1,385 @@
1
+ from typing import Any, Dict, Optional
2
+ import torch
3
+ from diffusers.models.attention_processor import Attention
4
+
5
+ def construct_pix2pix_attention(hidden_states_dim, norm_type="none"):
6
+ if norm_type == "layernorm":
7
+ norm = torch.nn.LayerNorm(hidden_states_dim)
8
+ else:
9
+ norm = torch.nn.Identity()
10
+ attention = Attention(
11
+ query_dim=hidden_states_dim,
12
+ heads=8,
13
+ dim_head=hidden_states_dim // 8,
14
+ bias=True,
15
+ )
16
+ # NOTE: xformers 0.22 does not support batchsize >= 4096
17
+ attention.xformers_not_supported = True # hacky solution
18
+ return norm, attention
19
+
20
+ class ExtraAttnProc(torch.nn.Module):
21
+ def __init__(
22
+ self,
23
+ chained_proc,
24
+ enabled=False,
25
+ name=None,
26
+ mode='extract',
27
+ with_proj_in=False,
28
+ proj_in_dim=768,
29
+ target_dim=None,
30
+ pixel_wise_crosspond=False,
31
+ norm_type="none", # none or layernorm
32
+ crosspond_effect_on="all", # all or first
33
+ crosspond_chain_pos="parralle", # before or parralle or after
34
+ simple_3d=False,
35
+ views=4,
36
+ ) -> None:
37
+ super().__init__()
38
+ self.enabled = enabled
39
+ self.chained_proc = chained_proc
40
+ self.name = name
41
+ self.mode = mode
42
+ self.with_proj_in=with_proj_in
43
+ self.proj_in_dim = proj_in_dim
44
+ self.target_dim = target_dim or proj_in_dim
45
+ self.hidden_states_dim = self.target_dim
46
+ self.pixel_wise_crosspond = pixel_wise_crosspond
47
+ self.crosspond_effect_on = crosspond_effect_on
48
+ self.crosspond_chain_pos = crosspond_chain_pos
49
+ self.views = views
50
+ self.simple_3d = simple_3d
51
+ if self.with_proj_in and self.enabled:
52
+ self.in_linear = torch.nn.Linear(self.proj_in_dim, self.target_dim, bias=False)
53
+ if self.target_dim == self.proj_in_dim:
54
+ self.in_linear.weight.data = torch.eye(proj_in_dim)
55
+ else:
56
+ self.in_linear = None
57
+ if self.pixel_wise_crosspond and self.enabled:
58
+ self.crosspond_norm, self.crosspond_attention = construct_pix2pix_attention(self.hidden_states_dim, norm_type=norm_type)
59
+
60
+ def do_crosspond_attention(self, hidden_states: torch.FloatTensor, other_states: torch.FloatTensor):
61
+ hidden_states = self.crosspond_norm(hidden_states)
62
+
63
+ batch, L, D = hidden_states.shape
64
+ assert hidden_states.shape == other_states.shape, f"got {hidden_states.shape} and {other_states.shape}"
65
+ # to -> batch * L, 1, D
66
+ hidden_states = hidden_states.reshape(batch * L, 1, D)
67
+ other_states = other_states.reshape(batch * L, 1, D)
68
+ hidden_states_catted = other_states
69
+ hidden_states = self.crosspond_attention(
70
+ hidden_states,
71
+ encoder_hidden_states=hidden_states_catted,
72
+ )
73
+ return hidden_states.reshape(batch, L, D)
74
+
75
+ def __call__(
76
+ self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None,
77
+ ref_dict: dict = None, mode=None, **kwargs
78
+ ) -> Any:
79
+ if not self.enabled:
80
+ return self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)
81
+ if encoder_hidden_states is None:
82
+ encoder_hidden_states = hidden_states
83
+ assert ref_dict is not None
84
+ if (mode or self.mode) == 'extract':
85
+ ref_dict[self.name] = hidden_states
86
+ hidden_states1 = self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)
87
+ if self.pixel_wise_crosspond and self.crosspond_chain_pos == "after":
88
+ ref_dict[self.name] = hidden_states1
89
+ return hidden_states1
90
+ elif (mode or self.mode) == 'inject':
91
+ ref_state = ref_dict.pop(self.name)
92
+ if self.with_proj_in:
93
+ ref_state = self.in_linear(ref_state)
94
+
95
+ B, L, D = ref_state.shape
96
+ if hidden_states.shape[0] == B:
97
+ modalities = 1
98
+ views = 1
99
+ else:
100
+ modalities = hidden_states.shape[0] // B // self.views
101
+ views = self.views
102
+ if self.pixel_wise_crosspond:
103
+ if self.crosspond_effect_on == "all":
104
+ ref_state = ref_state[:, None].expand(-1, modalities * views, -1, -1).reshape(-1, *ref_state.shape[-2:])
105
+
106
+ if self.crosspond_chain_pos == "before":
107
+ hidden_states = hidden_states + self.do_crosspond_attention(hidden_states, ref_state)
108
+
109
+ hidden_states1 = self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)
110
+
111
+ if self.crosspond_chain_pos == "parralle":
112
+ hidden_states1 = hidden_states1 + self.do_crosspond_attention(hidden_states, ref_state)
113
+
114
+ if self.crosspond_chain_pos == "after":
115
+ hidden_states1 = hidden_states1 + self.do_crosspond_attention(hidden_states1, ref_state)
116
+ return hidden_states1
117
+ else:
118
+ assert self.crosspond_effect_on == "first"
119
+ # hidden_states [B * modalities * views, L, D]
120
+ # ref_state [B, L, D]
121
+ ref_state = ref_state[:, None].expand(-1, modalities, -1, -1).reshape(-1, ref_state.shape[-2], ref_state.shape[-1]) # [B * modalities, L, D]
122
+
123
+ def do_paritial_crosspond(hidden_states, ref_state):
124
+ first_view_hidden_states = hidden_states.view(-1, views, hidden_states.shape[1], hidden_states.shape[2])[:, 0] # [B * modalities, L, D]
125
+ hidden_states2 = self.do_crosspond_attention(first_view_hidden_states, ref_state) # [B * modalities, L, D]
126
+ hidden_states2_padded = torch.zeros_like(hidden_states).reshape(-1, views, hidden_states.shape[1], hidden_states.shape[2])
127
+ hidden_states2_padded[:, 0] = hidden_states2
128
+ hidden_states2_padded = hidden_states2_padded.reshape(-1, hidden_states.shape[1], hidden_states.shape[2])
129
+ return hidden_states2_padded
130
+
131
+ if self.crosspond_chain_pos == "before":
132
+ hidden_states = hidden_states + do_paritial_crosspond(hidden_states, ref_state)
133
+
134
+ hidden_states1 = self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs) # [B * modalities * views, L, D]
135
+ if self.crosspond_chain_pos == "parralle":
136
+ hidden_states1 = hidden_states1 + do_paritial_crosspond(hidden_states, ref_state)
137
+ if self.crosspond_chain_pos == "after":
138
+ hidden_states1 = hidden_states1 + do_paritial_crosspond(hidden_states1, ref_state)
139
+ return hidden_states1
140
+ elif self.simple_3d:
141
+ B, L, C = encoder_hidden_states.shape
142
+ mv = self.views
143
+ encoder_hidden_states = encoder_hidden_states.reshape(B // mv, mv, L, C)
144
+ ref_state = ref_state[:, None]
145
+ encoder_hidden_states = torch.cat([encoder_hidden_states, ref_state], dim=1)
146
+ encoder_hidden_states = encoder_hidden_states.reshape(B // mv, 1, (mv+1) * L, C)
147
+ encoder_hidden_states = encoder_hidden_states.repeat(1, mv, 1, 1).reshape(-1, (mv+1) * L, C)
148
+ return self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)
149
+ else:
150
+ ref_state = ref_state[:, None].expand(-1, modalities * views, -1, -1).reshape(-1, ref_state.shape[-2], ref_state.shape[-1])
151
+ encoder_hidden_states = torch.cat([encoder_hidden_states, ref_state], dim=1)
152
+ return self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)
153
+ else:
154
+ raise NotImplementedError("mode or self.mode is required to be 'extract' or 'inject'")
155
+
156
+ def add_extra_processor(model: torch.nn.Module, enable_filter=lambda x:True, **kwargs):
157
+ return_dict = torch.nn.ModuleDict()
158
+ proj_in_dim = kwargs.get('proj_in_dim', False)
159
+ kwargs.pop('proj_in_dim', None)
160
+
161
+ def recursive_add_processors(name: str, module: torch.nn.Module):
162
+ for sub_name, child in module.named_children():
163
+ if "ref_unet" not in (sub_name + name):
164
+ recursive_add_processors(f"{name}.{sub_name}", child)
165
+
166
+ if isinstance(module, Attention):
167
+ new_processor = ExtraAttnProc(
168
+ chained_proc=module.get_processor(),
169
+ enabled=enable_filter(f"{name}.processor"),
170
+ name=f"{name}.processor",
171
+ proj_in_dim=proj_in_dim if proj_in_dim else module.cross_attention_dim,
172
+ target_dim=module.cross_attention_dim,
173
+ **kwargs
174
+ )
175
+ module.set_processor(new_processor)
176
+ return_dict[f"{name}.processor".replace(".", "__")] = new_processor
177
+
178
+ for name, module in model.named_children():
179
+ recursive_add_processors(name, module)
180
+ return return_dict
181
+
182
+ def switch_extra_processor(model, enable_filter=lambda x:True):
183
+ def recursive_add_processors(name: str, module: torch.nn.Module):
184
+ for sub_name, child in module.named_children():
185
+ recursive_add_processors(f"{name}.{sub_name}", child)
186
+
187
+ if isinstance(module, ExtraAttnProc):
188
+ module.enabled = enable_filter(name)
189
+
190
+ for name, module in model.named_children():
191
+ recursive_add_processors(name, module)
192
+
193
+ class multiviewAttnProc(torch.nn.Module):
194
+ def __init__(
195
+ self,
196
+ chained_proc,
197
+ enabled=False,
198
+ name=None,
199
+ hidden_states_dim=None,
200
+ chain_pos="parralle", # before or parralle or after
201
+ num_modalities=1,
202
+ views=4,
203
+ base_img_size=64,
204
+ ) -> None:
205
+ super().__init__()
206
+ self.enabled = enabled
207
+ self.chained_proc = chained_proc
208
+ self.name = name
209
+ self.hidden_states_dim = hidden_states_dim
210
+ self.num_modalities = num_modalities
211
+ self.views = views
212
+ self.base_img_size = base_img_size
213
+ self.chain_pos = chain_pos
214
+ self.diff_joint_attn = True
215
+
216
+ def __call__(
217
+ self,
218
+ attn: Attention,
219
+ hidden_states: torch.FloatTensor,
220
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
221
+ attention_mask: Optional[torch.FloatTensor] = None,
222
+ **kwargs
223
+ ) -> torch.Tensor:
224
+ if not self.enabled:
225
+ return self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)
226
+
227
+ B, L, C = hidden_states.shape
228
+ mv = self.views
229
+ hidden_states = hidden_states.reshape(B // mv, mv, L, C).reshape(-1, mv * L, C)
230
+ hidden_states = self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)
231
+ return hidden_states.reshape(B // mv, mv, L, C).reshape(-1, L, C)
232
+
233
+ def add_multiview_processor(model: torch.nn.Module, enable_filter=lambda x:True, **kwargs):
234
+ return_dict = torch.nn.ModuleDict()
235
+ def recursive_add_processors(name: str, module: torch.nn.Module):
236
+ for sub_name, child in module.named_children():
237
+ if "ref_unet" not in (sub_name + name):
238
+ recursive_add_processors(f"{name}.{sub_name}", child)
239
+
240
+ if isinstance(module, Attention):
241
+ new_processor = multiviewAttnProc(
242
+ chained_proc=module.get_processor(),
243
+ enabled=enable_filter(f"{name}.processor"),
244
+ name=f"{name}.processor",
245
+ hidden_states_dim=module.inner_dim,
246
+ **kwargs
247
+ )
248
+ module.set_processor(new_processor)
249
+ return_dict[f"{name}.processor".replace(".", "__")] = new_processor
250
+
251
+ for name, module in model.named_children():
252
+ recursive_add_processors(name, module)
253
+
254
+ return return_dict
255
+
256
+ def switch_multiview_processor(model, enable_filter=lambda x:True):
257
+ def recursive_add_processors(name: str, module: torch.nn.Module):
258
+ for sub_name, child in module.named_children():
259
+ recursive_add_processors(f"{name}.{sub_name}", child)
260
+
261
+ if isinstance(module, Attention):
262
+ processor = module.get_processor()
263
+ if isinstance(processor, multiviewAttnProc):
264
+ processor.enabled = enable_filter(f"{name}.processor")
265
+
266
+ for name, module in model.named_children():
267
+ recursive_add_processors(name, module)
268
+
269
+ class NNModuleWrapper(torch.nn.Module):
270
+ def __init__(self, module):
271
+ super().__init__()
272
+ self.module = module
273
+
274
+ def forward(self, *args, **kwargs):
275
+ return self.module(*args, **kwargs)
276
+
277
+ def __getattr__(self, name: str):
278
+ try:
279
+ return super().__getattr__(name)
280
+ except AttributeError:
281
+ return getattr(self.module, name)
282
+
283
+ class AttnProcessorSwitch(torch.nn.Module):
284
+ def __init__(
285
+ self,
286
+ proc_dict: dict,
287
+ enabled_proc="default",
288
+ name=None,
289
+ switch_name="default_switch",
290
+ ):
291
+ super().__init__()
292
+ self.proc_dict = torch.nn.ModuleDict({k: (v if isinstance(v, torch.nn.Module) else NNModuleWrapper(v)) for k, v in proc_dict.items()})
293
+ self.enabled_proc = enabled_proc
294
+ self.name = name
295
+ self.switch_name = switch_name
296
+ self.choose_module(enabled_proc)
297
+
298
+ def choose_module(self, enabled_proc):
299
+ self.enabled_proc = enabled_proc
300
+ assert enabled_proc in self.proc_dict.keys()
301
+
302
+ def __call__(
303
+ self,
304
+ *args,
305
+ **kwargs
306
+ ) -> torch.FloatTensor:
307
+ used_proc = self.proc_dict[self.enabled_proc]
308
+ return used_proc(*args, **kwargs)
309
+
310
+ def add_switch(model: torch.nn.Module, module_filter=lambda x:True, switch_dict_fn=lambda x: {"default": x}, switch_name="default_switch", enabled_proc="default"):
311
+ return_dict = torch.nn.ModuleDict()
312
+ def recursive_add_processors(name: str, module: torch.nn.Module):
313
+ for sub_name, child in module.named_children():
314
+ if "ref_unet" not in (sub_name + name):
315
+ recursive_add_processors(f"{name}.{sub_name}", child)
316
+
317
+ if isinstance(module, Attention):
318
+ processor = module.get_processor()
319
+ if module_filter(processor):
320
+ proc_dict = switch_dict_fn(processor)
321
+ new_processor = AttnProcessorSwitch(
322
+ proc_dict=proc_dict,
323
+ enabled_proc=enabled_proc,
324
+ name=f"{name}.processor",
325
+ switch_name=switch_name,
326
+ )
327
+ module.set_processor(new_processor)
328
+ return_dict[f"{name}.processor".replace(".", "__")] = new_processor
329
+
330
+ for name, module in model.named_children():
331
+ recursive_add_processors(name, module)
332
+
333
+ return return_dict
334
+
335
+ def change_switch(model: torch.nn.Module, switch_name="default_switch", enabled_proc="default"):
336
+ def recursive_change_processors(name: str, module: torch.nn.Module):
337
+ for sub_name, child in module.named_children():
338
+ recursive_change_processors(f"{name}.{sub_name}", child)
339
+
340
+ if isinstance(module, Attention):
341
+ processor = module.get_processor()
342
+ if isinstance(processor, AttnProcessorSwitch) and processor.switch_name == switch_name:
343
+ processor.choose_module(enabled_proc)
344
+
345
+ for name, module in model.named_children():
346
+ recursive_change_processors(name, module)
347
+
348
+ ########## Hack: Attention fix #############
349
+ from diffusers.models.attention import Attention
350
+
351
+ def forward(
352
+ self,
353
+ hidden_states: torch.FloatTensor,
354
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
355
+ attention_mask: Optional[torch.FloatTensor] = None,
356
+ **cross_attention_kwargs,
357
+ ) -> torch.Tensor:
358
+ r"""
359
+ The forward method of the `Attention` class.
360
+
361
+ Args:
362
+ hidden_states (`torch.Tensor`):
363
+ The hidden states of the query.
364
+ encoder_hidden_states (`torch.Tensor`, *optional*):
365
+ The hidden states of the encoder.
366
+ attention_mask (`torch.Tensor`, *optional*):
367
+ The attention mask to use. If `None`, no mask is applied.
368
+ **cross_attention_kwargs:
369
+ Additional keyword arguments to pass along to the cross attention.
370
+
371
+ Returns:
372
+ `torch.Tensor`: The output of the attention layer.
373
+ """
374
+ # The `Attention` class can call different attention processors / attention functions
375
+ # here we simply pass along all tensors to the selected processor class
376
+ # For standard processors that are defined here, `**cross_attention_kwargs` is empty
377
+ return self.processor(
378
+ self,
379
+ hidden_states,
380
+ encoder_hidden_states=encoder_hidden_states,
381
+ attention_mask=attention_mask,
382
+ **cross_attention_kwargs,
383
+ )
384
+
385
+ Attention.forward = forward
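To make the extract/inject flow of `ExtraAttnProc` (and the patched `Attention.forward` above, which forwards `ref_dict`/`mode` to the processor) concrete, here is a toy sketch with made-up dimensions; it is illustrative only and not part of the training or inference code.
```python
# Toy sketch of reference-attention extract/inject with ExtraAttnProc.
# Dimensions (dim=64, seq_len=16, views=4) are arbitrary; importing the module
# also applies the Attention.forward patch defined at the end of this file.
import torch
from diffusers.models.attention_processor import Attention, AttnProcessor

from custum_3d_diffusion.custum_modules.attention_processors import ExtraAttnProc

dim, seq_len, views = 64, 16, 4
attn = Attention(query_dim=dim, heads=4, dim_head=dim // 4)
attn.set_processor(ExtraAttnProc(chained_proc=AttnProcessor(), enabled=True,
                                 name="demo", proj_in_dim=dim, target_dim=dim, views=views))

ref_dict = {}
cond = torch.randn(2, seq_len, dim)                  # reference branch, batch of 2
_ = attn(cond, ref_dict=ref_dict, mode="extract")    # stash hidden states under "demo"

mv = torch.randn(2 * views, seq_len, dim)            # multiview branch, 4 views per sample
out = attn(mv, ref_dict=ref_dict, mode="inject")     # reference states appended to keys/values
print(out.shape)                                     # torch.Size([8, 16, 64])
```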
custum_3d_diffusion/custum_modules/unifield_processor.py ADDED
@@ -0,0 +1,460 @@
1
+ from types import FunctionType
2
+ from typing import Any, Dict, List
3
+ from diffusers import UNet2DConditionModel
4
+ import torch
5
+ from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel, ImageProjection
6
+ from diffusers.models.attention_processor import Attention, AttnProcessor, AttnProcessor2_0, XFormersAttnProcessor
7
+ from dataclasses import dataclass, field
8
+ from diffusers.loaders import IPAdapterMixin
9
+ from custum_3d_diffusion.custum_modules.attention_processors import add_extra_processor, switch_extra_processor, add_multiview_processor, switch_multiview_processor, add_switch, change_switch
10
+
11
+ @dataclass
12
+ class AttnConfig:
13
+ """
14
+ * CrossAttention: Attention module (inherits knowledge), LoRA module (achieves fine-tuning), IPAdapter module (achieves conceptual control).
15
+ * SelfAttention: Attention module (inherits knowledge), LoRA module (achieves fine-tuning), Reference Attention module (achieves pixel-level control).
16
+ * Multiview Attention module: Multiview Attention module (achieves multi-view consistency).
17
+ * Cross Modality Attention module: Cross Modality Attention module (achieves multi-modality consistency).
18
+
19
+ For setups:
20
+ train_xxx_lr is implemented in the U-Net architecture.
21
+ enable_xxx_lora is implemented in the U-Net architecture.
22
+ enable_xxx_ip is implemented in the processor and U-Net architecture.
23
+ enable_xxx_ref_proj_in is implemented in the processor.
24
+ """
25
+ latent_size: int = 64
26
+
27
+ train_lr: float = 0
28
+ # for cross attention
29
+ # 0 learning rate for not training
30
+ train_cross_attn_lr: float = 0
31
+ train_cross_attn_lora_lr: float = 0
32
+ train_cross_attn_ip_lr: float = 0 # 0 for not trained
33
+ init_cross_attn_lora: bool = False
34
+ enable_cross_attn_lora: bool = False
35
+ init_cross_attn_ip: bool = False
36
+ enable_cross_attn_ip: bool = False
37
+ cross_attn_lora_rank: int = 64 # 0 for not enabled
38
+ cross_attn_lora_only_kv: bool = False
39
+ ipadapter_pretrained_name: str = "h94/IP-Adapter"
40
+ ipadapter_subfolder_name: str = "models"
41
+ ipadapter_weight_name: str = "ip-adapter-plus_sd15.safetensors"
42
+ ipadapter_effect_on: str = "all" # all, first
43
+
44
+ # for self attention
45
+ train_self_attn_lr: float = 0
46
+ train_self_attn_lora_lr: float = 0
47
+ init_self_attn_lora: bool = False
48
+ enable_self_attn_lora: bool = False
49
+ self_attn_lora_rank: int = 64
50
+ self_attn_lora_only_kv: bool = False
51
+
52
+ train_self_attn_ref_lr: float = 0
53
+ train_ref_unet_lr: float = 0
54
+ init_self_attn_ref: bool = False
55
+ enable_self_attn_ref: bool = False
56
+ self_attn_ref_other_model_name: str = ""
57
+ self_attn_ref_position: str = "attn1"
58
+ self_attn_ref_pixel_wise_crosspond: bool = False # enable pixel_wise_crosspond in refattn
59
+ self_attn_ref_chain_pos: str = "parralle" # before or parralle or after
60
+ self_attn_ref_effect_on: str = "all" # all or first, for _crosspond attn
61
+ self_attn_ref_zero_init: bool = True
62
+ use_simple3d_attn: bool = False
63
+
64
+ # for multiview attention
65
+ init_multiview_attn: bool = False
66
+ enable_multiview_attn: bool = False
67
+ multiview_attn_position: str = "attn1"
68
+ multiview_chain_pose: str = "parralle" # before or parralle or after
69
+ num_modalities: int = 1
70
+ use_mv_joint_attn: bool = False
71
+
72
+ # for unet
73
+ init_unet_path: str = "runwayml/stable-diffusion-v1-5"
74
+ init_num_cls_label: int = 0 # for initialize
75
+ cls_labels: List[int] = field(default_factory=lambda: [])
76
+ cls_label_type: str = "embedding"
77
+ cat_condition: bool = False # cat condition to input
78
+
79
+ class Configurable:
80
+ attn_config: AttnConfig
81
+
82
+ def set_config(self, attn_config: AttnConfig):
83
+ raise NotImplementedError()
84
+
85
+ def update_config(self, attn_config: AttnConfig):
86
+ self.attn_config = attn_config
87
+
88
+ def do_set_config(self, attn_config: AttnConfig):
89
+ self.set_config(attn_config)
90
+ for name, module in self.named_modules():
91
+ if isinstance(module, Configurable):
92
+ if hasattr(module, "do_set_config"):
93
+ module.do_set_config(attn_config)
94
+ else:
95
+ print(f"Warning: {name} has no attribute do_set_config, but is an instance of Configurable")
96
+ module.attn_config = attn_config
97
+
98
+ def do_update_config(self, attn_config: AttnConfig):
99
+ self.update_config(attn_config)
100
+ for name, module in self.named_modules():
101
+ if isinstance(module, Configurable):
102
+ if hasattr(module, "do_update_config"):
103
+ module.do_update_config(attn_config)
104
+ else:
105
+ print(f"Warning: {name} has no attribute do_update_config, but is an instance of Configurable")
106
+ module.attn_config = attn_config
107
+
108
+ from diffusers import ModelMixin # Must import ModelMixin for CompiledUNet
109
+ class UnifieldWrappedUNet(UNet2DConditionModel):
110
+ forward_hook: FunctionType
111
+
112
+ def forward(self, *args, **kwargs):
113
+ if hasattr(self, 'forward_hook'):
114
+ return self.forward_hook(super().forward, *args, **kwargs)
115
+ return super().forward(*args, **kwargs)
116
+
117
+
118
+ class ConfigurableUNet2DConditionModel(Configurable, IPAdapterMixin):
119
+ unet: UNet2DConditionModel
120
+
121
+ cls_embedding_param_dict = {}
122
+ cross_attn_lora_param_dict = {}
123
+ self_attn_lora_param_dict = {}
124
+ cross_attn_param_dict = {}
125
+ self_attn_param_dict = {}
126
+ ipadapter_param_dict = {}
127
+ ref_attn_param_dict = {}
128
+ ref_unet_param_dict = {}
129
+ multiview_attn_param_dict = {}
130
+ other_param_dict = {}
131
+
132
+ rev_param_name_mapping = {}
133
+
134
+ class_labels = []
135
+ def set_class_labels(self, class_labels: torch.Tensor):
136
+ if self.attn_config.init_num_cls_label != 0:
137
+ self.class_labels = class_labels.to(self.unet.device).long()
138
+
139
+ def __init__(self, init_config: AttnConfig, weight_dtype) -> None:
140
+ super().__init__()
141
+ self.weight_dtype = weight_dtype
142
+ self.set_config(init_config)
143
+
144
+ def enable_xformers_memory_efficient_attention(self):
145
+ self.unet.enable_xformers_memory_efficient_attention
146
+ def recursive_add_processors(name: str, module: torch.nn.Module):
147
+ for sub_name, child in module.named_children():
148
+ recursive_add_processors(f"{name}.{sub_name}", child)
149
+
150
+ if isinstance(module, Attention):
151
+ if hasattr(module, 'xformers_not_supported'):
152
+ return
153
+ old_processor = module.get_processor()
154
+ if isinstance(old_processor, (AttnProcessor, AttnProcessor2_0)):
155
+ module.set_use_memory_efficient_attention_xformers(True)
156
+
157
+ for name, module in self.unet.named_children():
158
+ recursive_add_processors(name, module)
159
+
160
+ def __getattr__(self, name: str) -> Any:
161
+ try:
162
+ return super().__getattr__(name)
163
+ except AttributeError:
164
+ return getattr(self.unet, name)
165
+
166
+ # --- for IPAdapterMixin
167
+
168
+ def register_modules(self, **kwargs):
169
+ for name, module in kwargs.items():
170
+ # set models
171
+ setattr(self, name, module)
172
+
173
+ def register_to_config(self, **kwargs):
174
+ pass
175
+
176
+ def unload_ip_adapter(self):
177
+ raise NotImplementedError()
178
+
179
+ # --- for Configurable
180
+
181
+ def get_refunet(self):
182
+ if self.attn_config.self_attn_ref_other_model_name == "self":
183
+ return self.unet
184
+ else:
185
+ return self.unet.ref_unet
186
+
187
+ def set_config(self, attn_config: AttnConfig):
188
+ self.attn_config = attn_config
189
+
190
+ unet_type = UnifieldWrappedUNet
191
+ # class_embed_type = "projection" for 'camera'
192
+ # class_embed_type = None for 'embedding'
193
+ unet_kwargs = {}
194
+ if attn_config.init_num_cls_label > 0:
195
+ if attn_config.cls_label_type == "embedding":
196
+ unet_kwargs = {
197
+ "num_class_embeds": attn_config.init_num_cls_label,
198
+ "device_map": None,
199
+ "low_cpu_mem_usage": False,
200
+ "class_embed_type": None,
201
+ }
202
+ else:
203
+ raise ValueError(f"cls_label_type {attn_config.cls_label_type} is not supported")
204
+
205
+ self.unet: UnifieldWrappedUNet = unet_type.from_pretrained(
206
+ attn_config.init_unet_path, subfolder="unet", torch_dtype=self.weight_dtype,
207
+ ignore_mismatched_sizes=True, # Added this line
208
+ **unet_kwargs
209
+ )
210
+ assert isinstance(self.unet, UnifieldWrappedUNet)
211
+ self.unet.forward_hook = self.unet_forward_hook
212
+
213
+ if self.attn_config.cat_condition:
214
+ # double in_channels
215
+ if self.unet.config.in_channels != 8:
216
+ self.unet.register_to_config(in_channels=self.unet.config.in_channels * 2)
217
+ # widen unet.conv_in: keep the original weights and zero-initialize the extra input channels
218
+ doubled_conv_in = torch.nn.Conv2d(self.unet.conv_in.in_channels * 2, self.unet.conv_in.out_channels, self.unet.conv_in.kernel_size, self.unet.conv_in.stride, self.unet.conv_in.padding)
219
+ doubled_conv_in.weight.data = torch.cat([self.unet.conv_in.weight.data, torch.zeros_like(self.unet.conv_in.weight.data)], dim=1)
220
+ doubled_conv_in.bias.data = self.unet.conv_in.bias.data
221
+ self.unet.conv_in = doubled_conv_in
222
+
223
+ used_param_ids = set()
224
+
225
+ if attn_config.init_cross_attn_lora:
226
+ # setup lora
227
+ from peft import LoraConfig
228
+ from peft.utils import get_peft_model_state_dict
229
+ if attn_config.cross_attn_lora_only_kv:
230
+ target_modules=["attn2.to_k", "attn2.to_v"]
231
+ else:
232
+ target_modules=["attn2.to_k", "attn2.to_q", "attn2.to_v", "attn2.to_out.0"]
233
+ lora_config: LoraConfig = LoraConfig(
234
+ r=attn_config.cross_attn_lora_rank,
235
+ lora_alpha=attn_config.cross_attn_lora_rank,
236
+ init_lora_weights="gaussian",
237
+ target_modules=target_modules,
238
+ )
239
+ adapter_name="cross_attn_lora"
240
+ self.unet.add_adapter(lora_config, adapter_name=adapter_name)
241
+ # update cross_attn_lora_param_dict
242
+ self.cross_attn_lora_param_dict = {id(param): param for name, param in self.unet.named_parameters() if adapter_name in name and id(param) not in used_param_ids}
243
+ used_param_ids.update(self.cross_attn_lora_param_dict.keys())
244
+
245
+ if attn_config.init_self_attn_lora:
246
+ # setup lora
247
+ from peft import LoraConfig
248
+ if attn_config.self_attn_lora_only_kv:
249
+ target_modules=["attn1.to_k", "attn1.to_v"]
250
+ else:
251
+ target_modules=["attn1.to_k", "attn1.to_q", "attn1.to_v", "attn1.to_out.0"]
252
+ lora_config: LoraConfig = LoraConfig(
253
+ r=attn_config.self_attn_lora_rank,
254
+ lora_alpha=attn_config.self_attn_lora_rank,
255
+ init_lora_weights="gaussian",
256
+ target_modules=target_modules,
257
+ )
258
+ adapter_name="self_attn_lora"
259
+ self.unet.add_adapter(lora_config, adapter_name=adapter_name)
260
+ # update cross_self_lora_param_dict
261
+ self.self_attn_lora_param_dict = {id(param): param for name, param in self.unet.named_parameters() if adapter_name in name and id(param) not in used_param_ids}
262
+ used_param_ids.update(self.self_attn_lora_param_dict.keys())
263
+
264
+ if attn_config.init_num_cls_label != 0:
265
+ self.cls_embedding_param_dict = {id(param): param for param in self.unet.class_embedding.parameters()}
266
+ used_param_ids.update(self.cls_embedding_param_dict.keys())
267
+ self.set_class_labels(torch.tensor(attn_config.cls_labels).long())
268
+
269
+ if attn_config.init_cross_attn_ip:
270
+ self.image_encoder = None
271
+ # setup ipadapter
272
+ self.load_ip_adapter(
273
+ attn_config.ipadapter_pretrained_name,
274
+ subfolder=attn_config.ipadapter_subfolder_name,
275
+ weight_name=attn_config.ipadapter_weight_name
276
+ )
277
+ # wrap the IP-Adapter attention processors with a switch
278
+ from diffusers.models.attention_processor import IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0
279
+ add_switch(self.unet, module_filter=lambda x: isinstance(x, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)), switch_dict_fn=lambda x: {"ipadapter": x, "default": XFormersAttnProcessor()}, switch_name="ipadapter_switch", enabled_proc="ipadapter")
280
+ # update ipadapter_param_dict
281
+ # weights are in attention processors and unet.encoder_hid_proj
282
+ self.ipadapter_param_dict = {id(param): param for param in self.unet.encoder_hid_proj.parameters() if id(param) not in used_param_ids}
283
+ used_param_ids.update(self.ipadapter_param_dict.keys())
284
+ print("DEBUG: ipadapter_param_dict len in encoder_hid_proj", len(self.ipadapter_param_dict))
285
+ for name, processor in self.unet.attn_processors.items():
286
+ if hasattr(processor, "to_k_ip"):
287
+ self.ipadapter_param_dict.update({id(param): param for param in processor.parameters()})
288
+ print(f"DEBUG: ipadapter_param_dict len in all", len(self.ipadapter_param_dict))
289
+
290
+ ref_unet = None
291
+ if attn_config.init_self_attn_ref:
292
+ # setup reference attention processor
293
+ if attn_config.self_attn_ref_other_model_name == "self":
294
+ raise NotImplementedError("self reference is not fully implemented")
295
+ else:
296
+ ref_unet: UNet2DConditionModel = UNet2DConditionModel.from_pretrained(
297
+ attn_config.self_attn_ref_other_model_name, subfolder="unet", torch_dtype=self.unet.dtype
298
+ )
299
+ ref_unet.to(self.unet.device)
300
+ if self.attn_config.train_ref_unet_lr == 0:
301
+ ref_unet.eval()
302
+ ref_unet.requires_grad_(False)
303
+ else:
304
+ ref_unet.train()
305
+
306
+ add_extra_processor(
307
+ model=ref_unet,
308
+ enable_filter=lambda name: name.endswith(f"{attn_config.self_attn_ref_position}.processor"),
309
+ mode='extract',
310
+ with_proj_in=False,
311
+ pixel_wise_crosspond=False,
312
+ )
313
+ # NOTE: this requires that cross_attention_dim of the self-attention be the same in both UNets
314
+ processor_dict = add_extra_processor(
315
+ model=self.unet,
316
+ enable_filter=lambda name: name.endswith(f"{attn_config.self_attn_ref_position}.processor"),
317
+ mode='inject',
318
+ with_proj_in=False,
319
+ pixel_wise_crosspond=attn_config.self_attn_ref_pixel_wise_crosspond,
320
+ crosspond_effect_on=attn_config.self_attn_ref_effect_on,
321
+ crosspond_chain_pos=attn_config.self_attn_ref_chain_pos,
322
+ simple_3d=attn_config.use_simple3d_attn,
323
+ )
324
+ self.ref_unet_param_dict = {id(param): param for name, param in ref_unet.named_parameters() if id(param) not in used_param_ids and (attn_config.self_attn_ref_position in name)}
325
+ if attn_config.self_attn_ref_chain_pos != "after":
326
+ # drop untrainable parameters
327
+ for name, param in ref_unet.named_parameters():
328
+ if id(param) in self.ref_unet_param_dict and ('up_blocks.3.attentions.2.transformer_blocks.0.' in name):
329
+ self.ref_unet_param_dict.pop(id(param))
330
+ used_param_ids.update(self.ref_unet_param_dict.keys())
331
+ # update ref_attn_param_dict
332
+ self.ref_attn_param_dict = {id(param): param for name, param in processor_dict.named_parameters() if id(param) not in used_param_ids}
333
+ used_param_ids.update(self.ref_attn_param_dict.keys())
334
+
335
+ if attn_config.init_multiview_attn:
336
+ processor_dict = add_multiview_processor(
337
+ model = self.unet,
338
+ enable_filter = lambda name: name.endswith(f"{attn_config.multiview_attn_position}.processor"),
339
+ num_modalities = attn_config.num_modalities,
340
+ base_img_size = attn_config.latent_size,
341
+ chain_pos = attn_config.multiview_chain_pose,
342
+ )
343
+ # update multiview_attn_param_dict
344
+ self.multiview_attn_param_dict = {id(param): param for name, param in processor_dict.named_parameters() if id(param) not in used_param_ids}
345
+ used_param_ids.update(self.multiview_attn_param_dict.keys())
346
+
347
+ # initialize cross_attn_param_dict parameters
348
+ self.cross_attn_param_dict = {id(param): param for name, param in self.unet.named_parameters() if "attn2" in name and id(param) not in used_param_ids}
349
+ used_param_ids.update(self.cross_attn_param_dict.keys())
350
+
351
+ # initialize self_attn_param_dict parameters
352
+ self.self_attn_param_dict = {id(param): param for name, param in self.unet.named_parameters() if "attn1" in name and id(param) not in used_param_ids}
353
+ used_param_ids.update(self.self_attn_param_dict.keys())
354
+
355
+ # initialize other_param_dict parameters
356
+ self.other_param_dict = {id(param): param for name, param in self.unet.named_parameters() if id(param) not in used_param_ids}
357
+
358
+ if ref_unet is not None:
359
+ self.unet.ref_unet = ref_unet
360
+
361
+ self.rev_param_name_mapping = {id(param): name for name, param in self.unet.named_parameters()}
362
+
363
+ self.update_config(attn_config, force_update=True)
364
+ return self.unet
365
+
366
+ _attn_keys_to_update = ["enable_cross_attn_lora", "enable_cross_attn_ip", "enable_self_attn_lora", "enable_self_attn_ref", "enable_multiview_attn", "cls_labels"]
367
+
368
+ def update_config(self, attn_config: AttnConfig, force_update=False):
369
+ assert isinstance(self.unet, UNet2DConditionModel), "unet must be an instance of UNet2DConditionModel"
370
+
371
+ need_to_update = False
372
+ # update cls_labels
373
+ for key in self._attn_keys_to_update:
374
+ if getattr(self.attn_config, key) != getattr(attn_config, key):
375
+ need_to_update = True
376
+ break
377
+ if not force_update and not need_to_update:
378
+ return
379
+
380
+ self.set_class_labels(torch.tensor(attn_config.cls_labels).long())
381
+
382
+ # setup loras
383
+ if self.attn_config.init_cross_attn_lora or self.attn_config.init_self_attn_lora:
384
+ if attn_config.enable_cross_attn_lora or attn_config.enable_self_attn_lora:
385
+ cross_attn_lora_weight = 1. if attn_config.enable_cross_attn_lora > 0 else 0
386
+ self_attn_lora_weight = 1. if attn_config.enable_self_attn_lora > 0 else 0
387
+ self.unet.set_adapters(["cross_attn_lora", "self_attn_lora"], weights=[cross_attn_lora_weight, self_attn_lora_weight])
388
+ else:
389
+ self.unet.disable_adapters()
390
+
391
+ # setup ipadapter
392
+ if self.attn_config.init_cross_attn_ip:
393
+ if attn_config.enable_cross_attn_ip:
394
+ change_switch(self.unet, "ipadapter_switch", "ipadapter")
395
+ else:
396
+ change_switch(self.unet, "ipadapter_switch", "default")
397
+
398
+ # setup reference attention processor
399
+ if self.attn_config.init_self_attn_ref:
400
+ if attn_config.enable_self_attn_ref:
401
+ switch_extra_processor(self.unet, enable_filter=lambda name: name.endswith(f"{attn_config.self_attn_ref_position}.processor"))
402
+ else:
403
+ switch_extra_processor(self.unet, enable_filter=lambda name: False)
404
+
405
+ # setup multiview attention processor
406
+ if self.attn_config.init_multiview_attn:
407
+ if attn_config.enable_multiview_attn:
408
+ switch_multiview_processor(self.unet, enable_filter=lambda name: name.endswith(f"{attn_config.multiview_attn_position}.processor"))
409
+ else:
410
+ switch_multiview_processor(self.unet, enable_filter=lambda name: False)
411
+
412
+ # update cls_labels
413
+ for key in self._attn_keys_to_update:
414
+ setattr(self.attn_config, key, getattr(attn_config, key))
415
+
416
+ def unet_forward_hook(self, raw_forward, sample: torch.FloatTensor, timestep: torch.Tensor, encoder_hidden_states: torch.Tensor, *args, cross_attention_kwargs=None, condition_latents=None, class_labels=None, noisy_condition_input=False, cond_pixels_clip=None, **kwargs):
417
+ if class_labels is None and len(self.class_labels) > 0:
418
+ class_labels = self.class_labels.repeat(sample.shape[0] // self.class_labels.shape[0]).to(sample.device)
419
+ elif self.attn_config.init_num_cls_label != 0:
420
+ assert class_labels is not None, "class_labels should be passed if self.class_labels is empty and self.attn_config.init_num_cls_label is not 0"
421
+ if class_labels is not None:
422
+ if self.attn_config.cls_label_type == "embedding":
423
+ pass
424
+ else:
425
+ raise ValueError(f"cls_label_type {self.attn_config.cls_label_type} is not supported")
426
+ if self.attn_config.init_self_attn_ref and self.attn_config.enable_self_attn_ref:
427
+ # NOTE: extra step, extract condition
428
+ ref_dict = {}
429
+ ref_unet = self.get_refunet().to(sample.device)
430
+ assert condition_latents is not None
431
+ if self.attn_config.self_attn_ref_other_model_name == "self":
432
+ raise NotImplementedError()
433
+ else:
434
+ with torch.no_grad():
435
+ cond_encoder_hidden_states = encoder_hidden_states.reshape(condition_latents.shape[0], -1, *encoder_hidden_states.shape[1:])[:, 0]
436
+ if timestep.dim() == 0:
437
+ cond_timestep = timestep
438
+ else:
439
+ cond_timestep = timestep.reshape(condition_latents.shape[0], -1)[:, 0]
440
+ ref_unet(condition_latents, cond_timestep, cond_encoder_hidden_states, cross_attention_kwargs=dict(ref_dict=ref_dict))
441
+ # NOTE: extra step, inject condition
442
+ # Predict the noise residual and compute loss
443
+ if cross_attention_kwargs is None:
444
+ cross_attention_kwargs = {}
445
+ cross_attention_kwargs.update(ref_dict=ref_dict, mode='inject')
446
+ elif condition_latents is not None:
447
+ if not hasattr(self, 'condition_latents_raised'):
448
+ print("Warning! condition_latents is not None, but self_attn_ref is not enabled! This warning will only be raised once.")
449
+ self.condition_latents_raised = True
450
+
451
+ if self.attn_config.init_cross_attn_ip:
452
+ raise NotImplementedError()
453
+
454
+ if self.attn_config.cat_condition:
455
+ assert condition_latents is not None
456
+ B = condition_latents.shape[0]
457
+ cat_latents = condition_latents.reshape(B, 1, *condition_latents.shape[1:]).repeat(1, sample.shape[0] // B, 1, 1, 1).reshape(*sample.shape)
458
+ sample = torch.cat([sample, cat_latents], dim=1)
459
+
460
+ return raw_forward(sample, timestep, encoder_hidden_states, *args, cross_attention_kwargs=cross_attention_kwargs, class_labels=class_labels, **kwargs)
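As a rough illustration of how `AttnConfig` and `ConfigurableUNet2DConditionModel` fit together, the sketch below enables class-label embeddings and multiview attention, then toggles the multiview branch off at runtime via `update_config`. The field values are placeholders rather than the trained setup, and constructing the wrapper downloads the base UNet named in `init_unet_path`.
```python
# Illustrative configuration sketch; values are placeholders, not the trained setup.
from dataclasses import replace

import torch

from custum_3d_diffusion.custum_modules.unifield_processor import (
    AttnConfig, ConfigurableUNet2DConditionModel,
)

config = AttnConfig(
    init_unet_path="runwayml/stable-diffusion-v1-5",  # base UNet (dataclass default)
    init_num_cls_label=8,                             # allocate 8 class embeddings
    cls_labels=[0, 1, 2, 3],                          # one label per generated view
    init_multiview_attn=True,
    enable_multiview_attn=True,
    multiview_attn_position="attn1",
)
wrapper = ConfigurableUNet2DConditionModel(config, weight_dtype=torch.float16)

# Runtime switches only touch the enable_* flags and cls_labels:
wrapper.update_config(replace(config, enable_multiview_attn=False))
```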
custum_3d_diffusion/custum_pipeline/unifield_pipeline_img2img.py ADDED
@@ -0,0 +1,298 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # modified by Wuvin
15
+
16
+
17
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+ import torch
21
+
22
+ from diffusers import AutoencoderKL, UNet2DConditionModel, StableDiffusionImageVariationPipeline
23
+ from diffusers.schedulers import KarrasDiffusionSchedulers
24
+ from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker, StableDiffusionPipelineOutput
25
+ from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
26
+ from PIL import Image
27
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
28
+
29
+
30
+
31
+ class StableDiffusionImageCustomPipeline(
32
+ StableDiffusionImageVariationPipeline
33
+ ):
34
+ def __init__(
35
+ self,
36
+ vae: AutoencoderKL,
37
+ image_encoder: CLIPVisionModelWithProjection,
38
+ unet: UNet2DConditionModel,
39
+ scheduler: KarrasDiffusionSchedulers,
40
+ safety_checker: StableDiffusionSafetyChecker,
41
+ feature_extractor: CLIPImageProcessor,
42
+ requires_safety_checker: bool = True,
43
+ latents_offset=None,
44
+ noisy_cond_latents=False,
45
+ ):
46
+ super().__init__(
47
+ vae=vae,
48
+ image_encoder=image_encoder,
49
+ unet=unet,
50
+ scheduler=scheduler,
51
+ safety_checker=safety_checker,
52
+ feature_extractor=feature_extractor,
53
+ requires_safety_checker=requires_safety_checker
54
+ )
55
+ latents_offset = tuple(latents_offset) if latents_offset is not None else None
56
+ self.latents_offset = latents_offset
57
+ if latents_offset is not None:
58
+ self.register_to_config(latents_offset=latents_offset)
59
+ self.noisy_cond_latents = noisy_cond_latents
60
+ self.register_to_config(noisy_cond_latents=noisy_cond_latents)
61
+
62
+ def encode_latents(self, image, device, dtype, height, width):
63
+ # support batchsize > 1
64
+ if isinstance(image, Image.Image):
65
+ image = [image]
66
+ image = [img.convert("RGB") for img in image]
67
+ images = self.image_processor.preprocess(image, height=height, width=width).to(device, dtype=dtype)
68
+ latents = self.vae.encode(images).latent_dist.mode() * self.vae.config.scaling_factor
69
+ if self.latents_offset is not None:
70
+ return latents - torch.tensor(self.latents_offset).to(latents.device)[None, :, None, None]
71
+ else:
72
+ return latents
73
+
74
+ def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance):
75
+ dtype = next(self.image_encoder.parameters()).dtype
76
+
77
+ if not isinstance(image, torch.Tensor):
78
+ image = self.feature_extractor(images=image, return_tensors="pt").pixel_values
79
+
80
+ image = image.to(device=device, dtype=dtype)
81
+ image_embeddings = self.image_encoder(image).image_embeds
82
+ image_embeddings = image_embeddings.unsqueeze(1)
83
+
84
+ # duplicate image embeddings for each generation per prompt, using mps friendly method
85
+ bs_embed, seq_len, _ = image_embeddings.shape
86
+ image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
87
+ image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
88
+
89
+ if do_classifier_free_guidance:
90
+ # NOTE: the same as original code
91
+ negative_prompt_embeds = torch.zeros_like(image_embeddings)
92
+ # For classifier free guidance, we need to do two forward passes.
93
+ # Here we concatenate the unconditional and text embeddings into a single batch
94
+ # to avoid doing two forward passes
95
+ image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
96
+
97
+ return image_embeddings
98
+
99
+ @torch.no_grad()
100
+ def __call__(
101
+ self,
102
+ image: Union[Image.Image, List[Image.Image], torch.FloatTensor],
103
+ height: Optional[int] = 1024,
104
+ width: Optional[int] = 1024,
105
+ height_cond: Optional[int] = 512,
106
+ width_cond: Optional[int] = 512,
107
+ num_inference_steps: int = 50,
108
+ guidance_scale: float = 7.5,
109
+ num_images_per_prompt: Optional[int] = 1,
110
+ eta: float = 0.0,
111
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
112
+ latents: Optional[torch.FloatTensor] = None,
113
+ output_type: Optional[str] = "pil",
114
+ return_dict: bool = True,
115
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
116
+ callback_steps: int = 1,
117
+ upper_left_feature: bool = False,
118
+ ):
119
+ r"""
120
+ The call function to the pipeline for generation.
121
+
122
+ Args:
123
+ image (`Image.Image` or `List[Image.Image]` or `torch.FloatTensor`):
124
+ Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
125
+ [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
126
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
127
+ The height in pixels of the generated image.
128
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
129
+ The width in pixels of the generated image.
130
+ num_inference_steps (`int`, *optional*, defaults to 50):
131
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
132
+ expense of slower inference. This parameter is modulated by `strength`.
133
+ guidance_scale (`float`, *optional*, defaults to 7.5):
134
+ A higher guidance scale value encourages the model to generate images closely linked to the text
135
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
136
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
137
+ The number of images to generate per prompt.
138
+ eta (`float`, *optional*, defaults to 0.0):
139
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
140
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
141
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
142
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
143
+ generation deterministic.
144
+ latents (`torch.FloatTensor`, *optional*):
145
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
146
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
147
+ tensor is generated by sampling using the supplied random `generator`.
148
+ output_type (`str`, *optional*, defaults to `"pil"`):
149
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
150
+ return_dict (`bool`, *optional*, defaults to `True`):
151
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
152
+ plain tuple.
153
+ callback (`Callable`, *optional*):
154
+ A function that calls every `callback_steps` steps during inference. The function is called with the
155
+ following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
156
+ callback_steps (`int`, *optional*, defaults to 1):
157
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
158
+ every step.
159
+
160
+ Returns:
161
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
162
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
163
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
164
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
165
+ "not-safe-for-work" (nsfw) content.
166
+
167
+ Examples:
168
+
169
+ ```py
170
+ from diffusers import StableDiffusionImageVariationPipeline
171
+ from PIL import Image
172
+ from io import BytesIO
173
+ import requests
174
+
175
+ pipe = StableDiffusionImageVariationPipeline.from_pretrained(
176
+ "lambdalabs/sd-image-variations-diffusers", revision="v2.0"
177
+ )
178
+ pipe = pipe.to("cuda")
179
+
180
+ url = "https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200"
181
+
182
+ response = requests.get(url)
183
+ image = Image.open(BytesIO(response.content)).convert("RGB")
184
+
185
+ out = pipe(image, num_images_per_prompt=3, guidance_scale=15)
186
+ out["images"][0].save("result.jpg")
187
+ ```
188
+ """
189
+ # 0. Default height and width to unet
190
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
191
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
192
+
193
+ # 1. Check inputs. Raise error if not correct
194
+ self.check_inputs(image, height, width, callback_steps)
195
+
196
+ # 2. Define call parameters
197
+ if isinstance(image, Image.Image):
198
+ batch_size = 1
199
+ elif isinstance(image, list):
200
+ batch_size = len(image)
201
+ else:
202
+ batch_size = image.shape[0]
203
+ device = self._execution_device
204
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
205
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
206
+ # corresponds to doing no classifier free guidance.
207
+ do_classifier_free_guidance = guidance_scale > 1.0
208
+
209
+ # 3. Encode input image
210
+ if isinstance(image, Image.Image) and upper_left_feature:
211
+ # use only the first (upper-left) of the four tiled views for the image embedding
212
+ emb_image = image.crop((0, 0, image.size[0] // 2, image.size[1] // 2))
213
+ else:
214
+ emb_image = image
215
+
216
+ image_embeddings = self._encode_image(emb_image, device, num_images_per_prompt, do_classifier_free_guidance)
217
+ cond_latents = self.encode_latents(image, image_embeddings.device, image_embeddings.dtype, height_cond, width_cond)
218
+
219
+ # 4. Prepare timesteps
220
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
221
+ timesteps = self.scheduler.timesteps
222
+
223
+ # 5. Prepare latent variables
224
+ num_channels_latents = self.unet.config.out_channels
225
+ latents = self.prepare_latents(
226
+ batch_size * num_images_per_prompt,
227
+ num_channels_latents,
228
+ height,
229
+ width,
230
+ image_embeddings.dtype,
231
+ device,
232
+ generator,
233
+ latents,
234
+ )
235
+
236
+ # 6. Prepare extra step kwargs.
237
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
238
+
239
+ # 7. Denoising loop
240
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
241
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
242
+ for i, t in enumerate(timesteps):
243
+ if self.noisy_cond_latents:
244
+ raise ValueError("Noisy condition latents is not recommended.")
245
+ else:
246
+ noisy_cond_latents = cond_latents
247
+
248
+ noisy_cond_latents = torch.cat([torch.zeros_like(noisy_cond_latents), noisy_cond_latents]) if do_classifier_free_guidance else noisy_cond_latents
249
+ # expand the latents if we are doing classifier free guidance
250
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
251
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
252
+
253
+ # predict the noise residual
254
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings, condition_latents=noisy_cond_latents).sample
255
+
256
+ # perform guidance
257
+ if do_classifier_free_guidance:
258
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
259
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
260
+
261
+ # compute the previous noisy sample x_t -> x_t-1
262
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
263
+
264
+ # call the callback, if provided
265
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
266
+ progress_bar.update()
267
+ if callback is not None and i % callback_steps == 0:
268
+ step_idx = i // getattr(self.scheduler, "order", 1)
269
+ callback(step_idx, t, latents)
270
+
271
+ self.maybe_free_model_hooks()
272
+
273
+ if self.latents_offset is not None:
274
+ latents = latents + torch.tensor(self.latents_offset).to(latents.device)[None, :, None, None]
275
+
276
+ if not output_type == "latent":
277
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
278
+ image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
279
+ else:
280
+ image = latents
281
+ has_nsfw_concept = None
282
+
283
+ if has_nsfw_concept is None:
284
+ do_denormalize = [True] * image.shape[0]
285
+ else:
286
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
287
+
288
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
289
+
290
+ self.maybe_free_model_hooks()
291
+
292
+ if not return_dict:
293
+ return (image, has_nsfw_concept)
294
+
295
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
296
+
297
+ if __name__ == "__main__":
298
+ pass
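The denoising loop above applies classifier-free guidance by running an unconditional and an image-conditioned prediction and blending them. A small standalone illustration of that blending step (not repository code; shapes are arbitrary):

```py
# Sketch of the classifier-free guidance combination used in the denoising loop above.
# With guidance_scale == 1.0 the result reduces to the conditional prediction itself.
import torch

noise_pred_uncond = torch.zeros(1, 4, 8, 8)  # hypothetical unconditional prediction
noise_pred_cond = torch.ones(1, 4, 8, 8)     # hypothetical image-conditioned prediction

for guidance_scale in (1.0, 7.5):
    guided = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
    print(guidance_scale, guided.mean().item())  # 1.0 -> 1.0, 7.5 -> 7.5
```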
custum_3d_diffusion/custum_pipeline/unifield_pipeline_img2mvimg.py ADDED
@@ -0,0 +1,296 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # modified by Wuvin
15
+
16
+
17
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+ import torch
21
+
22
+ from diffusers import AutoencoderKL, UNet2DConditionModel, StableDiffusionImageVariationPipeline
23
+ from diffusers.schedulers import KarrasDiffusionSchedulers, DDPMScheduler
24
+ from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker, StableDiffusionPipelineOutput
25
+ from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
26
+ from PIL import Image
27
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
28
+
29
+
30
+
31
+ class StableDiffusionImage2MVCustomPipeline(
32
+ StableDiffusionImageVariationPipeline
33
+ ):
34
+ def __init__(
35
+ self,
36
+ vae: AutoencoderKL,
37
+ image_encoder: CLIPVisionModelWithProjection,
38
+ unet: UNet2DConditionModel,
39
+ scheduler: KarrasDiffusionSchedulers,
40
+ safety_checker: StableDiffusionSafetyChecker,
41
+ feature_extractor: CLIPImageProcessor,
42
+ requires_safety_checker: bool = True,
43
+ latents_offset=None,
44
+ noisy_cond_latents=False,
45
+ condition_offset=True,
46
+ ):
47
+ super().__init__(
48
+ vae=vae,
49
+ image_encoder=image_encoder,
50
+ unet=unet,
51
+ scheduler=scheduler,
52
+ safety_checker=safety_checker,
53
+ feature_extractor=feature_extractor,
54
+ requires_safety_checker=requires_safety_checker
55
+ )
56
+ latents_offset = tuple(latents_offset) if latents_offset is not None else None
57
+ self.latents_offset = latents_offset
58
+ if latents_offset is not None:
59
+ self.register_to_config(latents_offset=latents_offset)
60
+ if noisy_cond_latents:
61
+ raise NotImplementedError("Noisy condition latents not supported Now.")
62
+ self.condition_offset = condition_offset
63
+ self.register_to_config(condition_offset=condition_offset)
64
+
65
+ def encode_latents(self, image: Image.Image, device, dtype, height, width):
66
+ images = self.image_processor.preprocess(image.convert("RGB"), height=height, width=width).to(device, dtype=dtype)
67
+ # NOTE: use the latent distribution mode (not a random sample) for the condition latents
68
+ latents = self.vae.encode(images).latent_dist.mode() * self.vae.config.scaling_factor
69
+ if self.latents_offset is not None and self.condition_offset:
70
+ return latents - torch.tensor(self.latents_offset).to(latents.device)[None, :, None, None]
71
+ else:
72
+ return latents
73
+
74
+ def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance):
75
+ dtype = next(self.image_encoder.parameters()).dtype
76
+
77
+ if not isinstance(image, torch.Tensor):
78
+ image = self.feature_extractor(images=image, return_tensors="pt").pixel_values
79
+
80
+ image = image.to(device=device, dtype=dtype)
81
+ image_embeddings = self.image_encoder(image).image_embeds
82
+ image_embeddings = image_embeddings.unsqueeze(1)
83
+
84
+ # duplicate image embeddings for each generation per prompt, using mps friendly method
85
+ bs_embed, seq_len, _ = image_embeddings.shape
86
+ image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
87
+ image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
88
+
89
+ if do_classifier_free_guidance:
90
+ # NOTE: the same as original code
91
+ negative_prompt_embeds = torch.zeros_like(image_embeddings)
92
+ # For classifier free guidance, we need to do two forward passes.
93
+ # Here we concatenate the unconditional and text embeddings into a single batch
94
+ # to avoid doing two forward passes
95
+ image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
96
+
97
+ return image_embeddings
98
+
99
+ @torch.no_grad()
100
+ def __call__(
101
+ self,
102
+ image: Union[Image.Image, List[Image.Image], torch.FloatTensor],
103
+ height: Optional[int] = 1024,
104
+ width: Optional[int] = 1024,
105
+ height_cond: Optional[int] = 512,
106
+ width_cond: Optional[int] = 512,
107
+ num_inference_steps: int = 50,
108
+ guidance_scale: float = 7.5,
109
+ num_images_per_prompt: Optional[int] = 1,
110
+ eta: float = 0.0,
111
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
112
+ latents: Optional[torch.FloatTensor] = None,
113
+ output_type: Optional[str] = "pil",
114
+ return_dict: bool = True,
115
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
116
+ callback_steps: int = 1,
117
+ ):
118
+ r"""
119
+ The call function to the pipeline for generation.
120
+
121
+ Args:
122
+ image (`Image.Image` or `List[Image.Image]` or `torch.FloatTensor`):
123
+ Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
124
+ [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
125
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
126
+ The height in pixels of the generated image.
127
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
128
+ The width in pixels of the generated image.
129
+ num_inference_steps (`int`, *optional*, defaults to 50):
130
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
131
+ expense of slower inference.
132
+ guidance_scale (`float`, *optional*, defaults to 7.5):
133
+ A higher guidance scale value encourages the model to generate images closely linked to the text
134
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
135
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
136
+ The number of images to generate per prompt.
137
+ eta (`float`, *optional*, defaults to 0.0):
138
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
139
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
140
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
141
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
142
+ generation deterministic.
143
+ latents (`torch.FloatTensor`, *optional*):
144
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
145
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
146
+ tensor is generated by sampling using the supplied random `generator`.
147
+ output_type (`str`, *optional*, defaults to `"pil"`):
148
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
149
+ return_dict (`bool`, *optional*, defaults to `True`):
150
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
151
+ plain tuple.
152
+ callback (`Callable`, *optional*):
153
+ A function that is called every `callback_steps` steps during inference. The function is called with the
154
+ following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
155
+ callback_steps (`int`, *optional*, defaults to 1):
156
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
157
+ every step.
158
+
159
+ Returns:
160
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
161
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
162
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
163
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
164
+ "not-safe-for-work" (nsfw) content.
165
+
166
+ Examples:
167
+
168
+ ```py
169
+ from diffusers import StableDiffusionImageVariationPipeline
170
+ from PIL import Image
171
+ from io import BytesIO
172
+ import requests
173
+
174
+ pipe = StableDiffusionImageVariationPipeline.from_pretrained(
175
+ "lambdalabs/sd-image-variations-diffusers", revision="v2.0"
176
+ )
177
+ pipe = pipe.to("cuda")
178
+
179
+ url = "https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200"
180
+
181
+ response = requests.get(url)
182
+ image = Image.open(BytesIO(response.content)).convert("RGB")
183
+
184
+ out = pipe(image, num_images_per_prompt=3, guidance_scale=15)
185
+ out["images"][0].save("result.jpg")
186
+ ```
187
+ """
188
+ # 0. Default height and width to unet
189
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
190
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
191
+
192
+ # 1. Check inputs. Raise error if not correct
193
+ self.check_inputs(image, height, width, callback_steps)
194
+
195
+ # 2. Define call parameters
196
+ if isinstance(image, Image.Image):
197
+ batch_size = 1
198
+ elif len(image) == 1:
199
+ image = image[0]
200
+ batch_size = 1
201
+ else:
202
+ raise NotImplementedError()
203
+ # elif isinstance(image, list):
204
+ # batch_size = len(image)
205
+ # else:
206
+ # batch_size = image.shape[0]
207
+ device = self._execution_device
208
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
209
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
210
+ # corresponds to doing no classifier free guidance.
211
+ do_classifier_free_guidance = guidance_scale > 1.0
212
+
213
+ # 3. Encode input image
214
+ emb_image = image
215
+
216
+ image_embeddings = self._encode_image(emb_image, device, num_images_per_prompt, do_classifier_free_guidance)
217
+ cond_latents = self.encode_latents(image, image_embeddings.device, image_embeddings.dtype, height_cond, width_cond)
218
+ cond_latents = torch.cat([torch.zeros_like(cond_latents), cond_latents]) if do_classifier_free_guidance else cond_latents
219
+ image_pixels = self.feature_extractor(images=emb_image, return_tensors="pt").pixel_values
220
+ if do_classifier_free_guidance:
221
+ image_pixels = torch.cat([torch.zeros_like(image_pixels), image_pixels], dim=0)
222
+
223
+ # 4. Prepare timesteps
224
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
225
+ timesteps = self.scheduler.timesteps
226
+
227
+ # 5. Prepare latent variables
228
+ num_channels_latents = self.unet.config.out_channels
229
+ latents = self.prepare_latents(
230
+ batch_size * num_images_per_prompt,
231
+ num_channels_latents,
232
+ height,
233
+ width,
234
+ image_embeddings.dtype,
235
+ device,
236
+ generator,
237
+ latents,
238
+ )
239
+
240
+
241
+ # 6. Prepare extra step kwargs.
242
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
243
+ # 7. Denoising loop
244
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
245
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
246
+ for i, t in enumerate(timesteps):
247
+ # expand the latents if we are doing classifier free guidance
248
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
249
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
250
+
251
+ # predict the noise residual
252
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings, condition_latents=cond_latents, noisy_condition_input=False, cond_pixels_clip=image_pixels).sample
253
+
254
+ # perform guidance
255
+ if do_classifier_free_guidance:
256
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
257
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
258
+
259
+ # compute the previous noisy sample x_t -> x_t-1
260
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
261
+
262
+ # call the callback, if provided
263
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
264
+ progress_bar.update()
265
+ if callback is not None and i % callback_steps == 0:
266
+ step_idx = i // getattr(self.scheduler, "order", 1)
267
+ callback(step_idx, t, latents)
268
+
269
+ self.maybe_free_model_hooks()
270
+
271
+ if self.latents_offset is not None:
272
+ latents = latents + torch.tensor(self.latents_offset).to(latents.device)[None, :, None, None]
273
+
274
+ if not output_type == "latent":
275
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
276
+ image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
277
+ else:
278
+ image = latents
279
+ has_nsfw_concept = None
280
+
281
+ if has_nsfw_concept is None:
282
+ do_denormalize = [True] * image.shape[0]
283
+ else:
284
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
285
+
286
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
287
+
288
+ self.maybe_free_model_hooks()
289
+
290
+ if not return_dict:
291
+ return (image, has_nsfw_concept)
292
+
293
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
294
+
295
+ if __name__ == "__main__":
296
+ pass
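For orientation, a hedged sketch of calling `StableDiffusionImage2MVCustomPipeline` directly; the checkpoint path is a placeholder and assumes a directory saved by the trainers below, whose UNet accepts `condition_latents` and `cond_pixels_clip`:

```py
# Hypothetical usage sketch; "path/to/image2mvimage-checkpoint" is a placeholder, not a real model id.
import torch
from PIL import Image
from custum_3d_diffusion.custum_pipeline.unifield_pipeline_img2mvimg import StableDiffusionImage2MVCustomPipeline

pipe = StableDiffusionImage2MVCustomPipeline.from_pretrained(
    "path/to/image2mvimage-checkpoint", torch_dtype=torch.float16
).to("cuda")

cond = Image.open("input.png").convert("RGB")  # single-view condition image
out = pipe(
    cond,
    height=512, width=512,            # resolution of each generated view
    height_cond=512, width_cond=512,  # resolution the condition image is encoded at
    num_images_per_prompt=4,          # one latent per view
    guidance_scale=3.0,
)
out.images[0].save("view0.png")
```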
custum_3d_diffusion/modules.py ADDED
@@ -0,0 +1,14 @@
1
+ __modules__ = {}
2
+
3
+ def register(name):
4
+ def decorator(cls):
5
+ __modules__[name] = cls
6
+ return cls
7
+
8
+ return decorator
9
+
10
+
11
+ def find(name):
12
+ return __modules__[name]
13
+
14
+ from custum_3d_diffusion.trainings import base, image2mvimage_trainer, image2image_trainer
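The registry above is how trainer classes are looked up by name from config files. A minimal illustration of the pattern (the class and name below are hypothetical):

```py
# Minimal sketch of the register/find pattern defined in custum_3d_diffusion/modules.py.
from custum_3d_diffusion.modules import register, find

@register("toy_trainer")  # hypothetical name, used only for this illustration
class ToyTrainer:
    pass

assert find("toy_trainer") is ToyTrainer  # configs refer to trainers by this string
```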
custum_3d_diffusion/trainings/__init__.py ADDED
File without changes
custum_3d_diffusion/trainings/base.py ADDED
@@ -0,0 +1,208 @@
1
+ import torch
2
+ from accelerate import Accelerator
3
+ from accelerate.logging import MultiProcessAdapter
4
+ from dataclasses import dataclass, field
5
+ from typing import Optional, Union
6
+ from datasets import load_dataset
7
+ import json
8
+ import abc
9
+ from diffusers.utils import make_image_grid
10
+ import numpy as np
11
+ import wandb
12
+
13
+ from custum_3d_diffusion.trainings.utils import load_config
14
+ from custum_3d_diffusion.custum_modules.unifield_processor import ConfigurableUNet2DConditionModel, AttnConfig
15
+
16
+ class BasicTrainer(torch.nn.Module, abc.ABC):
17
+ accelerator: Accelerator
18
+ logger: MultiProcessAdapter
19
+ unet: ConfigurableUNet2DConditionModel
20
+ train_dataloader: torch.utils.data.DataLoader
21
+ test_dataset: torch.utils.data.Dataset
22
+ attn_config: AttnConfig
23
+
24
+ @dataclass
25
+ class TrainerConfig:
26
+ trainer_name: str = "basic"
27
+ pretrained_model_name_or_path: str = ""
28
+
29
+ attn_config: dict = field(default_factory=dict)
30
+ dataset_name: str = ""
31
+ dataset_config_name: Optional[str] = None
32
+ resolution: str = "1024"
33
+ dataloader_num_workers: int = 4
34
+ pair_sampler_group_size: int = 1
35
+ num_views: int = 4
36
+
37
+ max_train_steps: int = -1 # -1 means infinity, otherwise [0, max_train_steps)
38
+ training_step_interval: int = 1 # train on step i*interval, stop at max_train_steps
39
+ max_train_samples: Optional[int] = None
40
+ seed: Optional[int] = None # For dataset related operations and validation stuff
41
+ train_batch_size: int = 1
42
+
43
+ validation_interval: int = 5000
44
+ debug: bool = False
45
+
46
+ cfg: TrainerConfig # only enable_xxx is used
47
+
48
+ def __init__(
49
+ self,
50
+ accelerator: Accelerator,
51
+ logger: MultiProcessAdapter,
52
+ unet: ConfigurableUNet2DConditionModel,
53
+ config: Union[dict, str],
54
+ weight_dtype: torch.dtype,
55
+ index: int,
56
+ ):
57
+ super().__init__()
58
+ self.index = index # index in all trainers
59
+ self.accelerator = accelerator
60
+ self.logger = logger
61
+ self.unet = unet
62
+ self.weight_dtype = weight_dtype
63
+ self.ext_logs = {}
64
+ self.cfg = load_config(self.TrainerConfig, config)
65
+ self.attn_config = load_config(AttnConfig, self.cfg.attn_config)
66
+ self.test_dataset = None
67
+ self.validate_trainer_config()
68
+ self.configure()
69
+
70
+ def get_HW(self):
71
+ resolution = json.loads(self.cfg.resolution)
72
+ if isinstance(resolution, int):
73
+ H = W = resolution
74
+ elif isinstance(resolution, list):
75
+ H, W = resolution
76
+ return H, W
77
+
78
+ def unet_update(self):
79
+ self.unet.update_config(self.attn_config)
80
+
81
+ def validate_trainer_config(self):
82
+ pass
83
+
84
+ def is_train_finished(self, current_step):
85
+ assert isinstance(self.cfg.max_train_steps, int)
86
+ return self.cfg.max_train_steps != -1 and current_step >= self.cfg.max_train_steps
87
+
88
+ def next_train_step(self, current_step):
89
+ if self.is_train_finished(current_step):
90
+ return None
91
+ return current_step + self.cfg.training_step_interval
92
+
93
+ @classmethod
94
+ def make_image_into_grid(cls, all_imgs, rows=2, columns=2):
95
+ catted = [make_image_grid(all_imgs[i:i+rows * columns], rows=rows, cols=columns) for i in range(0, len(all_imgs), rows * columns)]
96
+ return make_image_grid(catted, rows=1, cols=len(catted))
97
+
98
+ def configure(self) -> None:
99
+ pass
100
+
101
+ @abc.abstractmethod
102
+ def init_shared_modules(self, shared_modules: dict) -> dict:
103
+ pass
104
+
105
+ def load_dataset(self):
106
+ dataset = load_dataset(
107
+ self.cfg.dataset_name,
108
+ self.cfg.dataset_config_name,
109
+ trust_remote_code=True
110
+ )
111
+ return dataset
112
+
113
+ @abc.abstractmethod
114
+ def init_train_dataloader(self, shared_modules: dict) -> torch.utils.data.DataLoader:
115
+ """Both init train_dataloader and test_dataset, but returns train_dataloader only"""
116
+ pass
117
+
118
+ @abc.abstractmethod
119
+ def forward_step(
120
+ self,
121
+ *args,
122
+ **kwargs
123
+ ) -> torch.Tensor:
124
+ """
125
+ input a batch
126
+ return a loss
127
+ """
128
+ self.unet_update()
129
+ pass
130
+
131
+ @abc.abstractmethod
132
+ def construct_pipeline(self, shared_modules, unet):
133
+ pass
134
+
135
+ @abc.abstractmethod
136
+ def pipeline_forward(self, pipeline, **pipeline_call_kwargs) -> tuple:
137
+ """
138
+ For inference time forward.
139
+ """
140
+ pass
141
+
142
+ @abc.abstractmethod
143
+ def batched_validation_forward(self, pipeline, **pipeline_call_kwargs) -> tuple:
144
+ pass
145
+
146
+ def do_validation(
147
+ self,
148
+ shared_modules,
149
+ unet,
150
+ global_step,
151
+ ):
152
+ self.unet_update()
153
+ self.logger.info("Running validation... ")
154
+ pipeline = self.construct_pipeline(shared_modules, unet)
155
+ pipeline.set_progress_bar_config(disable=True)
156
+ titles, images = self.batched_validation_forward(pipeline, guidance_scale=[1., 3.])
157
+ for tracker in self.accelerator.trackers:
158
+ if tracker.name == "tensorboard":
159
+ np_images = np.stack([np.asarray(img) for img in images])
160
+ tracker.writer.add_images("validation", np_images, global_step, dataformats="NHWC")
161
+ elif tracker.name == "wandb":
162
+ [image.thumbnail((512, 512)) for image, title in zip(images, titles) if 'noresize' not in title] # inplace operation
163
+ tracker.log({"validation": [
164
+ wandb.Image(image, caption=f"{i}: {titles[i]}", file_type="jpg")
165
+ for i, image in enumerate(images)]})
166
+ else:
167
+ self.logger.warn(f"image logging not implemented for {tracker.name}")
168
+ del pipeline
169
+ torch.cuda.empty_cache()
170
+ return images
171
+
172
+
173
+ @torch.no_grad()
174
+ def log_validation(
175
+ self,
176
+ shared_modules,
177
+ unet,
178
+ global_step,
179
+ force=False
180
+ ):
181
+ if self.accelerator.is_main_process:
182
+ for tracker in self.accelerator.trackers:
183
+ if tracker.name == "wandb":
184
+ tracker.log(self.ext_logs)
185
+ self.ext_logs = {}
186
+ if (global_step % self.cfg.validation_interval == 0 and not self.is_train_finished(global_step)) or force:
187
+ self.unet_update()
188
+ if self.accelerator.is_main_process:
189
+ self.do_validation(shared_modules, self.accelerator.unwrap_model(unet), global_step)
190
+
191
+ def save_model(self, unwrap_unet, shared_modules, save_dir):
192
+ if self.accelerator.is_main_process:
193
+ pipeline = self.construct_pipeline(shared_modules, unwrap_unet)
194
+ pipeline.save_pretrained(save_dir)
195
+ self.logger.info(f"{self.cfg.trainer_name} Model saved at {save_dir}")
196
+
197
+ def save_debug_info(self, save_name="debug", **kwargs):
198
+ if self.cfg.debug:
199
+ to_saves = {key: value.detach().cpu() if isinstance(value, torch.Tensor) else value for key, value in kwargs.items()}
200
+ import pickle
201
+ import os
202
+ if os.path.exists(f"{save_name}.pkl"):
203
+ for i in range(100):
204
+ if not os.path.exists(f"{save_name}_v{i}.pkl"):
205
+ save_name = f"{save_name}_v{i}"
206
+ break
207
+ with open(f"{save_name}.pkl", "wb") as f:
208
+ pickle.dump(to_saves, f)
custum_3d_diffusion/trainings/config_classes.py ADDED
@@ -0,0 +1,35 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import List, Optional
3
+
4
+
5
+ @dataclass
6
+ class TrainerSubConfig:
7
+ trainer_type: str = ""
8
+ trainer: dict = field(default_factory=dict)
9
+
10
+
11
+ @dataclass
12
+ class ExprimentConfig:
13
+ trainers: List[dict] = field(default_factory=lambda: [])
14
+ init_config: dict = field(default_factory=dict)
15
+ pretrained_model_name_or_path: str = ""
16
+ pretrained_unet_state_dict_path: str = ""
17
+ # experiment-related parameters
18
+ linear_beta_schedule: bool = False
19
+ zero_snr: bool = False
20
+ prediction_type: Optional[str] = None
21
+ seed: Optional[int] = None
22
+ max_train_steps: int = 1000000
23
+ gradient_accumulation_steps: int = 1
24
+ learning_rate: float = 1e-4
25
+ lr_scheduler: str = "constant"
26
+ lr_warmup_steps: int = 500
27
+ use_8bit_adam: bool = False
28
+ adam_beta1: float = 0.9
29
+ adam_beta2: float = 0.999
30
+ adam_weight_decay: float = 1e-2
31
+ adam_epsilon: float = 1e-08
32
+ max_grad_norm: float = 1.0
33
+ mixed_precision: Optional[str] = None # ["no", "fp16", "bf16", "fp8"]
34
+ skip_training: bool = False
35
+ debug: bool = False
custum_3d_diffusion/trainings/image2image_trainer.py ADDED
@@ -0,0 +1,86 @@
1
+ import json
2
+ import torch
3
+ from diffusers import EulerAncestralDiscreteScheduler, DDPMScheduler
4
+ from dataclasses import dataclass
5
+
6
+ from custum_3d_diffusion.modules import register
7
+ from custum_3d_diffusion.trainings.image2mvimage_trainer import Image2MVImageTrainer
8
+ from custum_3d_diffusion.custum_pipeline.unifield_pipeline_img2img import StableDiffusionImageCustomPipeline
9
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
10
+
11
+ def get_HW(resolution):
12
+ if isinstance(resolution, str):
13
+ resolution = json.loads(resolution)
14
+ if isinstance(resolution, int):
15
+ H = W = resolution
16
+ elif isinstance(resolution, list):
17
+ H, W = resolution
18
+ return H, W
19
+
20
+
21
+ @register("image2image_trainer")
22
+ class Image2ImageTrainer(Image2MVImageTrainer):
23
+ """
24
+ Trainer for simple image-to-image generation.
25
+ """
26
+ @dataclass
27
+ class TrainerConfig(Image2MVImageTrainer.TrainerConfig):
28
+ trainer_name: str = "image2image"
29
+
30
+ cfg: TrainerConfig
31
+
32
+ def forward_step(self, batch, unet, shared_modules, noise_scheduler: DDPMScheduler, global_step) -> torch.Tensor:
33
+ raise NotImplementedError()
34
+
35
+ def construct_pipeline(self, shared_modules, unet, old_version=False):
36
+ MyPipeline = StableDiffusionImageCustomPipeline
37
+ pipeline = MyPipeline.from_pretrained(
38
+ self.cfg.pretrained_model_name_or_path,
39
+ vae=shared_modules['vae'],
40
+ image_encoder=shared_modules['image_encoder'],
41
+ feature_extractor=shared_modules['feature_extractor'],
42
+ unet=unet,
43
+ safety_checker=None,
44
+ torch_dtype=self.weight_dtype,
45
+ latents_offset=self.cfg.latents_offset,
46
+ noisy_cond_latents=self.cfg.noisy_condition_input,
47
+ )
48
+ pipeline.set_progress_bar_config(disable=True)
49
+ scheduler_dict = {}
50
+ if self.cfg.zero_snr:
51
+ scheduler_dict.update(rescale_betas_zero_snr=True)
52
+ if self.cfg.linear_beta_schedule:
53
+ scheduler_dict.update(beta_schedule='linear')
54
+
55
+ pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config, **scheduler_dict)
56
+ return pipeline
57
+
58
+ def get_forward_args(self):
59
+ if self.cfg.seed is None:
60
+ generator = None
61
+ else:
62
+ generator = torch.Generator(device=self.accelerator.device).manual_seed(self.cfg.seed)
63
+
64
+ H, W = get_HW(self.cfg.resolution)
65
+ H_cond, W_cond = get_HW(self.cfg.condition_image_resolution)
66
+
67
+ forward_args = dict(
68
+ num_images_per_prompt=1,
69
+ num_inference_steps=20,
70
+ height=H,
71
+ width=W,
72
+ height_cond=H_cond,
73
+ width_cond=W_cond,
74
+ generator=generator,
75
+ )
76
+ if self.cfg.zero_snr:
77
+ forward_args.update(guidance_rescale=0.7)
78
+ return forward_args
79
+
80
+ def pipeline_forward(self, pipeline, **pipeline_call_kwargs) -> StableDiffusionPipelineOutput:
81
+ forward_args = self.get_forward_args()
82
+ forward_args.update(pipeline_call_kwargs)
83
+ return pipeline(**forward_args)
84
+
85
+ def batched_validation_forward(self, pipeline, **pipeline_call_kwargs) -> tuple:
86
+ raise NotImplementedError()
custum_3d_diffusion/trainings/image2mvimage_trainer.py ADDED
@@ -0,0 +1,139 @@
1
+ import torch
2
+ from diffusers import AutoencoderKL, DDPMScheduler, EulerAncestralDiscreteScheduler, DDIMScheduler
3
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, BatchFeature
4
+
5
+ import json
6
+ from dataclasses import dataclass
7
+ from typing import List, Optional
8
+
9
+ from custum_3d_diffusion.modules import register
10
+ from custum_3d_diffusion.trainings.base import BasicTrainer
11
+ from custum_3d_diffusion.custum_pipeline.unifield_pipeline_img2mvimg import StableDiffusionImage2MVCustomPipeline
12
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
13
+
14
+ def get_HW(resolution):
15
+ if isinstance(resolution, str):
16
+ resolution = json.loads(resolution)
17
+ if isinstance(resolution, int):
18
+ H = W = resolution
19
+ elif isinstance(resolution, list):
20
+ H, W = resolution
21
+ return H, W
22
+
23
+ @register("image2mvimage_trainer")
24
+ class Image2MVImageTrainer(BasicTrainer):
25
+ """
26
+ Trainer for simple image to multiview images.
27
+ """
28
+ @dataclass
29
+ class TrainerConfig(BasicTrainer.TrainerConfig):
30
+ trainer_name: str = "image2mvimage"
31
+ condition_image_column_name: str = "conditioning_image"
32
+ image_column_name: str = "image"
33
+ condition_dropout: float = 0.
34
+ condition_image_resolution: str = "512"
35
+ validation_images: Optional[List[str]] = None
36
+ noise_offset: float = 0.1
37
+ max_loss_drop: float = 0.
38
+ snr_gamma: float = 5.0
39
+ log_distribution: bool = False
40
+ latents_offset: Optional[List[float]] = None
41
+ input_perturbation: float = 0.
42
+ noisy_condition_input: bool = False # whether to add noise for ref unet input
43
+ normal_cls_offset: int = 0
44
+ condition_offset: bool = True
45
+ zero_snr: bool = False
46
+ linear_beta_schedule: bool = False
47
+
48
+ cfg: TrainerConfig
49
+
50
+ def configure(self) -> None:
51
+ return super().configure()
52
+
53
+ def init_shared_modules(self, shared_modules: dict) -> dict:
54
+ if 'vae' not in shared_modules:
55
+ vae = AutoencoderKL.from_pretrained(
56
+ self.cfg.pretrained_model_name_or_path, subfolder="vae", torch_dtype=self.weight_dtype
57
+ )
58
+ vae.requires_grad_(False)
59
+ vae.to(self.accelerator.device, dtype=self.weight_dtype)
60
+ shared_modules['vae'] = vae
61
+ if 'image_encoder' not in shared_modules:
62
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
63
+ self.cfg.pretrained_model_name_or_path, subfolder="image_encoder"
64
+ )
65
+ image_encoder.requires_grad_(False)
66
+ image_encoder.to(self.accelerator.device, dtype=self.weight_dtype)
67
+ shared_modules['image_encoder'] = image_encoder
68
+ if 'feature_extractor' not in shared_modules:
69
+ feature_extractor = CLIPImageProcessor.from_pretrained(
70
+ self.cfg.pretrained_model_name_or_path, subfolder="feature_extractor"
71
+ )
72
+ shared_modules['feature_extractor'] = feature_extractor
73
+ return shared_modules
74
+
75
+ def init_train_dataloader(self, shared_modules: dict) -> torch.utils.data.DataLoader:
76
+ raise NotImplementedError()
77
+
78
+ def loss_rescale(self, loss, timesteps=None):
79
+ raise NotImplementedError()
80
+
81
+ def forward_step(self, batch, unet, shared_modules, noise_scheduler: DDPMScheduler, global_step) -> torch.Tensor:
82
+ raise NotImplementedError()
83
+
84
+ def construct_pipeline(self, shared_modules, unet, old_version=False):
85
+ MyPipeline = StableDiffusionImage2MVCustomPipeline
86
+ pipeline = MyPipeline.from_pretrained(
87
+ self.cfg.pretrained_model_name_or_path,
88
+ vae=shared_modules['vae'],
89
+ image_encoder=shared_modules['image_encoder'],
90
+ feature_extractor=shared_modules['feature_extractor'],
91
+ unet=unet,
92
+ safety_checker=None,
93
+ torch_dtype=self.weight_dtype,
94
+ latents_offset=self.cfg.latents_offset,
95
+ noisy_cond_latents=self.cfg.noisy_condition_input,
96
+ condition_offset=self.cfg.condition_offset,
97
+ )
98
+ pipeline.set_progress_bar_config(disable=True)
99
+ scheduler_dict = {}
100
+ if self.cfg.zero_snr:
101
+ scheduler_dict.update(rescale_betas_zero_snr=True)
102
+ if self.cfg.linear_beta_schedule:
103
+ scheduler_dict.update(beta_schedule='linear')
104
+
105
+ pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config, **scheduler_dict)
106
+ return pipeline
107
+
108
+ def get_forward_args(self):
109
+ if self.cfg.seed is None:
110
+ generator = None
111
+ else:
112
+ generator = torch.Generator(device=self.accelerator.device).manual_seed(self.cfg.seed)
113
+
114
+ H, W = get_HW(self.cfg.resolution)
115
+ H_cond, W_cond = get_HW(self.cfg.condition_image_resolution)
116
+
117
+ sub_img_H = H // 2
118
+ num_imgs = H // sub_img_H * W // sub_img_H
119
+
120
+ forward_args = dict(
121
+ num_images_per_prompt=num_imgs,
122
+ num_inference_steps=50,
123
+ height=sub_img_H,
124
+ width=sub_img_H,
125
+ height_cond=H_cond,
126
+ width_cond=W_cond,
127
+ generator=generator,
128
+ )
129
+ if self.cfg.zero_snr:
130
+ forward_args.update(guidance_rescale=0.7)
131
+ return forward_args
132
+
133
+ def pipeline_forward(self, pipeline, **pipeline_call_kwargs) -> StableDiffusionPipelineOutput:
134
+ forward_args = self.get_forward_args()
135
+ forward_args.update(pipeline_call_kwargs)
136
+ return pipeline(**forward_args)
137
+
138
+ def batched_validation_forward(self, pipeline, **pipeline_call_kwargs) -> tuple:
139
+ raise NotImplementedError()
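`get_forward_args` above tiles the multiview output: each view is half the target height, so a square target resolution yields a 2x2 grid of views. A quick arithmetic check mirroring those expressions:

```py
# Mirrors sub_img_H / num_imgs in get_forward_args(): a 1024x1024 target -> four 512x512 views.
H = W = 1024
sub_img_H = H // 2                          # 512
num_imgs = H // sub_img_H * W // sub_img_H  # evaluates to 4 (a 2x2 grid)
print(sub_img_H, num_imgs)                  # 512 4
```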
custum_3d_diffusion/trainings/utils.py ADDED
@@ -0,0 +1,25 @@
1
+ from omegaconf import DictConfig, OmegaConf
2
+
3
+
4
+ def parse_structured(fields, cfg) -> DictConfig:
5
+ scfg = OmegaConf.structured(fields(**cfg))
6
+ return scfg
7
+
8
+
9
+ def load_config(fields, config, extras=None):
10
+ if extras is not None:
11
+ print("Warning! extra parameter in cli is not verified, may cause erros.")
12
+ if isinstance(config, str):
13
+ cfg = OmegaConf.load(config)
14
+ elif isinstance(config, dict):
15
+ cfg = OmegaConf.create(config)
16
+ elif isinstance(config, DictConfig):
17
+ cfg = config
18
+ else:
19
+ raise NotImplementedError(f"Unsupported config type {type(config)}")
20
+ if extras is not None:
21
+ cli_conf = OmegaConf.from_cli(extras)
22
+ cfg = OmegaConf.merge(cfg, cli_conf)
23
+ OmegaConf.resolve(cfg)
24
+ assert isinstance(cfg, DictConfig)
25
+ return parse_structured(fields, cfg)
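`load_config` accepts a YAML path, a plain dict, or an existing `DictConfig` and validates it against a dataclass schema. A minimal sketch with a made-up schema (not one of the repository's config classes):

```py
# Hedged usage sketch of load_config with a hypothetical dataclass schema.
from dataclasses import dataclass
from custum_3d_diffusion.trainings.utils import load_config

@dataclass
class DemoConfig:  # hypothetical schema, for illustration only
    resolution: str = "1024"
    seed: int = 42

cfg = load_config(DemoConfig, {"resolution": "[1024, 512]"})
print(cfg.resolution, cfg.seed)  # [1024, 512] 42
```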
docker/Dockerfile ADDED
@@ -0,0 +1,54 @@
1
+ # start from the NVIDIA CUDA 12.1 runtime image (Ubuntu 22.04)
2
+ FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04
3
+
4
+ LABEL name="unique3d" maintainer="unique3d"
5
+
6
+ # create workspace folder and set it as working directory
7
+ RUN mkdir -p /workspace
8
+ WORKDIR /workspace
9
+
10
+ # update package lists and install build tools, git, git-lfs, wget, vim, unzip, libegl1-mesa-dev, and libglib2.0-0
11
+ RUN apt-get update && apt-get install -y build-essential git wget vim libegl1-mesa-dev libglib2.0-0 unzip git-lfs
12
+
13
+ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends pkg-config libglvnd0 libgl1 libglx0 libegl1 libgles2 libglvnd-dev libgl1-mesa-dev libegl1-mesa-dev libgles2-mesa-dev cmake curl mesa-utils-extra
14
+ ENV PYTHONDONTWRITEBYTECODE=1
15
+ ENV PYTHONUNBUFFERED=1
16
+ ENV LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
17
+ ENV PYOPENGL_PLATFORM=egl
18
+
19
+ # install conda
20
+ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
21
+ chmod +x Miniconda3-latest-Linux-x86_64.sh && \
22
+ ./Miniconda3-latest-Linux-x86_64.sh -b -p /workspace/miniconda3 && \
23
+ rm Miniconda3-latest-Linux-x86_64.sh
24
+
25
+ # update PATH environment variable
26
+ ENV PATH="/workspace/miniconda3/bin:${PATH}"
27
+
28
+ # initialize conda
29
+ RUN conda init bash
30
+
31
+ # create and activate conda environment
32
+ RUN conda create -n unique3d python=3.10 && echo "source activate unique3d" > ~/.bashrc
33
+ ENV PATH=/workspace/miniconda3/envs/unique3d/bin:$PATH
34
+
35
+ RUN conda install Ninja
36
+ RUN conda install cuda -c nvidia/label/cuda-12.1.0 -y
37
+
38
+ RUN pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 xformers triton --index-url https://download.pytorch.org/whl/cu121
39
+ RUN pip install diffusers==0.27.2
40
+
41
+ RUN git clone --depth 1 https://huggingface.co/spaces/Wuvin/Unique3D
42
+
43
+ # change the working directory to the repository
44
+
45
+ WORKDIR /workspace/Unique3D
46
+ # other dependencies
47
+ RUN pip install -r requirements.txt
48
+
49
+ RUN pip install nvidia-pyindex
50
+
51
+ RUN pip install --upgrade nvidia-tensorrt
52
+
53
+ RUN pip install spaces
54
+
docker/README.md ADDED
@@ -0,0 +1,35 @@
1
+ # Docker setup
2
+
3
+ This Docker setup has been tested on Windows 10.
4
+
5
+ Make sure you are in the `yourworkspace/Unique3D/docker` directory.
6
+
7
+ Build the Docker image:
8
+
9
+ ```
10
+ docker build -t unique3d -f Dockerfile .
11
+ ```
12
+
13
+ Run the Docker image for the first time:
14
+
15
+ ```
16
+ docker run -it --name unique3d -p 7860:7860 --gpus all unique3d python app.py
17
+ ```
18
+
19
+ After the first time:
20
+ ```
21
+ docker start unique3d
22
+ docker exec unique3d python app.py
23
+ ```
24
+
25
+ Stop the container:
26
+ ```
27
+ docker stop unique3d
28
+ ```
29
+
30
+ You can find the demo link shown in the terminal, such as `https://94fc1ba77a08526e17.gradio.live/` or similar (it changes each time the container is restarted); open it to use the demo.
31
+
32
+ Some notes:
33
+ 1. This Docker build clones the source from https://huggingface.co/spaces/Wuvin/Unique3D rather than from this repository.
34
+ 2. The total build time may exceed one hour.
35
+ 3. The built image is larger than 70 GB.
gradio_app.py ADDED
@@ -0,0 +1,41 @@
1
+ if __name__ == "__main__":
2
+ import os
3
+ import sys
4
+ sys.path.append(os.curdir)
5
+ import torch
6
+ torch.set_float32_matmul_precision('medium')
7
+ torch.backends.cuda.matmul.allow_tf32 = True
8
+ torch.set_grad_enabled(False)
9
+
10
+ import fire
11
+ import gradio as gr
12
+ from app.gradio_3dgen import create_ui as create_3d_ui
13
+ from app.all_models import model_zoo
14
+
15
+
16
+ _TITLE = '''Unique3D: High-Quality and Efficient 3D Mesh Generation from a Single Image'''
17
+ _DESCRIPTION = '''
18
+ [Project page](https://wukailu.github.io/Unique3D/)
19
+
20
+ * High-fidelity and diverse textured meshes generated by Unique3D from single-view images.
21
+
22
+ * The demo is still under construction, and more features are expected to be implemented soon.
23
+ '''
24
+
25
+ def launch():
26
+ model_zoo.init_models()
27
+
28
+ with gr.Blocks(
29
+ title=_TITLE,
30
+ theme=gr.themes.Monochrome(),
31
+ ) as demo:
32
+ with gr.Row():
33
+ with gr.Column(scale=1):
34
+ gr.Markdown('# ' + _TITLE)
35
+ gr.Markdown(_DESCRIPTION)
36
+ create_3d_ui("wkl")
37
+
38
+ demo.queue().launch(share=True)
39
+
40
+ if __name__ == '__main__':
41
+ fire.Fire(launch)