Spaces: Runtime error
Upload folder using huggingface_hub
- .gitattributes +1 -0
- .github/workflows/update_space.yml +28 -0
- .gitignore +164 -0
- BigModel/.gitattributes +34 -0
- BigModel/README.md +13 -0
- LICENSE.txt +201 -0
- MODEL_LICENSE.txt +33 -0
- README.md +325 -8
- README_en.md +246 -0
- __pycache__/finetune_visualglm.cpython-310.pyc +0 -0
- __pycache__/lora_mixin.cpython-310.pyc +0 -0
- api.py +51 -0
- api_hf.py +49 -0
- cli_demo.py +103 -0
- cli_demo_hf.py +69 -0
- examples/1.jpeg +3 -0
- examples/2.jpeg +0 -0
- examples/3.jpeg +0 -0
- examples/chat_example1.png +0 -0
- examples/chat_example2.png +0 -0
- examples/chat_example3.png +0 -0
- examples/example_inputs.jsonl +3 -0
- examples/thu.png +0 -0
- examples/web_demo.png +0 -0
- fewshot-data.zip +3 -0
- finetune/finetune_visualglm.sh +58 -0
- finetune/finetune_visualglm_qlora.sh +59 -0
- finetune_visualglm.py +195 -0
- lora_mixin.py +260 -0
- model/__init__.py +3 -0
- model/__pycache__/__init__.cpython-310.pyc +0 -0
- model/__pycache__/blip2.cpython-310.pyc +0 -0
- model/__pycache__/chat.cpython-310.pyc +0 -0
- model/__pycache__/infer_util.cpython-310.pyc +0 -0
- model/__pycache__/visualglm.cpython-310.pyc +0 -0
- model/blip2.py +93 -0
- model/chat.py +175 -0
- model/infer_util.py +53 -0
- model/visualglm.py +40 -0
- requirements.txt +6 -0
- requirements_wo_ds.txt +10 -0
- web_demo.py +129 -0
- web_demo_hf.py +143 -0
- your_logfile.log +2 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/1.jpeg filter=lfs diff=lfs merge=lfs -text
.github/workflows/update_space.yml
ADDED
@@ -0,0 +1,28 @@
name: Run Python script

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
    - name: Checkout
      uses: actions/checkout@v2

    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: '3.9'

    - name: Install Gradio
      run: python -m pip install gradio

    - name: Log in to Hugging Face
      run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'

    - name: Deploy to Spaces
      run: gradio deploy
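The workflow above redeploys the Space on every push to `main`: it installs Gradio, logs in with the `hf_token` repository secret, and runs `gradio deploy`. A minimal sketch for checking locally that a token is valid before storing it as that secret; the `HF_TOKEN` environment variable name is an assumption, not part of the workflow:

```python
import os

from huggingface_hub import HfApi

# HF_TOKEN is an assumed local environment variable holding a Hugging Face access token.
token = os.environ["HF_TOKEN"]

# whoami() raises an error for an invalid token; otherwise it returns account metadata.
info = HfApi().whoami(token=token)
print(f"Token is valid for account: {info['name']}")
```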
.gitignore
ADDED
@@ -0,0 +1,164 @@
checkpoints/
runs/
model/__pycache__/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#  For a library or package, you might want to ignore these files since the code is
#  intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#  According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#  However, in case of collaboration, if having platform-specific dependencies or dependencies
#  having no cross-platform support, pipenv may install dependencies that don't work, or not
#  install all needed dependencies.
#Pipfile.lock

# poetry
#  Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#  This is especially recommended for binary packages to ensure reproducibility, and is more
#  commonly ignored for libraries.
#  https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#  Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#  pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#  in version control.
#  https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file. For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
BigModel/.gitattributes
ADDED
@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
BigModel/README.md
ADDED
@@ -0,0 +1,13 @@
---
title: BigModel
emoji: 🌍
colorFrom: yellow
colorTo: red
sdk: gradio
sdk_version: 3.33.1
app_file: app.py
pinned: false
license: openrail
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
LICENSE.txt
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright Zhengxiao Du

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
MODEL_LICENSE.txt
ADDED
@@ -0,0 +1,33 @@
The VisualGLM-6B License

1. Definitions

"Licensor" means the VisualGLM-6B Model Team that distributes its Software.

"Software" means the VisualGLM-6B model parameters made available under this license.

2. License Grant

Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes.

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

3. Restriction

You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes.

You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.

4. Disclaimer

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

5. Limitation of Liability

EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

6. Dispute Resolution

This license shall be governed and construed in accordance with the laws of People's Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.

Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us.
README.md
CHANGED
@@ -1,12 +1,329 @@
 ---
-title: VisualGLM
-
-colorFrom: green
-colorTo: pink
+title: VisualGLM-6B
+app_file: web_demo_hf.py
 sdk: gradio
-sdk_version: 3.33.
-app_file: app.py
-pinned: false
+sdk_version: 3.33.0
 ---

# VisualGLM-6B

<p align="center">
🤗 <a href="https://huggingface.co/THUDM/visualglm-6b" target="_blank">HF Repo</a> • ⚒️ <a href="https://github.com/THUDM/SwissArmyTransformer" target="_blank">SwissArmyTransformer (sat)</a> • 🐦 <a href="https://twitter.com/thukeg" target="_blank">Twitter</a>
</p>
<p align="center">
• 📃 <a href="https://arxiv.org/abs/2105.13290" target="_blank">[CogView@NeurIPS 21]</a> <a href="https://github.com/THUDM/CogView" target="_blank">[GitHub]</a> • 📃 <a href="https://arxiv.org/abs/2103.10360" target="_blank">[GLM@ACL 22]</a> <a href="https://github.com/THUDM/GLM" target="_blank">[GitHub]</a> <br>
</p>
<p align="center">
👋 Join us on <a href="https://join.slack.com/t/chatglm/shared_invite/zt-1th2q5u69-7tURzFuOPanmuHy9hsZnKA" target="_blank">Slack</a> and <a href="resources/WECHAT.md" target="_blank">WeChat</a>
</p>
<!-- <p align="center">
🤖 <a href="https://huggingface.co/spaces/THUDM/visualglm-6b" target="_blank">VisualGLM-6B online demo</a>
</p> -->

## Introduction

VisualGLM-6B is an open-source, multi-modal dialog language model that supports **images, Chinese, and English**. The language model is based on [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B) with 6.2 billion parameters; the image part builds a bridge between the visual model and the language model through the training of [BLIP2-Qformer](https://arxiv.org/abs/2301.12597), with the total model comprising 7.8 billion parameters. **[Click here for English version.](README_en.md)**

VisualGLM-6B is pre-trained on 30M high-quality Chinese image-text pairs from the [CogView](https://arxiv.org/abs/2105.13290) dataset and 300M filtered English image-text pairs, with Chinese and English weighted equally. This training approach aligns visual information well to the semantic space of ChatGLM; in the subsequent fine-tuning phase, the model is trained on long visual question answering data to generate answers that match human preferences.

VisualGLM-6B is trained with the [SwissArmyTransformer](https://github.com/THUDM/SwissArmyTransformer) (`sat`) library, a toolkit for flexible modification and training of Transformers that supports parameter-efficient fine-tuning methods such as LoRA and P-tuning. This project provides a Huggingface interface that matches common user habits, as well as an interface based on sat.

Combined with model quantization, the model can be deployed locally on consumer-grade graphics cards (as little as 8.7 GB of GPU memory at the INT4 quantization level).

-----

The open-source VisualGLM-6B model is intended to advance large-model technology together with the open-source community. Developers and users are asked to observe the open-source license and not to use the model, the code, or derivatives of this project for any purpose that may harm the country or society, or for any service that has not undergone safety evaluation and registration. At present, this project has not officially developed any application based on VisualGLM-6B, including websites, Android apps, Apple iOS apps, or Windows apps.

Since VisualGLM-6B is still at the v1 stage, it is known to have quite a few [**limitations**](README.md#局限性), such as factual inaccuracy/model hallucination in image descriptions, insufficient capture of image details, and some limitations inherited from the language model. Although we try to ensure the compliance and accuracy of the data at every stage of training, the small size of VisualGLM-6B and its probabilistic nature mean that the accuracy of its outputs cannot be guaranteed, and the model can be misled (see the Limitations section). Later versions of VisualGLM will work to address these issues. This project assumes no responsibility for data-security or public-opinion risks caused by the open-source model and code, nor for any risk arising from the model being misled, misused, disseminated, or improperly exploited.

## Examples

VisualGLM-6B can describe images and answer questions about related knowledge.
![Titanic example](examples/chat_example1.png)

<details>
<summary>It can also bring in common sense or propose interesting views. Click to expand/collapse more examples</summary>

![Ironing shirt in a taxi example](examples/chat_example2.png)
![Mona Lisa dog example](examples/chat_example3.png)

</details>

## Related Projects

* [XrayGLM](https://github.com/WangRongsheng/XrayGLM) fine-tunes VisualGLM-6B on an X-ray diagnosis dataset for X-ray diagnostic Q&A; it can answer medical questions based on X-ray images.
<details>
<summary>Click to view an example</summary>

![Example](https://github.com/WangRongsheng/XrayGLM/raw/main/assets/images/xrayglm.png)
</details>

## Usage

### Model Inference

Install dependencies with pip
```
pip install -i https://pypi.org/simple -r requirements.txt
# Users in mainland China should use the Aliyun mirror (TUNA and other mirrors have had sync problems recently):
pip install -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt
```
This installs the `deepspeed` library by default (needed for training with the `sat` library). It is not required for inference, and installing it fails in some Windows environments.
To skip the `deepspeed` installation, change the commands to
```
pip install -i https://mirrors.aliyun.com/pypi/simple/ -r requirements_wo_ds.txt
pip install -i https://mirrors.aliyun.com/pypi/simple/ --no-deps "SwissArmyTransformer>=0.3.6"
```

If you call the model with the Huggingface transformers library (**the dependencies above are still required!**), you can use the following code (where the image path is a local path):
```python
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True).half().cuda()
image_path = "your image path"
response, history = model.chat(tokenizer, image_path, "描述这张图片。", history=[])
print(response)
response, history = model.chat(tokenizer, image_path, "这张图片可能是在什么场所拍摄的?", history=history)
print(response)
```
The code above lets `transformers` download the model implementation and weights automatically. The full model implementation is on the [Hugging Face Hub](https://huggingface.co/THUDM/visualglm-6b). If downloading from the Hub is slow, you can download the weight files manually from [here](https://cloud.tsinghua.edu.cn/d/43ffb021ca5f4897b56a/) and load the model from a local directory; see [loading the model locally](https://github.com/THUDM/ChatGLM-6B#%E4%BB%8E%E6%9C%AC%E5%9C%B0%E5%8A%A0%E8%BD%BD%E6%A8%A1%E5%9E%8B). For quantization, CPU inference, and Mac MPS acceleration with the transformers-based model, see [low-cost deployment of ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B#%E4%BD%8E%E6%88%90%E6%9C%AC%E9%83%A8%E7%BD%B2).
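For reference, a minimal sketch of loading from a local directory, assuming the weight files have been downloaded to `./visualglm-6b` (the directory name is an assumption; any local path works):

```python
from transformers import AutoTokenizer, AutoModel

# Point both the tokenizer and the model at the local directory instead of the Hub repo id.
local_dir = "./visualglm-6b"  # assumed download location
tokenizer = AutoTokenizer.from_pretrained(local_dir, trust_remote_code=True)
model = AutoModel.from_pretrained(local_dir, trust_remote_code=True).half().cuda()

response, history = model.chat(tokenizer, "examples/1.jpeg", "描述这张图片。", history=[])
print(response)
```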

If you use the SwissArmyTransformer library to call the model, the method is similar; the environment variable `SAT_HOME` controls where the model is downloaded. In the directory of this repository:
```python
import argparse
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
from model import chat, VisualGLMModel
model, model_args = VisualGLMModel.from_pretrained('visualglm-6b', args=argparse.Namespace(fp16=True, skip_init=True))
from sat.model.mixins import CachedAutoregressiveMixin
model.add_mixin('auto-regressive', CachedAutoregressiveMixin())
image_path = "your image path or URL"
response, history, cache_image = chat(image_path, model, tokenizer, "描述这张图片。", history=[])
print(response)
response, history, cache_image = chat(None, model, tokenizer, "这张图片可能是在什么场所拍摄的?", history=history, image=cache_image)
print(response)
```
The `sat` library also makes parameter-efficient fine-tuning straightforward. <!-- TODO 具体代码 -->

## Model Fine-tuning

Multimodal tasks are wide-ranging and diverse, and pre-training often cannot cover them all.
Here we provide an example of few-shot fine-tuning, using 20 annotated images to strengthen the model's ability to answer "background" questions.

After unzipping `fewshot-data.zip`, run the following command:
```
bash finetune/finetune_visualglm.sh
```

Three kinds of fine-tuning are currently supported:

* LoRA: the example adds rank-10 LoRA to layers 0 and 14 of the ChatGLM model; you can adjust `--layer_range` and `--lora_rank` to fit your scenario and data volume.
* QLoRA: if resources are limited, consider `bash finetune/finetune_visualglm_qlora.sh`, which applies 4-bit quantization to ChatGLM's linear layers and fine-tunes with only 9.8 GB of GPU memory.
* P-tuning: you can replace `--use_lora` with `--use_ptuning`, but this is not recommended unless the application scenario is very fixed.

After training, you can run inference with:

```
python cli_demo.py --from_pretrained your_checkpoint_path --prompt_zh 这张图片的背景里有什么内容?
```

<details>
<summary>Comparison before and after fine-tuning</summary>

Before fine-tuning:

```
Welcome to the VisualGLM-6B model. Enter an image URL or local path to load an image, then type to chat; "clear" restarts the conversation, "stop" exits the program.
Please enter an image path or URL (press Enter for text-only dialogue): fewshot-data/2p.png
User: What is in the background of this image?
VisualGLM-6B: People are always troubled by the trivialities of life and cannot break free from their constraints.
User: clear
Please enter an image path or URL (press Enter for text-only dialogue): fewshot-data/meme.png
User: What is in the background of this image?
VisualGLM-6B: The background of this image is a picture of a dog lying on a wooden floor.
User: clear
Please enter an image path or URL (press Enter for text-only dialogue): fewshot-data/passport.png
User: What is in the background of this image?
VisualGLM-6B: The background of this image is a computer keyboard and two passports.
User: clear
Please enter an image path or URL (press Enter for text-only dialogue):
```

After fine-tuning:

```
Welcome to the VisualGLM-6B model. Enter an image URL or local path to load an image, then type to chat; "clear" restarts the conversation, "stop" exits the program.
Please enter an image path or URL (press Enter for text-only dialogue): fewshot-data/2p.png
User: What is in the background of this image?
VisualGLM-6B: The background of this image is a rainy day.
User: Describe this image.
VisualGLM-6B: This image depicts a rainy scene, with rain falling steadily onto the ground. People in raincoats or under umbrellas walk through the rain, as if searching for something.
User: clear
Please enter an image path or URL (press Enter for text-only dialogue): fewshot-data/meme.png
User: What is in the background of this image?
VisualGLM-6B: The background of this image is a wooden floor.
User: What color is the floor?
VisualGLM-6B: The background of this image is blue.
User: Describe this image.
VisualGLM-6B: This image depicts a brown puppy lying on a blue wooden floor.
User: clear
Please enter an image path or URL (press Enter for text-only dialogue): fewshot-data/passport.png
User: What is in the background of this image?
VisualGLM-6B: The background of this image is a computer keyboard.
User: Describe this image.
VisualGLM-6B: This image features a computer keyboard and passports.
User: How many passports are there?
VisualGLM-6B: Two passports.
User: clear
Please enter an image path or URL (press Enter for text-only dialogue):
```

</details>

If you want to merge the LoRA parameters into the original weights, call `merge_lora()`, for example:

```python
from finetune_visualglm import FineTuneVisualGLMModel
import argparse

model, args = FineTuneVisualGLMModel.from_pretrained('checkpoints/finetune-visualglm-6b-05-19-07-36',
    args=argparse.Namespace(
        fp16=True,
        skip_init=True,
        use_gpu_initialization=True,
        device='cuda',
    ))
model.get_mixin('lora').merge_lora()
args.layer_range = []
args.save = 'merge_lora'
args.mode = 'inference'
from sat.training.model_io import save_checkpoint
save_checkpoint(1, model, None, None, args)
```

Fine-tuning requires the `deepspeed` library. This workflow currently supports only Linux; more examples and instructions for Windows will be completed in the near future.

## Deployment Tools

### Command Line Demo

```shell
python cli_demo.py
```
The program automatically downloads the sat model and starts an interactive conversation in the command line. Type an instruction and press Enter to generate a reply; type clear to reset the conversation history and stop to exit.

![cli_demo](examples/thu.png)
The program exposes the following hyperparameters to control generation and quantization precision:
```
usage: cli_demo.py [-h] [--max_length MAX_LENGTH] [--top_p TOP_P] [--top_k TOP_K] [--temperature TEMPERATURE] [--english] [--quant {8,4}]

optional arguments:
  -h, --help            show this help message and exit
  --max_length MAX_LENGTH
                        max length of the total sequence
  --top_p TOP_P         top p for nucleus sampling
  --top_k TOP_K         top k for top k sampling
  --temperature TEMPERATURE
                        temperature for sampling
  --english             only output English
  --quant {8,4}         quantization bits
```
Note that during training the prompt for English Q&A pairs is `Q: A:`, while for Chinese it is `问:答:`. The web demo uses the Chinese prompt, so English replies will be worse and mixed with Chinese; if you need English replies, use the `--english` option of `cli_demo.py`.

We also provide a command line tool with a typewriter effect inherited from `ChatGLM-6B`; it uses the Huggingface model:
```shell
python cli_demo_hf.py
```

### Web Demo
![web_demo](examples/web_demo.png)

We provide a web demo based on [Gradio](https://gradio.app). First install Gradio: `pip install gradio`.
Then download this repository, enter it, and run `web_demo.py`:

```
git clone https://github.com/THUDM/VisualGLM-6B
cd VisualGLM-6B
python web_demo.py
```
The program automatically downloads the sat model, starts a web server, and prints its address. Open the printed address in a browser to use it.


We also provide a web tool with a typewriter effect inherited from `ChatGLM-6B`; it uses the Huggingface model and runs on port `:8080` after starting:
```shell
python web_demo_hf.py
```

Both web demos accept the command line argument `--share` to generate a public gradio link, and `--quant 4` / `--quant 8` to reduce GPU memory usage with 4-bit/8-bit quantization.

### API Deployment
First install the extra dependencies with `pip install fastapi uvicorn`, then run [api.py](api.py) from the repository:
```shell
python api.py
```
The program automatically downloads the sat model and serves it on local port 8080 by default, called via POST. Below is an example request with `curl`; in general you can also POST from code.
```shell
echo "{\"image\":\"$(base64 path/to/example.jpg)\",\"text\":\"描述这张图片\",\"history\":[]}" > temp.json
curl -X POST -H "Content-Type: application/json" -d @temp.json http://127.0.0.1:8080
```
The returned value is
```
{
  "response":"这张图片展现了一只可爱的卡通羊驼,它站在一个透明的背景上。这只羊驼长着一张毛茸茸的耳朵和一双大大的眼睛,它的身体是白色的,带有棕色斑点。",
  "history":[('描述这张图片', '这张图片展现了一只可爱的卡通羊驼,它站在一个透明的背景上。这只羊驼长着一张毛茸茸的耳朵和一双大大的眼睛,它的身体是白色的,带有棕色斑点。')],
  "status":200,
  "time":"2023-05-16 20:20:10"
}
```
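The same request can be issued from Python. A minimal sketch using the `requests` library (an extra dependency, not listed in requirements.txt), assuming the server above is running on 127.0.0.1:8080:

```python
import base64

import requests  # assumed extra dependency: pip install requests

# Encode the local image as base64, matching what the curl example sends.
with open("path/to/example.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {"image": image_b64, "text": "描述这张图片", "history": []}
resp = requests.post("http://127.0.0.1:8080", json=payload, timeout=120)
result = resp.json()

print(result["response"])  # generated answer
print(result["status"])    # 200 on success
```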

We also provide [api_hf.py](api_hf.py), which uses the Huggingface model; its usage is the same as the sat-model API:
```shell
python api_hf.py
```


## Model Quantization
In the Huggingface implementation, the model is loaded at FP16 precision by default, and running the code above takes about 15 GB of GPU memory. If your GPU memory is limited, you can try loading the model with quantization.
Usage:
```python
# Modify as needed; only 4/8-bit quantization is currently supported. The following quantizes only ChatGLM, since quantizing ViT introduces larger errors
model = AutoModel.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True).quantize(8).half().cuda()
```

In the sat implementation, you must first pass an argument to change the loading location to `cpu` and then quantize. The method is as follows; see `cli_demo.py` for details:
```python
from sat.quantization.kernels import quantize
model = quantize(model.transformer, args.quant).cuda()
# Passing model.transformer quantizes only ChatGLM; quantizing ViT introduces larger errors
```

## Limitations
This project is at the V1 stage; the parameters and compute of both the visual and the language model are relatively small. The main directions for improvement are:
- Factuality/hallucination in image descriptions. When generating long descriptions, the further the text gets from the image, the more the language model dominates, and content that is not in the image may be generated from the context.
- Attribute mismatch. In scenes with multiple objects, attributes of one object are often incorrectly attached to another.
- Resolution. The project uses a 224*224 resolution, the most common size for visual models; finer-grained understanding requires larger resolution and more computation.
- Due to data and other constraints, the model currently has no Chinese OCR ability (and only limited English OCR); we plan to add this in later versions.
## License

The code in this repository is open-sourced under the [Apache-2.0](LICENSE.txt) license; use of the VisualGLM-6B model weights must follow the [Model License](MODEL_LICENSE.txt).

## Citation & Acknowledgements
If you find our work helpful, please consider citing the following papers
```
@inproceedings{du2022glm,
  title={GLM: General Language Model Pretraining with Autoregressive Blank Infilling},
  author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie},
  booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages={320--335},
  year={2022}
}
@article{ding2021cogview,
  title={Cogview: Mastering text-to-image generation via transformers},
  author={Ding, Ming and Yang, Zhuoyi and Hong, Wenyi and Zheng, Wendi and Zhou, Chang and Yin, Da and Lin, Junyang and Zou, Xu and Shao, Zhou and Yang, Hongxia and others},
  journal={Advances in Neural Information Processing Systems},
  volume={34},
  pages={19822--19835},
  year={2021}
}
```
The instruction fine-tuning data of VisualGLM-6B includes some English image-text data from the [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4) and [LLAVA](https://github.com/haotian-liu/LLaVA) projects, as well as many classic cross-modal datasets; we sincerely thank them for their contributions.
README_en.md
ADDED
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# VisualGLM-6B
|
2 |
+
<p align="center">
|
3 |
+
🤗 <a href="https://huggingface.co/THUDM/visualglm-6b" target="_blank">HF Repo</a> • ⚒️ <a href="https://github.com/THUDM/SwissArmyTransformer" target="_blank">SwissArmyTransformer (sat)</a> • 🐦 <a href="https://twitter.com/thukeg" target="_blank">Twitter</a>
|
4 |
+
</p>
|
5 |
+
<p align="center">
|
6 |
+
• 📃 <a href="https://arxiv.org/abs/2105.13290" target="_blank">[CogView@NeurIPS 21]</a> <a href="https://github.com/THUDM/CogView" target="_blank">[GitHub]</a> • 📃 <a href="https://arxiv.org/abs/2103.10360" target="_blank">[GLM@ACL 22]</a> <a href="https://github.com/THUDM/GLM" target="_blank">[GitHub]</a> <br>
|
7 |
+
</p>
|
8 |
+
<p align="center">
|
9 |
+
👋 Join us on <a href="https://join.slack.com/t/chatglm/shared_invite/zt-1th2q5u69-7tURzFuOPanmuHy9hsZnKA" target="_blank">Slack</a> and <a href="resources/WECHAT.md" target="_blank">WeChat</a>
|
10 |
+
</p>
|
11 |
+
<!-- <p align="center">
|
12 |
+
🤖<a href="https://huggingface.co/spaces/THUDM/visualglm-6b" target="_blank">VisualGLM-6B Online Demo Website</a>
|
13 |
+
</p> -->
|
14 |
+
|
15 |
+
## Introduction
|
16 |
+
VisualGLM-6B is an open-source, multi-modal dialog language model that supports **images, Chinese, and English**. The language model is based on [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B) with 6.2 billion parameters; the image part builds a bridge between the visual model and the language model through the training of [BLIP2-Qformer](https://arxiv.org/abs/2301.12597), with the total model comprising 7.8 billion parameters.
|
17 |
+
|
18 |
+
VisualGLM-6B relies on 30M high-quality Chinese image-text pairs from the [CogView](https://arxiv.org/abs/2105.13290) dataset and 300M filtered English image-text pairs for pre-training, with equal weight for Chinese and English. This training method aligns visual information well to the semantic space of ChatGLM. In the subsequent fine-tuning phase, the model is trained on long visual question answering data to generate answers that align with human preferences.
|
19 |
+
|
20 |
+
VisualGLM-6B is trained using the [SwissArmyTransformer](https://github.com/THUDM/SwissArmyTransformer) (abbreviated as sat) library, a utility library for flexible modification and training of Transformer, supporting efficient fine-tuning methods like Lora and P-tuning. This project provides a user-friendly huggingface interface, as well as an interface based on sat.
|
21 |
+
|
22 |
+
However, as VisualGLM-6B is still at the v1 stage, it is known to have quite a few [**limitations**](#Limitations), such as factual inaccuracy/model hallucination in image description, lack of capturing image detail information, and some limitations from the language model. Please be aware of these issues and evaluate the potential risks before using. In future versions of VisualGLM, we will strive to optimize these issues.
|
23 |
+
|
24 |
+
With model quantization technology, users can deploy locally on consumer-grade graphics cards (requiring as little as 8.7G memory under INT4 quantization level).
|
25 |
+
|
26 |
+
## Examples
|
27 |
+
VisualGLM-6B can answer questions related to image description.
|
28 |
+
![Titanic example](examples/chat_example1.png)
|
29 |
+
|
30 |
+
<details>
|
31 |
+
<summary>It can also combine common sense or propose interesting views. Click to expand/collapse more examples</summary>
|
32 |
+
|
33 |
+
![Ironing shirt taxi example](examples/chat_example2.png)
|
34 |
+
![Mona Lisa dog example](examples/chat_example3.png)
|
35 |
+
|
36 |
+
</details>
|
37 |
+
|
38 |
+
|
39 |
+
## Usage
|
40 |
+
|
41 |
+
### Model Inference
|
42 |
+
|
43 |
+
Install dependencies with pip
|
44 |
+
```
|
45 |
+
pip install -i https://pypi.org/simple -r requirements.txt
|
46 |
+
pip install -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt
|
47 |
+
```
|
48 |
+
This will default to installing the deepspeed library (which supports the sat library training). This library is not necessary for model inference and can cause problems when installed in some Windows environments.
|
49 |
+
If you want to bypass deepspeed installation, you can change the command to:
|
50 |
+
```
|
51 |
+
pip install -i https://mirrors.aliyun.com/pypi/simple/ -r requirements_wo_ds.txt
|
52 |
+
pip install -i https://mirrors.aliyun.com/pypi/simple/ --no-deps "SwissArmyTransformer>=0.3.6"
|
53 |
+
```
|
54 |
+
|
55 |
+
If you are calling the model using the Huggingface transformers library (you also need to install the above dependency packages!), you can use the following code (where the image path is the local path):
|
56 |
+
```python
|
57 |
+
from transformers import AutoTokenizer, AutoModel
|
58 |
+
tokenizer = AutoTokenizer.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True)
|
59 |
+
model = AutoModel.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True).half().cuda()
|
60 |
+
image_path = "your image path"
|
61 |
+
response, history = model.chat(tokenizer, image_path, "描述这张图片。", history=[])
|
62 |
+
print(response)
|
63 |
+
response, history = model.chat(tokenizer, image_path, "这张图片可能是在什么场所拍摄的?", history=history)
|
64 |
+
print(response)
|
65 |
+
```
|
66 |
+
|
67 |
+
If you use the SwissArmyTransformer library to call the model, the method is similar, and you can use the environment variable SAT_HOME to determine the model download location. In the directory of this repository:
|
68 |
+
```python
|
69 |
+
import argparse
|
70 |
+
from transformers import AutoTokenizer
|
71 |
+
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
|
72 |
+
from model import chat, VisualGLMModel
|
73 |
+
model, model_args = VisualGLMModel.from_pretrained('visualglm-6b', args=argparse.Namespace(fp16=True, skip_init=True))
|
74 |
+
from sat.model.mixins import CachedAutoregressiveMixin
|
75 |
+
model.add_mixin('auto-regressive', CachedAutoregressiveMixin())
|
76 |
+
image_path = "your image path or URL"
|
77 |
+
response, history, cache_image = chat(image_path, model, tokenizer, "Describe this picture.", history=[])
|
78 |
+
print(response)
|
79 |
+
response, history, cache_image = chat(None, model, tokenizer, "Where could this picture possibly have been taken?", history=history, image=cache_image)
|
80 |
+
print(response)
|
81 |
+
```
|
82 |
+
|
83 |
+
Using the `sat` library can also easily carry out efficient parameter fine-tuning. <!-- TODO specific code -->
|
84 |
+
|
85 |
+
Please note that the Huggingface model implementation is located in the [Huggingface repository](https://huggingface.co/THUDM/visualglm-6b), and the `sat` model implementation is included in this repository.
|
86 |
+
|
87 |
+
## Model Fine-tuning
|
88 |
+
|
89 |
+
Multimodal tasks are wide-ranging and diverse, and pre-training often cannot cover all bases.
|
90 |
+
Here we provide an example of small sample fine-tuning, using 20 labeled images to enhance the model's ability to answer "background" questions.
|
91 |
+
|
92 |
+
After unzipping fewshot-data.zip, run the following command:
|
93 |
+
```
|
94 |
+
bash finetune/finetune_visualglm.sh
|
95 |
+
```
|
96 |
+
|
97 |
+
Currently we support three types of (parameter-efficient) fine-tuning:
|
98 |
+
|
99 |
+
* LoRA: In the given example, we add rank=10 LoRA for layer 0 and layer 14 in ChatGLM. You can adjust `--layer_range` and `--lora_rank` to fit your application and data amount.
|
100 |
+
* QLoRA: If your resource is limited, consider using `bash finetune/finetune_visualglm_qlora.sh`, which do 4-bit quantization for ChatGLM Linear layers, reducing the required GPU memory to 9.8 GB.
|
101 |
+
* P-tuning: You can replace `--use_lora` to `--use_ptuning`, but not recommended, unless your application has a relatively fixed input and output template.
|
102 |
+
|
103 |
+
After training, you can use the following command for inference:
|
104 |
+
|
105 |
+
```
|
106 |
+
python cli_demo.py --from_pretrained your_checkpoint_path --prompt_zh 这张图片的背景里有什么内容?
|
107 |
+
```
|
108 |
+
|
109 |
+
Fine-tuning requires the installation of the deepspeed library, and currently this process only supports the Linux system. More examples and instructions for the Windows system will be completed in the near future.
|
110 |
+
|
111 |
+
If you want to merge LoRA weights into original weights, just call `merge_lora()`:
|
112 |
+
|
113 |
+
```python
|
114 |
+
from finetune_visualglm import FineTuneVisualGLMModel
|
115 |
+
import argparse
|
116 |
+
|
117 |
+
model, args = FineTuneVisualGLMModel.from_pretrained('checkpoints/finetune-visualglm-6b-05-19-07-36',
|
118 |
+
args=argparse.Namespace(
|
119 |
+
fp16=True,
|
120 |
+
skip_init=True,
|
121 |
+
use_gpu_initialization=True,
|
122 |
+
device='cuda',
|
123 |
+
))
|
124 |
+
model.get_mixin('lora').merge_lora()
|
125 |
+
args.layer_range = []
|
126 |
+
args.save = 'merge_lora'
|
127 |
+
args.mode = 'inference'
|
128 |
+
from sat.training.model_io import save_checkpoint
|
129 |
+
save_checkpoint(1, model, None, None, args)
|
130 |
+
```
|
131 |
+
|
132 |
+
## Deployment Tools
|
133 |
+
|
134 |
+
### Command Line Demo
|
135 |
+
|
136 |
+
```shell
|
137 |
+
python cli_demo.py
|
138 |
+
```
|
139 |
+
The program will automatically download the sat model and interact in the command line. You can generate replies by entering instructions and pressing enter. Enter 'clear' to clear the conversation history and 'stop' to stop the program.
|
140 |
+
|
141 |
+
![cli_demo](examples/thu.png)
|
142 |
+
The program provides the following hyperparameters to control the generation process and quantization accuracy:
|
143 |
+
```
|
144 |
+
usage: cli_demo.py [-h] [--max_length MAX_LENGTH] [--top_p TOP_P] [--top_k TOP_K] [--temperature TEMPERATURE] [--english] [--quant {8,4}]
|
145 |
+
|
146 |
+
optional arguments:
|
147 |
+
-h, --help show this help message and exit
|
148 |
+
--max_length MAX_LENGTH
|
149 |
+
max length of the total sequence
|
150 |
+
--top_p TOP_P top p for nucleus sampling
|
151 |
+
--top_k TOP_K top k for top k sampling
|
152 |
+
--temperature TEMPERATURE
|
153 |
+
temperature for sampling
|
154 |
+
--english only output English
|
155 |
+
--quant {8,4} quantization bits
|
156 |
+
```
|
157 |
+
Note that during training, the prompt words for English Q&A pairs are 'Q: A:', while in Chinese they are '问:答:'. The web demo uses Chinese prompts, so the English replies will be worse and interspersed with Chinese; if you need English replies, please use the --english option in cli_demo.py.
|
158 |
+
|
159 |
+
We also provide a typewriter effect command line tool inherited from ChatGLM-6B, which uses the Huggingface model:
|
160 |
+
```shell
|
161 |
+
python cli_demo_hf.py
|
162 |
+
```
|
163 |
+
|
164 |
+
### Web Demo
|
165 |
+
![web_demo](examples/web_demo.png)
|
166 |
+
|
167 |
+
We provide a web demo based on [Gradio](https://gradio.app). First, install Gradio: `pip install gradio`.
|
168 |
+
Then download and enter this repository and run `web_demo.py`:
|
169 |
+
|
170 |
+
```
|
171 |
+
git clone https://github.com/THUDM/VisualGLM-6B
|
172 |
+
cd VisualGLM-6B
|
173 |
+
python web_demo.py
|
174 |
+
```
|
175 |
+
The program will automatically download the sat model and run a Web Server, outputting the address. Open the output address in your browser to use it.
|
176 |
+
|
177 |
+
We also provide a web tool with a typewriter effect inherited from ChatGLM-6B, which uses the Huggingface model and will run on port :8080 after starting:
|
178 |
+
```shell
|
179 |
+
python web_demo_hf.py
|
180 |
+
```
|
181 |
+
|
182 |
+
Both web demos accept the command line parameter --share to generate a public link for gradio, and accept --quant 4 and --quant 8 to use 4-bit quantization/8-bit quantization to reduce GPU memory usage.
|
183 |
+
|
184 |
+
### API Deployment
|
185 |
+
First, you need to install additional dependencies pip install fastapi uvicorn, then run the api.py in the repository:
|
186 |
+
```shell
|
187 |
+
python api.py
|
188 |
+
```
|
189 |
+
The program will automatically download the sat model, and by default it will be deployed on local port 8080 and called through the POST method. Below is an example of a request with curl, but in general you can also use a code method to POST.
|
190 |
+
```shell
|
191 |
+
echo "{\"image\":\"$(base64 path/to/example.jpg)\",\"text\":\"Describe this picture\",\"history\":[]}" > temp.json
|
192 |
+
curl -X POST -H "Content-Type: application/json" -d @temp.json http://127.0.0.1:8080
|
193 |
+
```
|
194 |
+
|
195 |
+
We also provide an api_hf.py that uses the Huggingface model, which works the same way as the sat model's api:
|
196 |
+
```shell
|
197 |
+
python api_hf.py
|
198 |
+
```
|
199 |
+
|
200 |
+
|
201 |
+

## Model Quantization
In the Huggingface implementation, the model is loaded with FP16 precision by default, and running the code above requires about 15GB of GPU memory. If your GPU memory is limited, you can try loading the model in quantized form:
```python
# Modify as needed; currently only 4-bit and 8-bit quantization are supported.
# The following only quantizes ChatGLM, because the quantization error is larger for ViT.
model = AutoModel.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True).quantize(8).half().cuda()
```
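
For completeness, a minimal usage sketch of the quantized Huggingface model, following the same `stream_chat` loop used by cli_demo_hf.py and api_hf.py elsewhere in this repository (the example image path is illustrative):

```python
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True).quantize(8).half().cuda()
model = model.eval()

# stream_chat yields partial (response, history) pairs; the last pair holds the full answer.
response, history = "", []
for response, history in model.stream_chat(tokenizer, "examples/1.jpeg", "描述这张图片。", history=history):
    pass
print(response)
```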

In the sat implementation, you need to set the loading device to 'cpu' first and then quantize. Here's how; see cli_demo.py for details:
```python
from sat.quantization.kernels import quantize
model = quantize(model.transformer, args.quant).cuda()
# Pass model.transformer to quantize only ChatGLM, because the quantization error is larger for ViT.
```
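
Put together, the sat-side loading path looks roughly like the following, condensed from cli_demo.py in this repository (the `quant` value is illustrative):

```python
import argparse
import torch
from sat.model import AutoModel
from sat.quantization.kernels import quantize

quant = 8  # or 4; None disables quantization

# Load on CPU (without GPU initialization) when quantizing, otherwise directly on GPU.
model, model_args = AutoModel.from_pretrained(
    "visualglm-6b",
    args=argparse.Namespace(
        fp16=True,
        skip_init=True,
        use_gpu_initialization=(torch.cuda.is_available() and quant is None),
        device='cuda' if (torch.cuda.is_available() and quant is None) else 'cpu',
    ))
model = model.eval()

if quant:
    quantize(model.transformer, quant)  # quantize only the ChatGLM part
    if torch.cuda.is_available():
        model = model.cuda()
```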

## Limitations
This project is at the V1 stage of its visual and language model parameters and was trained with a relatively small amount of computation. We have identified the following main areas for improvement:

- Image description factuality / model hallucination. When generating long descriptions of an image, the language model increasingly dominates as the text gets further from the image, so the model may generate content, based on context, that is not actually present in the image.
- Attribute mismatch. In scenes with multiple objects, attributes of one object are sometimes incorrectly attached to another.
- Resolution. This project uses a resolution of 224*224, the most common size in visual models; however, finer-grained understanding requires a larger resolution and more computation.
- Due to data and other constraints, the model currently cannot perform Chinese OCR (it has some ability for English OCR); we will add this capability in future versions.

## License

The code in this repository is open source under the Apache-2.0 license, while the use of the VisualGLM-6B model weights must comply with the Model License.

## Citation & Acknowledgements
If you find our work helpful, please consider citing the following papers:
```
@inproceedings{du2022glm,
  title={GLM: General Language Model Pretraining with Autoregressive Blank Infilling},
  author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie},
  booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages={320--335},
  year={2022}
}
@article{ding2021cogview,
  title={Cogview: Mastering text-to-image generation via transformers},
  author={Ding, Ming and Yang, Zhuoyi and Hong, Wenyi and Zheng, Wendi and Zhou, Chang and Yin, Da and Lin, Junyang and Zou, Xu and Shao, Zhou and Yang, Hongxia and others},
  journal={Advances in Neural Information Processing Systems},
  volume={34},
  pages={19822--19835},
  year={2021}
}
```
In the instruction fine-tuning phase of the VisualGLM-6B dataset, some of the English image-text data come from the [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4) and [LLAVA](https://github.com/haotian-liu/LLaVA) projects, as well as many classic cross-modal datasets. We sincerely thank them for their contributions.
__pycache__/finetune_visualglm.cpython-310.pyc
ADDED
Binary file (7 kB)
__pycache__/lora_mixin.cpython-310.pyc
ADDED
Binary file (10.6 kB)
api.py
ADDED
@@ -0,0 +1,51 @@
import os
import json
import uvicorn
from fastapi import FastAPI, Request
from model import is_chinese, get_infer_setting, generate_input, chat
import datetime
import torch

# Load the sat model and the ChatGLM tokenizer once at startup.
gpu_number = 0
model, tokenizer = get_infer_setting(gpu_device=gpu_number)

app = FastAPI()

@app.post('/')
async def visual_glm(request: Request):
    json_post_raw = await request.json()
    print("Start to process request")

    json_post = json.dumps(json_post_raw)
    request_data = json.loads(json_post)
    input_text, input_image_encoded, history = request_data['text'], request_data['image'], request_data['history']
    # Default generation parameters; fields present in the request body override them.
    input_para = {
        "max_length": 2048,
        "min_length": 50,
        "temperature": 0.8,
        "top_p": 0.4,
        "top_k": 100,
        "repetition_penalty": 1.2
    }
    input_para.update(request_data)

    is_zh = is_chinese(input_text)
    input_data = generate_input(input_text, input_image_encoded, history, input_para)
    input_image, gen_kwargs = input_data['input_image'], input_data['gen_kwargs']
    with torch.no_grad():
        answer, history, _ = chat(None, model, tokenizer, input_text, history=history, image=input_image,
                                  max_length=gen_kwargs['max_length'], top_p=gen_kwargs['top_p'],
                                  top_k=gen_kwargs['top_k'], temperature=gen_kwargs['temperature'], english=not is_zh)

    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    response = {
        "result": answer,
        "history": history,
        "status": 200,
        "time": time
    }
    return response


if __name__ == '__main__':
    uvicorn.run(app, host='0.0.0.0', port=8080, workers=1)
api_hf.py
ADDED
@@ -0,0 +1,49 @@
import os
import json
from transformers import AutoTokenizer, AutoModel
import uvicorn
from fastapi import FastAPI, Request
import datetime
from model import process_image
import torch

tokenizer = AutoTokenizer.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True).half().cuda()


app = FastAPI()

@app.post('/')
async def visual_glm(request: Request):
    json_post_raw = await request.json()
    print("Start to process request")

    json_post = json.dumps(json_post_raw)
    request_data = json.loads(json_post)

    history = request_data.get("history")
    image_encoded = request_data.get("image")
    query = request_data.get("text")
    # process_image decodes the base64 image and caches it to disk, returning its path.
    image_path = process_image(image_encoded)

    with torch.no_grad():
        result = model.stream_chat(tokenizer, image_path, query, history=history)
        last_result = None
        # Consume the stream; the last yielded value holds the full answer.
        for value in result:
            last_result = value
        answer = last_result[0]

    # Remove the cached image once the answer has been generated.
    if os.path.isfile(image_path):
        os.remove(image_path)
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    response = {
        "result": answer,
        "history": history,
        "status": 200,
        "time": time
    }
    return response


if __name__ == "__main__":
    uvicorn.run(app, host='0.0.0.0', port=8080, workers=1)
cli_demo.py
ADDED
@@ -0,0 +1,103 @@
# -*- encoding: utf-8 -*-

import os
import sys
import torch
import argparse
from transformers import AutoTokenizer
from sat.model.mixins import CachedAutoregressiveMixin
from sat.quantization.kernels import quantize

from model import VisualGLMModel, chat
from finetune_visualglm import FineTuneVisualGLMModel
from sat.model import AutoModel


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--max_length", type=int, default=2048, help='max length of the total sequence')
    parser.add_argument("--top_p", type=float, default=0.4, help='top p for nucleus sampling')
    parser.add_argument("--top_k", type=int, default=100, help='top k for top k sampling')
    parser.add_argument("--temperature", type=float, default=.8, help='temperature for sampling')
    parser.add_argument("--english", action='store_true', help='only output English')
    parser.add_argument("--quant", choices=[8, 4], type=int, default=None, help='quantization bits')
    parser.add_argument("--from_pretrained", type=str, default="visualglm-6b", help='pretrained ckpt')
    parser.add_argument("--prompt_zh", type=str, default="描述这张图片。", help='Chinese prompt for the first round')
    parser.add_argument("--prompt_en", type=str, default="Describe the image.", help='English prompt for the first round')
    args = parser.parse_args()

    # load model
    model, model_args = AutoModel.from_pretrained(
        args.from_pretrained,
        args=argparse.Namespace(
            fp16=True,
            skip_init=True,
            use_gpu_initialization=True if (torch.cuda.is_available() and args.quant is None) else False,
            device='cuda' if (torch.cuda.is_available() and args.quant is None) else 'cpu',
        ))
    model = model.eval()

    if args.quant:
        quantize(model.transformer, args.quant)
        if torch.cuda.is_available():
            model = model.cuda()

    model.add_mixin('auto-regressive', CachedAutoregressiveMixin())

    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
    if not args.english:
        print('欢迎使用 VisualGLM-6B 模型,输入图像URL或本地路径读图,继续输入内容对话,clear 重新开始,stop 终止程序')
    else:
        print('Welcome to VisualGLM-6B model. Enter an image URL or local file path to load an image. Continue inputting text to engage in a conversation. Type "clear" to start over, or "stop" to end the program.')
    with torch.no_grad():
        while True:
            history = None
            cache_image = None
            if not args.english:
                image_path = input("请输入图像路径或URL(回车进入纯文本对话): ")
            else:
                image_path = input("Please enter the image path or URL (press Enter for plain text conversation): ")

            if image_path == 'stop':
                break
            if len(image_path) > 0:
                query = args.prompt_en if args.english else args.prompt_zh
            else:
                if not args.english:
                    query = input("用户:")
                else:
                    query = input("User: ")
            while True:
                if query == "clear":
                    break
                if query == "stop":
                    sys.exit(0)
                try:
                    response, history, cache_image = chat(
                        image_path,
                        model,
                        tokenizer,
                        query,
                        history=history,
                        image=cache_image,
                        max_length=args.max_length,
                        top_p=args.top_p,
                        temperature=args.temperature,
                        top_k=args.top_k,
                        english=args.english,
                        invalid_slices=[slice(63823, 130000)] if args.english else []
                    )
                except Exception as e:
                    print(e)
                    break
                sep = 'A:' if args.english else '答:'
                print("VisualGLM-6B:"+response.split(sep)[-1].strip())
                image_path = None
                if not args.english:
                    query = input("用户:")
                else:
                    query = input("User: ")


if __name__ == "__main__":
    main()
cli_demo_hf.py
ADDED
@@ -0,0 +1,69 @@
1 |
+
import os
|
2 |
+
import platform
|
3 |
+
import signal
|
4 |
+
from transformers import AutoTokenizer, AutoModel
|
5 |
+
import torch
|
6 |
+
|
7 |
+
tokenizer = AutoTokenizer.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True)
|
8 |
+
model = AutoModel.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True).half().cuda()
|
9 |
+
model = model.eval()
|
10 |
+
|
11 |
+
os_name = platform.system()
|
12 |
+
clear_command = 'cls' if os_name == 'Windows' else 'clear'
|
13 |
+
stop_stream = False
|
14 |
+
|
15 |
+
|
16 |
+
def build_prompt(history, prefix):
|
17 |
+
prompt = prefix
|
18 |
+
for query, response in history:
|
19 |
+
prompt += f"\n\n用户:{query}"
|
20 |
+
prompt += f"\n\nVisualGLM-6B:{response}"
|
21 |
+
return prompt
|
22 |
+
|
23 |
+
|
24 |
+
def signal_handler(signal, frame):
|
25 |
+
global stop_stream
|
26 |
+
stop_stream = True
|
27 |
+
|
28 |
+
|
29 |
+
def main():
|
30 |
+
global stop_stream
|
31 |
+
while True:
|
32 |
+
history = []
|
33 |
+
prefix = "欢迎使用 VisualGLM-6B 模型,输入图片路径和内容即可进行对话,clear 清空对话历史,stop 终止程序"
|
34 |
+
print(prefix)
|
35 |
+
image_path = input("\n请输入图片路径:")
|
36 |
+
if image_path == "stop":
|
37 |
+
break
|
38 |
+
prefix = prefix + "\n" + image_path
|
39 |
+
query = "描述这张图片。"
|
40 |
+
while True:
|
41 |
+
count = 0
|
42 |
+
with torch.no_grad():
|
43 |
+
for response, history in model.stream_chat(tokenizer, image_path, query, history=history):
|
44 |
+
if stop_stream:
|
45 |
+
stop_stream = False
|
46 |
+
break
|
47 |
+
else:
|
48 |
+
count += 1
|
49 |
+
if count % 8 == 0:
|
50 |
+
os.system(clear_command)
|
51 |
+
print(build_prompt(history, prefix), flush=True)
|
52 |
+
signal.signal(signal.SIGINT, signal_handler)
|
53 |
+
os.system(clear_command)
|
54 |
+
print(build_prompt(history, prefix), flush=True)
|
55 |
+
query = input("\n用户:")
|
56 |
+
if query.strip() == "clear":
|
57 |
+
break
|
58 |
+
if query.strip() == "stop":
|
59 |
+
stop_stream = True
|
60 |
+
exit(0)
|
61 |
+
# if query.strip() == "clear":
|
62 |
+
# history = []
|
63 |
+
# os.system(clear_command)
|
64 |
+
# print(prefix)
|
65 |
+
# continue
|
66 |
+
|
67 |
+
|
68 |
+
if __name__ == "__main__":
|
69 |
+
main()
|
examples/1.jpeg
ADDED
examples/2.jpeg
ADDED
examples/3.jpeg
ADDED
examples/chat_example1.png
ADDED
examples/chat_example2.png
ADDED
examples/chat_example3.png
ADDED
examples/example_inputs.jsonl
ADDED
@@ -0,0 +1,3 @@
{"id":1, "text": "描述一下这个场景", "image": "examples/1.jpeg"}
{"id":2, "text": "这是什么东西", "image": "examples/2.jpeg"}
{"id":3, "text": "这张图片描述了什么", "image": "examples/3.jpeg"}
examples/thu.png
ADDED
examples/web_demo.png
ADDED
fewshot-data.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e96484834c7d9bda898f8da5f658ea63268ebd3aa10ac7f0da3b3dc40a86e1b7
size 6695260
finetune/finetune_visualglm.sh
ADDED
@@ -0,0 +1,58 @@
1 |
+
#! /bin/bash
|
2 |
+
NUM_WORKERS=1
|
3 |
+
NUM_GPUS_PER_WORKER=8
|
4 |
+
MP_SIZE=1
|
5 |
+
|
6 |
+
script_path=$(realpath $0)
|
7 |
+
script_dir=$(dirname $script_path)
|
8 |
+
main_dir=$(dirname $script_dir)
|
9 |
+
MODEL_TYPE="visualglm-6b"
|
10 |
+
MODEL_ARGS="--max_source_length 64 \
|
11 |
+
--max_target_length 256 \
|
12 |
+
--lora_rank 10 \
|
13 |
+
--layer_range 0 14 \
|
14 |
+
--pre_seq_len 4"
|
15 |
+
|
16 |
+
# OPTIONS_SAT="SAT_HOME=$1" #"SAT_HOME=/raid/dm/sat_models"
|
17 |
+
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2"
|
18 |
+
HOST_FILE_PATH="hostfile"
|
19 |
+
HOST_FILE_PATH="hostfile_single"
|
20 |
+
|
21 |
+
train_data="./fewshot-data/dataset.json"
|
22 |
+
eval_data="./fewshot-data/dataset.json"
|
23 |
+
|
24 |
+
|
25 |
+
gpt_options=" \
|
26 |
+
--experiment-name finetune-$MODEL_TYPE \
|
27 |
+
--model-parallel-size ${MP_SIZE} \
|
28 |
+
--mode finetune \
|
29 |
+
--train-iters 300 \
|
30 |
+
--resume-dataloader \
|
31 |
+
$MODEL_ARGS \
|
32 |
+
--train-data ${train_data} \
|
33 |
+
--valid-data ${eval_data} \
|
34 |
+
--distributed-backend nccl \
|
35 |
+
--lr-decay-style cosine \
|
36 |
+
--warmup .02 \
|
37 |
+
--checkpoint-activations \
|
38 |
+
--save-interval 300 \
|
39 |
+
--eval-interval 10000 \
|
40 |
+
--save "./checkpoints" \
|
41 |
+
--split 1 \
|
42 |
+
--eval-iters 10 \
|
43 |
+
--eval-batch-size 8 \
|
44 |
+
--zero-stage 1 \
|
45 |
+
--lr 0.0001 \
|
46 |
+
--batch-size 4 \
|
47 |
+
--skip-init \
|
48 |
+
--fp16 \
|
49 |
+
--use_lora
|
50 |
+
"
|
51 |
+
|
52 |
+
|
53 |
+
|
54 |
+
run_cmd="${OPTIONS_NCCL} ${OPTIONS_SAT} deepspeed --master_port 16666 --hostfile ${HOST_FILE_PATH} finetune_visualglm.py ${gpt_options}"
|
55 |
+
echo ${run_cmd}
|
56 |
+
eval ${run_cmd}
|
57 |
+
|
58 |
+
set +x
|
finetune/finetune_visualglm_qlora.sh
ADDED
@@ -0,0 +1,59 @@
1 |
+
#! /bin/bash
|
2 |
+
NUM_WORKERS=1
|
3 |
+
NUM_GPUS_PER_WORKER=8
|
4 |
+
MP_SIZE=1
|
5 |
+
|
6 |
+
script_path=$(realpath $0)
|
7 |
+
script_dir=$(dirname $script_path)
|
8 |
+
main_dir=$(dirname $script_dir)
|
9 |
+
MODEL_TYPE="visualglm-6b"
|
10 |
+
MODEL_ARGS="--max_source_length 64 \
|
11 |
+
--max_target_length 256 \
|
12 |
+
--lora_rank 10 \
|
13 |
+
--layer_range 0 14 \
|
14 |
+
--pre_seq_len 4"
|
15 |
+
|
16 |
+
# OPTIONS_SAT="SAT_HOME=$1" #"SAT_HOME=/raid/dm/sat_models"
|
17 |
+
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2"
|
18 |
+
HOST_FILE_PATH="hostfile"
|
19 |
+
HOST_FILE_PATH="hostfile_single"
|
20 |
+
|
21 |
+
train_data="./fewshot-data/dataset.json"
|
22 |
+
eval_data="./fewshot-data/dataset.json"
|
23 |
+
|
24 |
+
|
25 |
+
gpt_options=" \
|
26 |
+
--experiment-name finetune-$MODEL_TYPE \
|
27 |
+
--model-parallel-size ${MP_SIZE} \
|
28 |
+
--mode finetune \
|
29 |
+
--train-iters 300 \
|
30 |
+
--resume-dataloader \
|
31 |
+
$MODEL_ARGS \
|
32 |
+
--train-data ${train_data} \
|
33 |
+
--valid-data ${eval_data} \
|
34 |
+
--distributed-backend nccl \
|
35 |
+
--lr-decay-style cosine \
|
36 |
+
--warmup .02 \
|
37 |
+
--checkpoint-activations \
|
38 |
+
--save-interval 300 \
|
39 |
+
--eval-interval 10000 \
|
40 |
+
--save "./checkpoints" \
|
41 |
+
--split 1 \
|
42 |
+
--eval-iters 10 \
|
43 |
+
--eval-batch-size 8 \
|
44 |
+
--zero-stage 1 \
|
45 |
+
--lr 0.0001 \
|
46 |
+
--batch-size 1 \
|
47 |
+
--gradient-accumulation-steps 4 \
|
48 |
+
--skip-init \
|
49 |
+
--fp16 \
|
50 |
+
--use_qlora
|
51 |
+
"
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
run_cmd="${OPTIONS_NCCL} ${OPTIONS_SAT} deepspeed --master_port 16666 --include localhost:0 --hostfile ${HOST_FILE_PATH} finetune_visualglm.py ${gpt_options}"
|
56 |
+
echo ${run_cmd}
|
57 |
+
eval ${run_cmd}
|
58 |
+
|
59 |
+
set +x
|
finetune_visualglm.py
ADDED
@@ -0,0 +1,195 @@
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
import argparse
|
4 |
+
|
5 |
+
from sat import mpu, get_args, get_tokenizer
|
6 |
+
from sat.training.deepspeed_training import training_main
|
7 |
+
from model import VisualGLMModel
|
8 |
+
from sat.model.finetune import PTuningV2Mixin
|
9 |
+
from lora_mixin import LoraMixin
|
10 |
+
|
11 |
+
class FineTuneVisualGLMModel(VisualGLMModel):
|
12 |
+
def __init__(self, args, transformer=None, parallel_output=True, **kw_args):
|
13 |
+
super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kw_args)
|
14 |
+
if args.use_ptuning:
|
15 |
+
self.add_mixin("ptuning", PTuningV2Mixin(args.num_layers, args.hidden_size // args.num_attention_heads, args.num_attention_heads, args.pre_seq_len))
|
16 |
+
if args.use_lora:
|
17 |
+
# If you use lora on other "normal" Transformer, just use it with head_first=False (by default)
|
18 |
+
self.add_mixin("lora", LoraMixin(args.num_layers, args.lora_rank, head_first=True, num_attention_heads=args.num_attention_heads, hidden_size_per_attention_head=args.hidden_size // args.num_attention_heads, layer_range=args.layer_range), reinit=True)
|
19 |
+
# self.get_mixin("eva").model.glm_proj = replace_linear_with_lora(self.get_mixin("eva").model.glm_proj, LoraLinear, args.lora_rank)
|
20 |
+
elif args.use_qlora:
|
21 |
+
self.add_mixin("lora", LoraMixin(args.num_layers, args.lora_rank, head_first=True, num_attention_heads=args.num_attention_heads, hidden_size_per_attention_head=args.hidden_size // args.num_attention_heads, layer_range=args.layer_range, qlora=True), reinit=True)
|
22 |
+
self.args = args
|
23 |
+
|
24 |
+
@classmethod
|
25 |
+
def add_model_specific_args(cls, parser):
|
26 |
+
group = parser.add_argument_group('VisualGLM-finetune', 'VisualGLM finetune Configurations')
|
27 |
+
group.add_argument('--pre_seq_len', type=int, default=8)
|
28 |
+
group.add_argument('--lora_rank', type=int, default=10)
|
29 |
+
group.add_argument('--use_ptuning', action="store_true")
|
30 |
+
group.add_argument('--use_lora', action="store_true")
|
31 |
+
group.add_argument('--use_qlora', action="store_true")
|
32 |
+
group.add_argument('--layer_range', nargs='+', type=int, default=None)
|
33 |
+
return super().add_model_specific_args(parser)
|
34 |
+
|
35 |
+
def disable_untrainable_params(self):
|
36 |
+
enable = []
|
37 |
+
if self.args.use_ptuning:
|
38 |
+
enable.extend(['ptuning'])
|
39 |
+
if self.args.use_lora or self.args.use_qlora:
|
40 |
+
enable.extend(['matrix_A', 'matrix_B'])
|
41 |
+
for n, p in self.named_parameters():
|
42 |
+
flag = False
|
43 |
+
for e in enable:
|
44 |
+
if e.lower() in n.lower():
|
45 |
+
flag = True
|
46 |
+
break
|
47 |
+
if not flag:
|
48 |
+
p.requires_grad_(False)
|
49 |
+
else:
|
50 |
+
print(n)
|
51 |
+
|
52 |
+
|
53 |
+
def get_batch(data_iterator, args, timers):
|
54 |
+
# Items and their type.
|
55 |
+
keys = ['input_ids', 'labels']
|
56 |
+
datatype = torch.int64
|
57 |
+
|
58 |
+
# Broadcast data.
|
59 |
+
timers('data loader').start()
|
60 |
+
if data_iterator is not None:
|
61 |
+
data = next(data_iterator)
|
62 |
+
else:
|
63 |
+
data = None
|
64 |
+
timers('data loader').stop()
|
65 |
+
data_b = mpu.broadcast_data(keys, data, datatype)
|
66 |
+
data_i = mpu.broadcast_data(['image'], data, torch.float32)
|
67 |
+
# Unpack.
|
68 |
+
tokens = data_b['input_ids'].long()
|
69 |
+
labels = data_b['labels'].long()
|
70 |
+
img = data_i['image']
|
71 |
+
if args.fp16:
|
72 |
+
img = img.half()
|
73 |
+
|
74 |
+
return tokens, labels, img, data['pre_image']
|
75 |
+
|
76 |
+
|
77 |
+
from torch.nn import CrossEntropyLoss
|
78 |
+
|
79 |
+
def forward_step(data_iterator, model, args, timers):
|
80 |
+
"""Forward step."""
|
81 |
+
|
82 |
+
# Get the batch.
|
83 |
+
timers('batch generator').start()
|
84 |
+
tokens, labels, image, pre_image = get_batch(
|
85 |
+
data_iterator, args, timers)
|
86 |
+
timers('batch generator').stop()
|
87 |
+
|
88 |
+
logits = model(input_ids=tokens, image=image, pre_image=pre_image)[0]
|
89 |
+
dtype = logits.dtype
|
90 |
+
lm_logits = logits.to(torch.float32)
|
91 |
+
|
92 |
+
# Shift so that tokens < n predict n
|
93 |
+
shift_logits = lm_logits[..., :-1, :].contiguous()
|
94 |
+
shift_labels = labels[..., 1:].contiguous()
|
95 |
+
# Flatten the tokens
|
96 |
+
loss_fct = CrossEntropyLoss(ignore_index=-100)
|
97 |
+
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
98 |
+
|
99 |
+
lm_logits = lm_logits.to(dtype)
|
100 |
+
loss = loss.to(dtype)
|
101 |
+
return loss, {'loss': loss}
|
102 |
+
|
103 |
+
|
104 |
+
from model.blip2 import BlipImageEvalProcessor
|
105 |
+
from torch.utils.data import Dataset
|
106 |
+
import json
|
107 |
+
from PIL import Image
|
108 |
+
|
109 |
+
class FewShotDataset(Dataset):
|
110 |
+
def __init__(self, path, processor, tokenizer, args):
|
111 |
+
max_seq_length = args.max_source_length + args.max_target_length
|
112 |
+
with open(path, 'r', encoding='utf-8') as f:
|
113 |
+
data = json.load(f)
|
114 |
+
self.images = []
|
115 |
+
self.input_ids = []
|
116 |
+
self.labels = []
|
117 |
+
for item in data:
|
118 |
+
image = processor(Image.open(item['img']).convert('RGB'))
|
119 |
+
input0 = tokenizer.encode("<img>", add_special_tokens=False)
|
120 |
+
input1 = [tokenizer.pad_token_id] * args.image_length
|
121 |
+
input2 = tokenizer.encode("</img>问:"+item['prompt']+"\n答:", add_special_tokens=False)
|
122 |
+
a_ids = sum([input0, input1, input2], [])
|
123 |
+
b_ids = tokenizer.encode(text=item['label'], add_special_tokens=False)
|
124 |
+
if len(a_ids) > args.max_source_length - 1:
|
125 |
+
a_ids = a_ids[: args.max_source_length - 1]
|
126 |
+
if len(b_ids) > args.max_target_length - 2:
|
127 |
+
b_ids = b_ids[: args.max_target_length - 2]
|
128 |
+
pre_image = len(input0)
|
129 |
+
input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids)
|
130 |
+
|
131 |
+
context_length = input_ids.index(tokenizer.bos_token_id)
|
132 |
+
mask_position = context_length - 1
|
133 |
+
labels = [-100] * context_length + input_ids[mask_position+1:]
|
134 |
+
|
135 |
+
pad_len = max_seq_length - len(input_ids)
|
136 |
+
input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
|
137 |
+
labels = labels + [tokenizer.pad_token_id] * pad_len
|
138 |
+
if args.ignore_pad_token_for_loss:
|
139 |
+
labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]
|
140 |
+
self.images.append(image)
|
141 |
+
self.input_ids.append(input_ids)
|
142 |
+
self.labels.append(labels)
|
143 |
+
self.pre_image = pre_image
|
144 |
+
|
145 |
+
def __len__(self):
|
146 |
+
return len(self.images)
|
147 |
+
|
148 |
+
def __getitem__(self, idx):
|
149 |
+
return {
|
150 |
+
"image": self.images[idx],
|
151 |
+
"input_ids": self.input_ids[idx],
|
152 |
+
"labels": self.labels[idx],
|
153 |
+
"pre_image": self.pre_image
|
154 |
+
}
|
155 |
+
|
156 |
+
|
157 |
+
def create_dataset_function(path, args):
|
158 |
+
tokenizer = get_tokenizer(args)
|
159 |
+
image_processor = BlipImageEvalProcessor(224)
|
160 |
+
|
161 |
+
dataset = FewShotDataset(path, image_processor, tokenizer, args)
|
162 |
+
return dataset
|
163 |
+
|
164 |
+
|
165 |
+
if __name__ == '__main__':
|
166 |
+
py_parser = argparse.ArgumentParser(add_help=False)
|
167 |
+
py_parser.add_argument('--max_source_length', type=int)
|
168 |
+
py_parser.add_argument('--max_target_length', type=int)
|
169 |
+
py_parser.add_argument('--ignore_pad_token_for_loss', type=bool, default=True)
|
170 |
+
# py_parser.add_argument('--old_checkpoint', action="store_true")
|
171 |
+
py_parser.add_argument('--source_prefix', type=str, default="")
|
172 |
+
py_parser = FineTuneVisualGLMModel.add_model_specific_args(py_parser)
|
173 |
+
known, args_list = py_parser.parse_known_args()
|
174 |
+
args = get_args(args_list)
|
175 |
+
args = argparse.Namespace(**vars(args), **vars(known))
|
176 |
+
args.device = 'cpu'
|
177 |
+
|
178 |
+
model_type = 'visualglm-6b'
|
179 |
+
model, args = FineTuneVisualGLMModel.from_pretrained(model_type, args)
|
180 |
+
if torch.cuda.is_available():
|
181 |
+
model = model.to('cuda')
|
182 |
+
tokenizer = get_tokenizer(args)
|
183 |
+
label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id
|
184 |
+
def data_collator(examples):
|
185 |
+
for example in examples:
|
186 |
+
example['input_ids'] = torch.tensor(example['input_ids'], dtype=torch.long)
|
187 |
+
example['labels'] = torch.tensor(example['labels'], dtype=torch.long)
|
188 |
+
ret = {
|
189 |
+
'input_ids': torch.stack([example['input_ids'] for example in examples]),
|
190 |
+
'labels': torch.stack([example['labels'] for example in examples]),
|
191 |
+
'image': torch.stack([example['image'] for example in examples]),
|
192 |
+
'pre_image': example['pre_image']
|
193 |
+
}
|
194 |
+
return ret
|
195 |
+
training_main(args, model_cls=model, forward_step_function=forward_step, create_dataset_function=create_dataset_function, collate_fn=data_collator)
|
lora_mixin.py
ADDED
@@ -0,0 +1,260 @@
1 |
+
"""
|
2 |
+
In this mixin, I use a different implementation than sat/model/finetune/lora.py
|
3 |
+
I just use a fake linear layer to replace any model with lora mixin.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import torch
|
7 |
+
import torch.nn as nn
|
8 |
+
from sat.model.base_model import BaseMixin
|
9 |
+
import math
|
10 |
+
from sat.helpers import print_all
|
11 |
+
from sat.model.transformer import RowParallelLinear, ColumnParallelLinear
|
12 |
+
|
13 |
+
class HackLinear(nn.Linear):
|
14 |
+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
|
15 |
+
if prefix + 'weight' in state_dict:
|
16 |
+
self.weight.data.copy_(state_dict[prefix+'weight'])
|
17 |
+
if prefix + 'bias' in state_dict:
|
18 |
+
self.bias.data.copy_(state_dict[prefix+'bias'])
|
19 |
+
|
20 |
+
class HackRowParallelLinear(RowParallelLinear):
|
21 |
+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
|
22 |
+
if prefix + 'weight' in state_dict:
|
23 |
+
self.weight.data.copy_(state_dict[prefix+'weight'])
|
24 |
+
if prefix + 'bias' in state_dict:
|
25 |
+
self.bias.data.copy_(state_dict[prefix+'bias'])
|
26 |
+
|
27 |
+
class HackColumnParallelLinear(ColumnParallelLinear):
|
28 |
+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
|
29 |
+
if prefix + 'weight' in state_dict:
|
30 |
+
self.weight.data.copy_(state_dict[prefix+'weight'])
|
31 |
+
if prefix + 'bias' in state_dict:
|
32 |
+
self.bias.data.copy_(state_dict[prefix+'bias'])
|
33 |
+
|
34 |
+
try:
|
35 |
+
from bitsandbytes.nn import LinearNF4
|
36 |
+
def copy_nested_list(src, dst):
|
37 |
+
for i in range(len(dst)):
|
38 |
+
if type(dst[i]) is torch.Tensor:
|
39 |
+
dst[i].copy_(src[i])
|
40 |
+
elif type(dst[i]) is list:
|
41 |
+
copy_nested_list(src[i], dst[i])
|
42 |
+
else:
|
43 |
+
dst[i] = src[i]
|
44 |
+
class HackLinearNF4(LinearNF4):
|
45 |
+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
|
46 |
+
if prefix + 'weight' in state_dict:
|
47 |
+
self.weight.data.copy_(state_dict[prefix+'weight'])
|
48 |
+
if self.weight.data.dtype == torch.uint8:
|
49 |
+
copy_nested_list(state_dict[prefix+'quant_state'], self.weight.quant_state)
|
50 |
+
if prefix + 'bias' in state_dict:
|
51 |
+
self.bias.data.copy_(state_dict[prefix+'bias'])
|
52 |
+
def _save_to_state_dict(self, destination, prefix, keep_vars):
|
53 |
+
super()._save_to_state_dict(destination, prefix, keep_vars)
|
54 |
+
destination[prefix+'quant_state'] = self.weight.quant_state
|
55 |
+
except Exception as exception:
|
56 |
+
print_all("Failed to load bitsandbytes:" + str(exception), level='WARNING')
|
57 |
+
|
58 |
+
|
59 |
+
class HackParameterList(nn.ParameterList):
|
60 |
+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
|
61 |
+
for i in range(len(self)):
|
62 |
+
if prefix + str(i) in state_dict:
|
63 |
+
self[i].data.copy_(state_dict[prefix+str(i)])
|
64 |
+
|
65 |
+
map_cls = {
|
66 |
+
nn.Linear: (HackLinear, {}),
|
67 |
+
ColumnParallelLinear: (HackColumnParallelLinear, {'gather_output': False}),
|
68 |
+
RowParallelLinear: (HackRowParallelLinear, {'input_is_parallel': True})
|
69 |
+
}
|
70 |
+
|
71 |
+
class LoraLinear(nn.Module):
|
72 |
+
def __init__(self, original_cls, partition, in_dim, out_dim, r, lora_alpha=1., lora_dropout=0., head_first=False, num_attention_heads=None, hidden_size_per_attention_head=None, qlora=False):
|
73 |
+
"""
|
74 |
+
You can safely use this layer ONLY WHEN the query_key_value output is in query-key-value order.
If you use a different order (as ChatGLM does), set head_first=True and pass num_attention_heads and hidden_size_per_attention_head.
|
76 |
+
"""
|
77 |
+
super().__init__()
|
78 |
+
if lora_dropout and lora_dropout > 0:
|
79 |
+
self.lora_dropout = nn.Dropout(p=lora_dropout)
|
80 |
+
else:
|
81 |
+
self.lora_dropout = lambda x: x
|
82 |
+
self.r = r
|
83 |
+
self.lora_alpha = lora_alpha
|
84 |
+
self.scaling = self.lora_alpha / self.r
|
85 |
+
if qlora:
|
86 |
+
try:
|
87 |
+
self.original = HackLinearNF4(in_dim, out_dim)
|
88 |
+
except:
|
89 |
+
raise Exception('Build 4bit layer failed. You need to install the latest bitsandbytes. Try `pip install bitsandbytes`. If you still meet error after installation, try running `from bitsandbytes.nn import LinearNF4` with python and fix the error.')
|
90 |
+
else:
|
91 |
+
base_cls, kwargs = map_cls[original_cls]
|
92 |
+
self.original = base_cls(in_dim, out_dim, **kwargs)
|
93 |
+
self.matrix_A = HackParameterList([nn.Parameter(torch.empty((r, in_dim))) for _ in range(partition)])
|
94 |
+
self.matrix_B = HackParameterList([nn.Parameter(torch.empty((out_dim // partition, r))) for _ in range(partition)])
|
95 |
+
for i in range(partition):
|
96 |
+
nn.init.kaiming_uniform_(self.matrix_A[i], a=math.sqrt(5))
|
97 |
+
nn.init.zeros_(self.matrix_B[i])
|
98 |
+
self.head_first = head_first
|
99 |
+
self.partition = partition
|
100 |
+
if head_first:
|
101 |
+
assert num_attention_heads is not None and hidden_size_per_attention_head is not None, "You should set num_attention_heads and hidden_size_per_attention_head if you use head_first=True!"
|
102 |
+
self.num_attention_heads = num_attention_heads
|
103 |
+
self.hidden_size_per_attention_head = hidden_size_per_attention_head
|
104 |
+
|
105 |
+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
|
106 |
+
# This is not a perfect version, becuase it doesn't handle errors and unexpected keys.
|
107 |
+
if prefix + 'weight' in state_dict:
|
108 |
+
# load from normal Linear
|
109 |
+
self.original._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
|
110 |
+
else:
|
111 |
+
# load from LoraLinear
|
112 |
+
super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
|
113 |
+
|
114 |
+
def forward(self, x):
|
115 |
+
mixed_raw_layer = self.original(x)
|
116 |
+
lora_outputs = []
|
117 |
+
for i in range(self.partition):
|
118 |
+
lora_outputs.append((self.lora_dropout(x) @ self.matrix_A[i].T @ self.matrix_B[i].T) * self.scaling)
|
119 |
+
if self.head_first:
|
120 |
+
new_tensor_shape = lora_outputs[0].size()[:-1] + (
|
121 |
+
self.num_attention_heads,
|
122 |
+
self.hidden_size_per_attention_head,
|
123 |
+
)
|
124 |
+
for i in range(self.partition):
|
125 |
+
lora_outputs[i] = lora_outputs[i].view(*new_tensor_shape)
|
126 |
+
mixed_raw_layer = mixed_raw_layer + torch.cat(lora_outputs, -1).view(*mixed_raw_layer.size())
|
127 |
+
else:
|
128 |
+
mixed_raw_layer = mixed_raw_layer + torch.cat(lora_outputs, -1)
|
129 |
+
|
130 |
+
return mixed_raw_layer
|
131 |
+
|
132 |
+
|
133 |
+
def replace_linear_with_lora(lin, partition, r, *args, **kw_args):
|
134 |
+
# not supported for linear without bias for now
|
135 |
+
out_dim, in_dim = lin.weight.shape
|
136 |
+
original_cls = type(lin)
|
137 |
+
del lin
|
138 |
+
return LoraLinear(original_cls, partition, in_dim, out_dim, r, *args, **kw_args)
|
139 |
+
|
140 |
+
def merge_linear_lora(lin):
|
141 |
+
if lin.original.weight.data.dtype is not torch.uint8:
|
142 |
+
weight = lin.original.weight
|
143 |
+
out_dim, in_dim = weight.shape
|
144 |
+
new_lin = nn.Linear(in_dim, out_dim)
|
145 |
+
else:
|
146 |
+
import bitsandbytes.functional as F
|
147 |
+
weight = F.dequantize_fp4(lin.original.weight.data, lin.original.weight.quant_state).to(lin.original.bias.data.dtype)
|
148 |
+
out_dim, in_dim = weight.shape
|
149 |
+
new_lin = HackLinearNF4(in_dim, out_dim)
|
150 |
+
new_lin.bias.data = lin.original.bias.data
|
151 |
+
new_qkv = []
|
152 |
+
for i in range(lin.partition):
|
153 |
+
new_qkv.append(lin.matrix_A[i].data.T.float() @ lin.matrix_B[i].data.T.float() * lin.scaling)
|
154 |
+
if lin.head_first:
|
155 |
+
ini_shape = new_qkv[0].shape
|
156 |
+
new_qkv = [x.view(ini_shape[0], lin.num_attention_heads, -1) for x in new_qkv]
|
157 |
+
new_qkv = torch.cat(new_qkv, -1).view(ini_shape[0], lin.partition*ini_shape[1])
|
158 |
+
else:
|
159 |
+
new_qkv = torch.cat(new_qkv, -1)
|
160 |
+
new_lin.weight.data = weight + new_qkv.T.to(lin.original.bias.data.dtype)
|
161 |
+
return new_lin.cuda() if torch.cuda.is_available() else new_lin
|
162 |
+
|
163 |
+
class LoraMixin(BaseMixin):
|
164 |
+
def __init__(self,
|
165 |
+
layer_num,
|
166 |
+
r: int = 0,
|
167 |
+
lora_alpha: int = 1,
|
168 |
+
lora_dropout: float = 0.,
|
169 |
+
layer_range = None,
|
170 |
+
head_first = False,
|
171 |
+
num_attention_heads = None,
|
172 |
+
hidden_size_per_attention_head = None,
|
173 |
+
qlora = False,
|
174 |
+
cross_attention = True):
|
175 |
+
super().__init__()
|
176 |
+
self.r = r
|
177 |
+
self.lora_alpha = lora_alpha
|
178 |
+
self.lora_dropout = lora_dropout
|
179 |
+
|
180 |
+
if layer_range is None:
|
181 |
+
layer_range = [i for i in range(layer_num)]
|
182 |
+
self.layer_range = layer_range
|
183 |
+
|
184 |
+
self.scaling = self.lora_alpha / self.r
|
185 |
+
self.head_first = head_first
|
186 |
+
self.num_attention_heads = num_attention_heads
|
187 |
+
self.hidden_size_per_attention_head = hidden_size_per_attention_head
|
188 |
+
self.qlora = qlora
|
189 |
+
self.cross_attention = cross_attention
|
190 |
+
|
191 |
+
def reinit(self, parent_model):
|
192 |
+
for i in self.layer_range:
|
193 |
+
print(f'replacing layer {i} attention with lora')
|
194 |
+
parent_model.transformer.layers[i].attention.dense = replace_linear_with_lora(parent_model.transformer.layers[i].attention.dense, 1, self.r, self.lora_alpha, self.lora_dropout, qlora=self.qlora)
|
195 |
+
parent_model.transformer.layers[i].attention.query_key_value = replace_linear_with_lora(parent_model.transformer.layers[i].attention.query_key_value, 3, self.r, self.lora_alpha, self.lora_dropout, head_first=self.head_first, num_attention_heads=self.num_attention_heads, hidden_size_per_attention_head=self.hidden_size_per_attention_head, qlora=self.qlora)
|
196 |
+
if self.cross_attention and parent_model.transformer.layers[i].is_decoder:
|
197 |
+
print(f'replacing layer {i} cross attention with lora')
|
198 |
+
parent_model.transformer.layers[i].cross_attention.dense = replace_linear_with_lora(parent_model.transformer.layers[i].cross_attention.dense, 1, self.r, self.lora_alpha, self.lora_dropout, qlora=self.qlora)
|
199 |
+
parent_model.transformer.layers[i].cross_attention.query = replace_linear_with_lora(parent_model.transformer.layers[i].cross_attention.query, 1, self.r, self.lora_alpha, self.lora_dropout, qlora=self.qlora)
|
200 |
+
parent_model.transformer.layers[i].cross_attention.key_value = replace_linear_with_lora(parent_model.transformer.layers[i].cross_attention.key_value, 2, self.r, self.lora_alpha, self.lora_dropout, qlora=self.qlora)
|
201 |
+
if self.qlora:
|
202 |
+
print('replacing chatglm linear layer with 4bit')
|
203 |
+
def replace_linear_with_nf4(model, name=None, cache={}):
|
204 |
+
if type(model) in (nn.Linear, RowParallelLinear, ColumnParallelLinear):
|
205 |
+
out_dim, in_dim = model.weight.shape
|
206 |
+
return HackLinearNF4(in_dim, out_dim)
|
207 |
+
names = set()
|
208 |
+
for name, child in model.named_children():
|
209 |
+
if name not in names:
|
210 |
+
if child in cache:
|
211 |
+
new_child = cache[child]
|
212 |
+
else:
|
213 |
+
new_child = replace_linear_with_nf4(child, name=name, cache=cache)
|
214 |
+
cache[child] = new_child
|
215 |
+
setattr(model, name, new_child)
|
216 |
+
names.add(name)
|
217 |
+
flag = True
|
218 |
+
while flag:
|
219 |
+
flag = False
|
220 |
+
for name, child in model.named_children():
|
221 |
+
if name not in names:
|
222 |
+
setattr(model, name, cache[child])
|
223 |
+
names.add(name)
|
224 |
+
flag = True
|
225 |
+
return model
|
226 |
+
replace_linear_with_nf4(parent_model.transformer, None, {})
|
227 |
+
|
228 |
+
def merge_lora(self):
|
229 |
+
for i in self.layer_range:
|
230 |
+
print(f'merge layer {i} lora attention back to linear')
|
231 |
+
self.transformer.layers[i].attention.dense = merge_linear_lora(self.transformer.layers[i].attention.dense)
|
232 |
+
self.transformer.layers[i].attention.query_key_value = merge_linear_lora(self.transformer.layers[i].attention.query_key_value)
|
233 |
+
if self.transformer.layers[i].is_decoder:
|
234 |
+
print(f'merge layer {i} lora cross attention back to linear')
|
235 |
+
self.transformer.layers[i].cross_attention.dense = merge_linear_lora(self.transformer.layers[i].cross_attention.dense)
|
236 |
+
self.transformer.layers[i].cross_attention.query = merge_linear_lora(self.transformer.layers[i].cross_attention.query)
|
237 |
+
self.transformer.layers[i].cross_attention.key_value = merge_linear_lora(self.transformer.layers[i].cross_attention.key_value)
|
238 |
+
|
239 |
+
if __name__ == '__main__':
|
240 |
+
class Model(nn.Module):
|
241 |
+
def __init__(self):
|
242 |
+
super().__init__()
|
243 |
+
self.child = nn.Linear(100, 200)
|
244 |
+
|
245 |
+
def forward(self, x):
|
246 |
+
return self.child(x)
|
247 |
+
|
248 |
+
model = Model()
|
249 |
+
torch.save(model.state_dict(), "linear.pt")
|
250 |
+
x = torch.randn(2, 100)
|
251 |
+
out1 = model(x)
|
252 |
+
model.child = LoraLinear(100, 200, 10)
|
253 |
+
model.load_state_dict(torch.load("linear.pt"), strict=False)
|
254 |
+
out2 = model(x)
|
255 |
+
torch.save(model.state_dict(), "lora.pt")
|
256 |
+
ckpt = torch.load("lora.pt")
|
257 |
+
breakpoint()
|
258 |
+
model.load_state_dict(ckpt, strict=False)
|
259 |
+
out3 = model(x)
|
260 |
+
breakpoint()
|
model/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .chat import chat
from .infer_util import *
from .blip2 import BlipImageEvalProcessor
model/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (235 Bytes)
model/__pycache__/blip2.cpython-310.pyc
ADDED
Binary file (4.66 kB)
model/__pycache__/chat.cpython-310.pyc
ADDED
Binary file (4.89 kB)
model/__pycache__/infer_util.cpython-310.pyc
ADDED
Binary file (1.98 kB)
model/__pycache__/visualglm.cpython-310.pyc
ADDED
Binary file (2.25 kB)
model/blip2.py
ADDED
@@ -0,0 +1,93 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
|
4 |
+
from sat.model import ViTModel, BaseModel
|
5 |
+
from sat.model import BaseMixin
|
6 |
+
from sat import AutoModel
|
7 |
+
from copy import deepcopy
|
8 |
+
from torchvision import transforms
|
9 |
+
from torchvision.transforms.functional import InterpolationMode
|
10 |
+
|
11 |
+
class LNFinalyMixin(BaseMixin):
|
12 |
+
def __init__(self, hidden_size):
|
13 |
+
super().__init__()
|
14 |
+
self.ln_vision = nn.LayerNorm(hidden_size)
|
15 |
+
|
16 |
+
def final_forward(self, logits, **kw_args):
|
17 |
+
return self.ln_vision(logits)
|
18 |
+
|
19 |
+
class EVAViT(ViTModel):
|
20 |
+
def __init__(self, args, transformer=None, parallel_output=True, **kwargs):
|
21 |
+
super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kwargs)
|
22 |
+
self.del_mixin("cls")
|
23 |
+
self.add_mixin("cls", LNFinalyMixin(args.hidden_size))
|
24 |
+
|
25 |
+
def forward(self, image):
|
26 |
+
batch_size = image.size(0)
|
27 |
+
input_ids = torch.zeros(batch_size, 1, dtype=torch.long, device=image.device)
|
28 |
+
attention_mask = torch.tensor([[1.]], dtype=image.dtype, device=image.device)
|
29 |
+
return super().forward(input_ids=input_ids, position_ids=None, attention_mask=attention_mask, image=image)
|
30 |
+
|
31 |
+
class QFormer(BaseModel):
|
32 |
+
def __init__(self, args, transformer=None, parallel_output=True, **kwargs):
|
33 |
+
super().__init__(args, transformer=transformer, parallel_output=parallel_output, activation_func=nn.functional.gelu, **kwargs)
|
34 |
+
self.transformer.position_embeddings = None
|
35 |
+
|
36 |
+
def final_forward(self, logits, **kw_args):
|
37 |
+
return logits
|
38 |
+
|
39 |
+
def position_embedding_forward(self, position_ids, **kw_args):
|
40 |
+
return None
|
41 |
+
|
42 |
+
def forward(self, encoder_outputs):
|
43 |
+
batch_size = encoder_outputs.size(0)
|
44 |
+
input_ids = torch.arange(32, dtype=torch.long, device=encoder_outputs.device).unsqueeze(0).expand(batch_size, -1)
|
45 |
+
attention_mask = torch.tensor([[1.]], dtype=encoder_outputs.dtype, device=encoder_outputs.device)
|
46 |
+
cross_attention_mask = torch.tensor([[1.]], dtype=encoder_outputs.dtype, device=encoder_outputs.device)
|
47 |
+
return super().forward(input_ids=input_ids, position_ids=None, attention_mask=attention_mask, encoder_outputs=encoder_outputs, cross_attention_mask=cross_attention_mask)
|
48 |
+
|
49 |
+
|
50 |
+
class BLIP2(torch.nn.Module):
|
51 |
+
def __init__(self, eva_args, qformer_args, vit=None, qformer=None, **kwargs):
|
52 |
+
super().__init__()
|
53 |
+
if vit is not None:
|
54 |
+
self.vit = vit
|
55 |
+
else:
|
56 |
+
self.vit = EVAViT(EVAViT.get_args(**eva_args))
|
57 |
+
if qformer is not None:
|
58 |
+
self.qformer = qformer
|
59 |
+
else:
|
60 |
+
self.qformer = QFormer(QFormer.get_args(**qformer_args))
|
61 |
+
|
62 |
+
self.glm_proj = nn.Linear(768, 4096).to(self.qformer.parameters().__next__().device).to(self.qformer.parameters().__next__().dtype)
|
63 |
+
|
64 |
+
def forward(self, image, **kwargs):
|
65 |
+
enc = self.vit(image)[0]
|
66 |
+
out = self.qformer(enc)[0]
|
67 |
+
return self.glm_proj(out)
|
68 |
+
|
69 |
+
class BlipImageBaseProcessor():
|
70 |
+
def __init__(self, mean=None, std=None):
|
71 |
+
if mean is None:
|
72 |
+
mean = (0.48145466, 0.4578275, 0.40821073)
|
73 |
+
if std is None:
|
74 |
+
std = (0.26862954, 0.26130258, 0.27577711)
|
75 |
+
|
76 |
+
self.normalize = transforms.Normalize(mean, std)
|
77 |
+
|
78 |
+
class BlipImageEvalProcessor(BlipImageBaseProcessor):
|
79 |
+
def __init__(self, image_size=384, mean=None, std=None):
|
80 |
+
super().__init__(mean=mean, std=std)
|
81 |
+
|
82 |
+
self.transform = transforms.Compose(
|
83 |
+
[
|
84 |
+
transforms.Resize(
|
85 |
+
(image_size, image_size), interpolation=InterpolationMode.BICUBIC
|
86 |
+
),
|
87 |
+
transforms.ToTensor(),
|
88 |
+
self.normalize,
|
89 |
+
]
|
90 |
+
)
|
91 |
+
|
92 |
+
def __call__(self, item):
|
93 |
+
return self.transform(item)
|
model/chat.py
ADDED
@@ -0,0 +1,175 @@
1 |
+
# -*- encoding: utf-8 -*-
|
2 |
+
'''
|
3 |
+
@File : chat.py
|
4 |
+
@Time : 2023/05/08 19:10:08
|
5 |
+
@Author : Ming Ding
|
6 |
+
@Contact : dm18@mails.tsinghua.edu.cn
|
7 |
+
'''
|
8 |
+
|
9 |
+
import os
|
10 |
+
import sys
|
11 |
+
import re
|
12 |
+
from functools import partial
|
13 |
+
from typing import Optional, Tuple, Union, List, Callable, Dict, Any
|
14 |
+
import requests
|
15 |
+
from PIL import Image
|
16 |
+
from io import BytesIO
|
17 |
+
|
18 |
+
import torch
|
19 |
+
from sat.generation.autoregressive_sampling import filling_sequence, BaseStrategy
|
20 |
+
|
21 |
+
from .blip2 import BlipImageEvalProcessor
|
22 |
+
|
23 |
+
def get_masks_and_position_ids_glm(seq, mask_position, context_length):
|
24 |
+
'''GLM model, different from GPT.
|
25 |
+
Args:
|
26 |
+
seq: torch.IntTensor, [seq_len]
|
27 |
+
mask_position: int, the position of the masked place.
|
28 |
+
context_length: int, the length of context.
|
29 |
+
Returns:
|
30 |
+
tokens: torch.IntTensor, [1, seq_len]
|
31 |
+
attention_mask: torch.FloatTensor, [1, seq_len, seq_len]
|
32 |
+
position_ids: torch.IntTensor, [2, seq_len]
|
33 |
+
'''
|
34 |
+
tokens = seq.unsqueeze(0)
|
35 |
+
|
36 |
+
attention_mask = torch.ones((1, len(seq), len(seq)), device=tokens.device)
|
37 |
+
attention_mask.tril_()
|
38 |
+
attention_mask[..., :context_length] = 1
|
39 |
+
attention_mask.unsqueeze_(1)
|
40 |
+
|
41 |
+
# 2D position ids
|
42 |
+
position_ids = torch.zeros(2, len(seq), device=tokens.device, dtype=torch.long)
|
43 |
+
torch.arange(0, context_length, out=position_ids[0, :context_length])
|
44 |
+
position_ids[0, context_length:] = mask_position
|
45 |
+
torch.arange(1, len(seq) - context_length + 1, out=position_ids[1, context_length:])
|
46 |
+
|
47 |
+
position_ids = position_ids.unsqueeze(0)
|
48 |
+
return tokens, attention_mask, position_ids
|
49 |
+
|
50 |
+
def process_response(response):
|
51 |
+
response = response.strip()
|
52 |
+
response = response.replace("[[训练时间]]", "2023年")
|
53 |
+
punkts = [
|
54 |
+
[",", ","],
|
55 |
+
["!", "!"],
|
56 |
+
[":", ":"],
|
57 |
+
[";", ";"],
|
58 |
+
["\?", "?"],
|
59 |
+
]
|
60 |
+
for item in punkts:
|
61 |
+
response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
|
62 |
+
response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
|
63 |
+
return response
|
64 |
+
|
65 |
+
def process_image(text, image=None):
|
66 |
+
'''Process image in text.
|
67 |
+
Args:
|
68 |
+
text: str, text.
|
69 |
+
image: Optional, image path / url / PIL image.
|
70 |
+
'''
|
71 |
+
image_position = text.rfind("<img>") + 5
|
72 |
+
# extract path from <img></img> using re
|
73 |
+
image_path = re.findall(r"<img>(.*?)</img>", text)
|
74 |
+
image_path = image_path[-1] if image_path[-1] else None
|
75 |
+
if image_path is not None:
|
76 |
+
assert image is None, "image and image_path cannot be both not None."
|
77 |
+
text = text.replace(image_path, "")
|
78 |
+
image_path = image_path.strip()
|
79 |
+
# url
|
80 |
+
if image_path.startswith("http"):
|
81 |
+
response = requests.get(image_path, timeout=10)
|
82 |
+
image = Image.open(BytesIO(response.content))
|
83 |
+
# local path
|
84 |
+
else:
|
85 |
+
image = Image.open(image_path)
|
86 |
+
if image is not None and isinstance(image, Image.Image):
|
87 |
+
processor = BlipImageEvalProcessor(224)
|
88 |
+
image = processor(image.convert('RGB'))
|
89 |
+
image = image.unsqueeze(0)
|
90 |
+
return text, image_position, image
|
91 |
+
|
92 |
+
|
93 |
+
def chat(image_path, model, tokenizer,
        query: str, history: List[Tuple[str, str]] = None, image: Image = None,
        max_length: int = 1024, top_p=0.7, top_k=30, temperature=0.95, repetition_penalty=1.2,
        invalid_slices=[], english=False
        ):
    if not history:
        history = []
    if image_path:
        prompt = "<img>{}</img>".format(image_path if image_path else "")
    else:
        prompt = "<img></img>"
    if english:
        for i, (old_query, response) in enumerate(history):
            prompt += "Q:{}\nA:{}\n".format(old_query, response)
        prompt += "Q:{}\nA:".format(query)
    else:
        for i, (old_query, response) in enumerate(history):
            prompt += "问:{}\n答:{}\n".format(old_query, response)
        prompt += "问:{}\n答:".format(query)
    # ---------------
    # tokenizer, this is an example of huggingface tokenizer.
    # input str, output['input_ids'] = tensor([[tokenized str, gmask, sop]])
    prompt, image_position, torch_image = process_image(prompt, image=image)
    if torch_image is not None:
        torch_image = torch_image.to(next(model.parameters()).dtype).to(next(model.parameters()).device)
    if image_position < 5: # no image
        inputs = tokenizer([prompt], return_tensors="pt").to(model.parameters().__next__().device)['input_ids'][0]
        pre_image = 0
    else:
        input0 = tokenizer.encode(prompt[:image_position], add_special_tokens=False)
        input1 = [tokenizer.pad_token_id] * model.image_length
        input2 = tokenizer.encode(prompt[image_position:], add_special_tokens=False)
        inputs = sum([input0, input1, input2], [])
        inputs = torch.tensor(tokenizer.build_inputs_with_special_tokens(inputs)).to(model.parameters().__next__().device)
        pre_image = len(input0)
    # ---------------
    # Next, we manually set the format to keep flexibility.
    mask_position = len(inputs) - 2
    context_length = len(inputs) - 1 # all before sop
    get_func = partial(get_masks_and_position_ids_glm, mask_position=mask_position, context_length=context_length)
    seq = torch.cat(
        [inputs, torch.tensor([-1]*(max_length-len(inputs)), device=inputs.device)], dim=0
    )
    # ---------------
    # from sat.generation.sampling_strategies import BeamSearchStrategy
    # strategy = BeamSearchStrategy(num_beams, length_penalty=1., prefer_min_length=5, end_tokens=[tokenizer.eos_token_id], consider_end=True, no_repeat_ngram_size=5, stop_n_iter_unchanged=30, temperature=temperature, top_p=top_p, top_k=60, repetition_penalty=1.1)
    strategy = BaseStrategy(temperature=temperature, top_p=top_p, top_k=top_k, end_tokens=[tokenizer.eos_token_id],
                            invalid_slices=invalid_slices, repetition_penalty=repetition_penalty)
    output = filling_sequence(
        model, seq,
        batch_size=1,
        get_masks_and_position_ids=get_func,
        strategy=strategy,
        pre_image=pre_image,
        image=torch_image,
    )[0] # drop memory

    # ---------------
    # port from inference_glm.py, more general than chat mode
    # clip -1s and fill back generated things into seq
    if type(output) is not list:
        output_list = output.tolist()
    else:
        output_list = output
    for i in range(len(output_list)):
        output = output_list[i]
        if type(output) is not list:
            output = output.tolist()
        try:
            unfinished = output.index(-1)
        except ValueError:
            unfinished = len(output)
        if output[unfinished - 1] == tokenizer.eos_token_id:
            unfinished -= 1
        bog = output.index(tokenizer.bos_token_id)
        output_list[i] = output[:mask_position] + output[bog + 1:unfinished] + output[mask_position + 1:bog]
    # ---------------

    response = tokenizer.decode(output_list[0])
    sep = 'A:' if english else '答:'
    response = process_response(response).split(sep)[-1].strip()
    history = history + [(query, response)]
    return response, history, torch_image
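
For orientation, here is a minimal sketch of driving the chat() helper above from a plain script, in the spirit of cli_demo.py; the image path is a placeholder and quantization is left off, so treat this as an assumption rather than repo code.

# Hypothetical driver script, not part of this commit.
from model import chat, get_infer_setting

model, tokenizer = get_infer_setting(gpu_device=0, quant=None)
history, cache_image = [], None
response, history, cache_image = chat(
    "path/to/image.jpg", model, tokenizer,    # placeholder image path
    "描述这张图片。", history=history, image=cache_image,
    max_length=512, top_p=0.4, top_k=100, temperature=0.8,
)
print(response)

Passing the returned cache_image back in as image lets follow-up questions reuse the already-processed picture instead of re-reading the file.
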
model/infer_util.py
ADDED
@@ -0,0 +1,53 @@
import os
from PIL import Image
from io import BytesIO
import base64
import re
import argparse
import torch
from transformers import AutoTokenizer
from sat.model.mixins import CachedAutoregressiveMixin
from sat.quantization.kernels import quantize
import hashlib
from .visualglm import VisualGLMModel

def get_infer_setting(gpu_device=0, quant=None):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_device)
    args = argparse.Namespace(
        fp16=True,
        skip_init=True,
        device='cuda' if quant is None else 'cpu',
    )
    model, args = VisualGLMModel.from_pretrained('visualglm-6b', args)
    model.add_mixin('auto-regressive', CachedAutoregressiveMixin())
    assert quant in [None, 4, 8]
    if quant is not None:
        quantize(model.transformer, quant)
    model.eval()
    model = model.cuda()
    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
    return model, tokenizer

def is_chinese(text):
    zh_pattern = re.compile(u'[\u4e00-\u9fa5]+')
    return zh_pattern.search(text)

def generate_input(input_text, input_image_prompt, history=[], input_para=None, image_is_encoded=True):
    if not image_is_encoded:
        image = input_image_prompt
    else:
        decoded_image = base64.b64decode(input_image_prompt)
        image = Image.open(BytesIO(decoded_image))

    input_data = {'input_query': input_text, 'input_image': image, 'history': history, 'gen_kwargs': input_para}
    return input_data


def process_image(image_encoded):
    decoded_image = base64.b64decode(image_encoded)
    image = Image.open(BytesIO(decoded_image))
    image_hash = hashlib.sha256(image.tobytes()).hexdigest()
    image_path = f'./examples/{image_hash}.png'
    if not os.path.isfile(image_path):
        image.save(image_path)
    return os.path.abspath(image_path)
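
As a point of reference, a minimal sketch of how a caller (for example an HTTP endpoint such as api.py) might package a request for generate_input; the file path, prompt, and parameter values below are illustrative assumptions.

# Hypothetical caller, not part of this commit.
import base64
from model import generate_input

with open("path/to/image.png", "rb") as f:    # placeholder path
    encoded_image = base64.b64encode(f.read())

input_data = generate_input(
    "描述这张图片。", encoded_image,
    history=[], input_para={"temperature": 0.8, "top_p": 0.4},
    image_is_encoded=True,
)
print(sorted(input_data.keys()))  # ['gen_kwargs', 'history', 'input_image', 'input_query']
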
model/visualglm.py
ADDED
@@ -0,0 +1,40 @@
import torch
from sat.model.official import ChatGLMModel
from sat.model.base_model import BaseMixin
from copy import deepcopy
import json
from .blip2 import BLIP2

from sat.resources.urls import MODEL_URLS
MODEL_URLS['visualglm-6b'] = 'https://cloud.tsinghua.edu.cn/f/348b98dffcc940b6a09d/?dl=1'

class ImageMixin(BaseMixin):
    def __init__(self, args):
        super().__init__()
        self.args = deepcopy(args)
        self.model = BLIP2(args.eva_args, args.qformer_args)

    def word_embedding_forward(self, input_ids, output_cross_layer, **kw_args):
        if kw_args["pre_image"] > input_ids.shape[1] or kw_args.get("image", None) is None:
            return self.transformer.word_embeddings(input_ids)
        image_emb = self.model(**kw_args)
        # the image is inserted after 问:<img>, override 32 pads
        pre_id, pads, post_id = torch.tensor_split(input_ids, [kw_args["pre_image"], kw_args["pre_image"]+self.args.image_length], dim=1)
        pre_txt_emb = self.transformer.word_embeddings(pre_id)
        post_txt_emb = self.transformer.word_embeddings(post_id)
        return torch.cat([pre_txt_emb, image_emb, post_txt_emb], dim=1)

class VisualGLMModel(ChatGLMModel):
    def __init__(self, args, transformer=None, **kwargs):
        super().__init__(args, transformer=transformer, **kwargs)
        self.image_length = args.image_length
        self.add_mixin("eva", ImageMixin(args))

    @classmethod
    def add_model_specific_args(cls, parser):
        group = parser.add_argument_group('VisualGLM', 'VisualGLM Configurations')
        group.add_argument('--image_length', type=int, default=32)
        group.add_argument('--eva_args', type=json.loads, default={})
        group.add_argument('--qformer_args', type=json.loads, default={})
        return super().add_model_specific_args(parser)
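
To make the splice in word_embedding_forward concrete, here is a toy sketch of the same torch.tensor_split call on dummy ids, with image_length shortened to 4 purely for readability; only the shapes matter.

# Toy illustration of the split used above, not repo code.
import torch

input_ids = torch.arange(12).unsqueeze(0)     # (1, 12) fake token ids
pre_image, image_length = 3, 4                # image pad slots occupy positions 3..6
pre_id, pads, post_id = torch.tensor_split(
    input_ids, [pre_image, pre_image + image_length], dim=1)
print(pre_id.shape, pads.shape, post_id.shape)   # (1, 3) (1, 4) (1, 5)
# In the real model, the (1, image_length) pad slice is replaced by the BLIP2
# image embedding before the three pieces are concatenated back together.
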
requirements.txt
ADDED
@@ -0,0 +1,6 @@
SwissArmyTransformer>=0.3.6
torch>1.10.0
torchvision
transformers>=4.27.1
mdtex2html
gradio
requirements_wo_ds.txt
ADDED
@@ -0,0 +1,10 @@
torch>1.10.0
torchvision
transformers>=4.27.1
mdtex2html
gradio
sentencepiece
tensorboardX
datasets
cpm_kernels
einops
web_demo.py
ADDED
@@ -0,0 +1,129 @@
#!/usr/bin/env python

import gradio as gr
from PIL import Image
import os
import json
from model import is_chinese, get_infer_setting, generate_input, chat
import torch

def generate_text_with_image(input_text, image, history=[], request_data=dict(), is_zh=True):
    input_para = {
        "max_length": 2048,
        "min_length": 50,
        "temperature": 0.8,
        "top_p": 0.4,
        "top_k": 100,
        "repetition_penalty": 1.2
    }
    input_para.update(request_data)

    input_data = generate_input(input_text, image, history, input_para, image_is_encoded=False)
    input_image, gen_kwargs = input_data['input_image'], input_data['gen_kwargs']
    with torch.no_grad():
        answer, history, _ = chat(None, model, tokenizer, input_text, history=history, image=input_image, \
                            max_length=gen_kwargs['max_length'], top_p=gen_kwargs['top_p'], \
                            top_k=gen_kwargs['top_k'], temperature=gen_kwargs['temperature'], english=not is_zh)
    return answer


def request_model(input_text, temperature, top_p, image_prompt, result_previous):
    result_text = [(ele[0], ele[1]) for ele in result_previous]
    for i in range(len(result_text)-1, -1, -1):
        if result_text[i][0] == "" or result_text[i][1] == "":
            del result_text[i]
    print(f"history {result_text}")

    is_zh = is_chinese(input_text)
    if image_prompt is None:
        if is_zh:
            result_text.append((input_text, '图片为空!请上传图片并重试。'))
        else:
            result_text.append((input_text, 'Image empty! Please upload an image and retry.'))
        return input_text, result_text
    elif input_text == "":
        result_text.append((input_text, 'Text empty! Please enter text and retry.'))
        return "", result_text

    request_para = {"temperature": temperature, "top_p": top_p}
    image = Image.open(image_prompt)
    try:
        answer = generate_text_with_image(input_text, image, result_text.copy(), request_para, is_zh)
    except Exception as e:
        print(f"error: {e}")
        if is_zh:
            result_text.append((input_text, '超时!请稍等几分钟再重试。'))
        else:
            result_text.append((input_text, 'Timeout! Please wait a few minutes and retry.'))
        return "", result_text

    result_text.append((input_text, answer))
    print(result_text)
    return "", result_text


DESCRIPTION = '''# <a href="https://github.com/THUDM/VisualGLM-6B">VisualGLM</a>'''

MAINTENANCE_NOTICE1 = 'Hint 1: If the app reports "Something went wrong, connection error out", please turn off your proxy and retry.\nHint 2: If you upload a large image (around 10MB), it may take some time to upload and process. Please be patient and wait.'
MAINTENANCE_NOTICE2 = '提示1: 如果应用报了“Something went wrong, connection error out”的错误,请关闭代理并重试。\n提示2: 如果你上传了很大的图片,比如10MB大小,那将需要一些时间来上传和处理,请耐心等待。'

NOTES = 'This app is adapted from <a href="https://github.com/THUDM/VisualGLM-6B">https://github.com/THUDM/VisualGLM-6B</a>. It is recommended to check out the repo if you want to see the details of our model and training process.'


def clear_fn(value):
    return "", [("", "Hi, What do you want to know about this image?")], None

def clear_fn2(value):
    return [("", "Hi, What do you want to know about this image?")]


def main(args):
    gr.close_all()
    global model, tokenizer
    model, tokenizer = get_infer_setting(gpu_device=0, quant=args.quant)

    with gr.Blocks(css='style.css') as demo:
        gr.Markdown(DESCRIPTION)
        with gr.Row():
            with gr.Column(scale=4.5):
                with gr.Group():
                    input_text = gr.Textbox(label='Input Text', placeholder='Please enter text prompt below and press ENTER.')
                    with gr.Row():
                        run_button = gr.Button('Generate')
                        clear_button = gr.Button('Clear')

                    image_prompt = gr.Image(type="filepath", label="Image Prompt", value=None)
                    with gr.Row():
                        temperature = gr.Slider(maximum=1, value=0.8, minimum=0, label='Temperature')
                        top_p = gr.Slider(maximum=1, value=0.4, minimum=0, label='Top P')
                with gr.Group():
                    with gr.Row():
                        maintenance_notice = gr.Markdown(MAINTENANCE_NOTICE1)
            with gr.Column(scale=5.5):
                result_text = gr.components.Chatbot(label='Multi-round conversation History', value=[("", "Hi, What do you want to know about this image?")]).style(height=550)

        gr.Markdown(NOTES)

        print(gr.__version__)
        run_button.click(fn=request_model, inputs=[input_text, temperature, top_p, image_prompt, result_text],
                         outputs=[input_text, result_text])
        input_text.submit(fn=request_model, inputs=[input_text, temperature, top_p, image_prompt, result_text],
                          outputs=[input_text, result_text])
        clear_button.click(fn=clear_fn, inputs=clear_button, outputs=[input_text, result_text, image_prompt])
        image_prompt.upload(fn=clear_fn2, inputs=clear_button, outputs=[result_text])
        image_prompt.clear(fn=clear_fn2, inputs=clear_button, outputs=[result_text])

    print(gr.__version__)

    demo.queue(concurrency_count=10)
    demo.launch(share=args.share)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--quant", choices=[8, 4], type=int, default=None)
    parser.add_argument("--share", action="store_true")
    args = parser.parse_args()
    args.share = "True"  # force public sharing for the hosted Space (overrides the --share flag)
    main(args)
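
For a quick smoke test of the handler without opening the Gradio UI, the functions above can be exercised directly; the module-level model/tokenizer assignment and the image path below are assumptions for illustration only.

# Hypothetical smoke test, not part of this commit.
import web_demo

web_demo.model, web_demo.tokenizer = web_demo.get_infer_setting(gpu_device=0, quant=None)
_, chat_log = web_demo.request_model(
    "What is in this picture?", 0.8, 0.4,
    "path/to/image.jpg",                       # placeholder image path
    [("", "Hi, What do you want to know about this image?")],
)
print(chat_log[-1][1])
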
web_demo_hf.py
ADDED
@@ -0,0 +1,143 @@
from transformers import AutoModel, AutoTokenizer
import gradio as gr
import mdtex2html
import torch

"""Override Chatbot.postprocess"""

def postprocess(self, y):
    if y is None:
        return []
    for i, (message, response) in enumerate(y):
        y[i] = (
            None if message is None else mdtex2html.convert((message)),
            None if response is None else mdtex2html.convert(response),
        )
    return y

gr.Chatbot.postprocess = postprocess


def parse_text(text):
    """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
    lines = text.split("\n")
    lines = [line for line in lines if line != ""]
    count = 0
    for i, line in enumerate(lines):
        if "```" in line:
            count += 1
            items = line.split('`')
            if count % 2 == 1:
                lines[i] = f'<pre><code class="language-{items[-1]}">'
            else:
                lines[i] = f'<br></code></pre>'
        else:
            if i > 0:
                if count % 2 == 1:
                    line = line.replace("`", "\`")
                    line = line.replace("<", "&lt;")
                    line = line.replace(">", "&gt;")
                    line = line.replace(" ", "&nbsp;")
                    line = line.replace("*", "&ast;")
                    line = line.replace("_", "&lowbar;")
                    line = line.replace("-", "&#45;")
                    line = line.replace(".", "&#46;")
                    line = line.replace("!", "&#33;")
                    line = line.replace("(", "&#40;")
                    line = line.replace(")", "&#41;")
                    line = line.replace("$", "&#36;")
                lines[i] = "<br>"+line
    text = "".join(lines)
    return text


def predict(input, image_path, chatbot, max_length, top_p, temperature, history):
    if image_path is None:
        return [(input, "图片不能为空。请重新上传图片并重试。")], []
    chatbot.append((parse_text(input), ""))
    with torch.no_grad():
        for response, history in model.stream_chat(tokenizer, image_path, input, history, max_length=max_length, top_p=top_p,
                                                   temperature=temperature):
            chatbot[-1] = (parse_text(input), parse_text(response))

            yield chatbot, history


def predict_new_image(image_path, chatbot, max_length, top_p, temperature):
    input, history = "描述这张图片。", []
    chatbot.append((parse_text(input), ""))
    with torch.no_grad():
        for response, history in model.stream_chat(tokenizer, image_path, input, history, max_length=max_length,
                                                   top_p=top_p,
                                                   temperature=temperature):
            chatbot[-1] = (parse_text(input), parse_text(response))

            yield chatbot, history


def reset_user_input():
    return gr.update(value='')


def reset_state():
    return None, [], []


DESCRIPTION = '''<h1 align="center"><a href="https://github.com/THUDM/VisualGLM-6B">VisualGLM</a></h1>'''
MAINTENANCE_NOTICE = 'Hint 1: If the app reports "Something went wrong, connection error out", please turn off your proxy and retry.\nHint 2: If you upload a large image (around 10MB), it may take some time to upload and process. Please be patient and wait.'
NOTES = 'This app is adapted from <a href="https://github.com/THUDM/VisualGLM-6B">https://github.com/THUDM/VisualGLM-6B</a>. It is recommended to check out the repo if you want to see the details of our model and training process.'

def main(args):
    global model, tokenizer
    tokenizer = AutoTokenizer.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True)
    if args.quant in [4, 8]:
        model = AutoModel.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True).quantize(args.quant).half().cuda()
    else:
        model = AutoModel.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True).half().cuda()
    model = model.eval()

    with gr.Blocks(css='style.css') as demo:
        gr.HTML(DESCRIPTION)

        with gr.Row():
            with gr.Column(scale=2):
                image_path = gr.Image(type="filepath", label="Image Prompt", value=None).style(height=504)
            with gr.Column(scale=4):
                chatbot = gr.Chatbot().style(height=480)
        with gr.Row():
            with gr.Column(scale=2, min_width=100):
                max_length = gr.Slider(0, 4096, value=2048, step=1.0, label="Maximum length", interactive=True)
                top_p = gr.Slider(0, 1, value=0.4, step=0.01, label="Top P", interactive=True)
                temperature = gr.Slider(0, 1, value=0.8, step=0.01, label="Temperature", interactive=True)
            with gr.Column(scale=4):
                with gr.Box():
                    with gr.Row():
                        with gr.Column(scale=2):
                            user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=4).style(
                                container=False)
                        with gr.Column(scale=1, min_width=64):
                            submitBtn = gr.Button("Submit", variant="primary")
                            emptyBtn = gr.Button("Clear History")
        gr.Markdown(MAINTENANCE_NOTICE + '\n' + NOTES)
        history = gr.State([])


        submitBtn.click(predict, [user_input, image_path, chatbot, max_length, top_p, temperature, history], [chatbot, history],
                        show_progress=True)
        image_path.upload(predict_new_image, [image_path, chatbot, max_length, top_p, temperature], [chatbot, history],
                          show_progress=True)
        image_path.clear(reset_state, outputs=[image_path, chatbot, history], show_progress=True)
        submitBtn.click(reset_user_input, [], [user_input])
        emptyBtn.click(reset_state, outputs=[image_path, chatbot, history], show_progress=True)

    print(gr.__version__)

    demo.queue().launch(share=args.share, inbrowser=True, server_name='0.0.0.0', server_port=8080)

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--quant", choices=[8, 4], type=int, default=None)
    parser.add_argument("--share", action="store_true")
    args = parser.parse_args()
    main(args)
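
The HF-weights demo above leans entirely on the checkpoint's stream_chat interface; a minimal sketch of using it outside Gradio follows, mirroring the demo's own calls (the image path is a placeholder, and loading without quantization is an assumption).

# Hypothetical standalone use of the HF checkpoint, not part of this commit.
from transformers import AutoModel, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/visualglm-6b", trust_remote_code=True).half().cuda().eval()

history = []
with torch.no_grad():
    for response, history in model.stream_chat(tokenizer, "path/to/image.jpg",   # placeholder path
                                               "描述这张图片。", history,
                                               max_length=2048, top_p=0.4, temperature=0.8):
        pass
print(response)
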
your_logfile.log
ADDED
@@ -0,0 +1,2 @@
nohup: ignoring input
python: can't open file '/root/VisualGLM-6B/your_program.py': [Errno 2] No such file or directory