Update lyraChatGLM/model.py

#21

by dawnranger - opened Jun 2, 2023

base: refs/heads/main

←

from: refs/pr/21

Discussion Files changed

+510

-560

Files changed (13) hide show

CHANGES.rst +0 -10
LISENCE +0 -420
README.md +30 -68
demo.py +8 -10
lyraChatGLM/config.py +1 -1
lyraChatGLM/ftlib/{libth_transformer_sm80_cu11.so → libth_transformer_sm70.so} +2 -2
lyraChatGLM/ftlib/libth_transformer_sm70_cu12.so +0 -3
lyraChatGLM/ftlib/{libth_transformer_sm80_cu12.so → libth_transformer_sm80.so} +2 -2
lyraChatGLM/lyra_glm.py +8 -11
lyraChatGLM/model.py +454 -24
models/1-gpu-fp16.bin +0 -3
lyraChatGLM/ftlib/libth_transformer_sm70_cu11.so → models/1-gpu-fp16.h5 +2 -2
requirements.txt +0 -1

CHANGES.rst DELETED Viewed

@@ -1,10 +0,0 @@
-Changelog (lyraChatGLM)
-## 2.0
-- rebuild whole system using modified Fastertransformer
-- add dynamic library & models for Volta architecture.
-- further acceleration, remove token generation limits.
-## 1.0
-- add lyraChatGLM model, from original weights

LISENCE DELETED Viewed

@@ -1,420 +0,0 @@
-MIT License
-Copyright (c) 2023 Tencent Music Entertainment
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-Other dependencies and licenses:
-Open Source Software Licensed under The ChatGLM-6B License and the Apache License Version 2.0 :
---------------------------------------------------------------------
-1. chatglm-6b
-File：https://github.com/THUDM/ChatGLM-6B
-License：The ChatGLM-6B License and Apache License Version 2.0
-For details：https://github.com/THUDM/ChatGLM-6B/blob/main/MODEL_LICENSE
-             https://github.com/THUDM/ChatGLM-6B/blob/main/LICENSE
-APPENDIX: How to apply the Apache License to your work.
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-Copyright Zhengxiao Du
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-       http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-A copy of the Apache License Version 2.0 is included in this file.
-Terms of The ChatGLM-6B License:
---------------------------------------------------------------------
-一、定义
-“许可方”是指分发其软件的 ChatGLM-6B 模型团队。
-“软件”是指根据本许可提供的 ChatGLM-6B 模型参数。
-2. 许可授予
-根据本许可的条款和条件，许可方特此授予您非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可，仅用于您的非商业研究目的。
-上述版权声明和本许可声明应包含在本软件的所有副本或重要部分中。
-3.限制
-您不得出于任何商业、军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建本软件的全部或部分衍生作品。
-您不得利用本软件从事任何危害国家安全和国家统一、危害社会公共利益、侵犯人身权益的行为。
-4.免责声明
-本软件“按原样”提供，不提供任何明示或暗示的保证，包括但不限于对适销性、特定用途的适用性和非侵权性的保证。 在任何情况下，作者或版权持有人均不对任何索赔、损害或其他责任负责，无论是在合同诉讼、侵权行为还是其他方面，由软件或软件的使用或其他交易引起、由软件引起或与之相关 软件。
-5. 责任限制
-除适用法律禁止的范围外，在任何情况下且根据任何法律理论，无论是基于侵权行为、疏忽、合同、责任或其他原因，任何许可方均不对您承担任何直接、间接、特殊、偶然、示范性、 或间接损害，或任何其他商业损失，即使许可人已被告知此类损害的可能性。
-6.争议解决
-本许可受中华人民共和国法律管辖并按其解释。 因本许可引起的或与本许可有关的任何争议应提交北京市海淀区人民法院。
-请注意，许可证可能会更新到更全面的版本。 有关许可和版权的任何问题，请通过 glm-130b@googlegroups.com 与我们联系。
-1. Definitions
-“Licensor” means the ChatGLM-6B Model Team that distributes its Software.
-“Software” means the ChatGLM-6B model parameters made available under this license.
-2. License Grant
-Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes.
-The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-3. Restriction
-You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes.
-You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.
-4. Disclaimer
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-5. Limitation of Liability
-EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
-6. Dispute Resolution
-This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.
-Note that the license is subject to update to a more comprehensive version.  For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com.
-Open Source Software Licensed under the Apache License Version 2.0:
---------------------------------------------------------------------
-1. huggingface/transformers
-Copyright 2018- The Hugging Face team. All rights reserved.
-Terms of the Apache License Version 2.0:
---------------------------------------------------------------------
-Apache License
-Version 2.0, January 2004
-http://www.apache.org/licenses/
-TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-1. Definitions.
-"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
-"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
-"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
-"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
-"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
-"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
-"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
-"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
-"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
-"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
-2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
-3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
-4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
-You must give any other recipients of the Work or Derivative Works a copy of this License; and
-You must cause any modified files to carry prominent notices stating that You changed the files; and
-You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
-If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
-You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
-5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
-6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
-7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
-8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
-9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
-END OF TERMS AND CONDITIONS
-Open Source Software Licensed under the Modified BSD License:
---------------------------------------------------------------------
-1. pytorch
-From PyTorch:
-Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
-Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
-Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
-Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
-Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
-Copyright (c) 2011-2013 NYU                      (Clement Farabet)
-Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
-Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
-Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
-From Caffe2:
-Copyright (c) 2016-present, Facebook Inc. All rights reserved.
-All contributions by Facebook:
-Copyright (c) 2016 Facebook Inc.
-All contributions by Google:
-Copyright (c) 2015 Google Inc.
-All rights reserved.
-All contributions by Yangqing Jia:
-Copyright (c) 2015 Yangqing Jia
-All rights reserved.
-All contributions by Kakao Brain:
-Copyright 2019-2020 Kakao Brain
-All contributions by Cruise LLC:
-Copyright (c) 2022 Cruise LLC.
-All rights reserved.
-All contributions from Caffe:
-Copyright(c) 2013, 2014, 2015, the respective contributors
-All rights reserved.
-All other contributions:
-Copyright(c) 2015, 2016 the respective contributors
-All rights reserved.
-Caffe2 uses a copyright model similar to Caffe: each contributor holds
-copyright over their contributions to Caffe2. The project versioning records
-all such contribution and copyright details. If a contributor wants to further
-mark their specific copyright on a particular contribution, they should
-indicate their copyright solely in the commit message of the change when it is
-committed.
-All rights reserved.
-Terms of the Modified BSD License:
--------------------------------------------------------------------
-This project is licensed under the terms of the Modified BSD License, as follows:
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-1. Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions and the following disclaimer in the
-   documentation and/or other materials provided with the distribution.
-3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
-   and IDIAP Research Institute nor the names of its contributors may be
-   used to endorse or promote products derived from this software without
-   specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-Open Source Software Licensed under the Python Software Foundation License Version 2:
---------------------------------------------------------------------------
-1. Python/cpython
-Copyright © 2001-2023 Python Software Foundation. All rights reserved
-A. HISTORY OF THE SOFTWARE
-==========================
-Python was created in the early 1990s by Guido van Rossum at Stichting
-Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands
-as a successor of a language called ABC.  Guido remains Python's
-principal author, although it includes many contributions from others.
-In 1995, Guido continued his work on Python at the Corporation for
-National Research Initiatives (CNRI, see https://www.cnri.reston.va.us)
-in Reston, Virginia where he released several versions of the
-software.
-In May 2000, Guido and the Python core development team moved to
-BeOpen.com to form the BeOpen PythonLabs team.  In October of the same
-year, the PythonLabs team moved to Digital Creations, which became
-Zope Corporation.  In 2001, the Python Software Foundation (PSF, see
-https://www.python.org/psf/) was formed, a non-profit organization
-created specifically to own Python-related Intellectual Property.
-Zope Corporation was a sponsoring member of the PSF.
-All Python releases are Open Source (see https://opensource.org for
-the Open Source Definition).  Historically, most, but not all, Python
-releases have also been GPL-compatible; the table below summarizes
-the various releases.
-    Release         Derived     Year        Owner       GPL-
-                    from                                compatible? (1)
-    0.9.0 thru 1.2              1991-1995   CWI         yes
-    1.3 thru 1.5.2  1.2         1995-1999   CNRI        yes
-    1.6             1.5.2       2000        CNRI        no
-    2.0             1.6         2000        BeOpen.com  no
-    1.6.1           1.6         2001        CNRI        yes (2)
-    2.1             2.0+1.6.1   2001        PSF         no
-    2.0.1           2.0+1.6.1   2001        PSF         yes
-    2.1.1           2.1+2.0.1   2001        PSF         yes
-    2.1.2           2.1.1       2002        PSF         yes
-    2.1.3           2.1.2       2002        PSF         yes
-    2.2 and above   2.1.1       2001-now    PSF         yes
-Footnotes:
-(1) GPL-compatible doesn't mean that we're distributing Python under
-    the GPL.  All Python licenses, unlike the GPL, let you distribute
-    a modified version without making your changes open source.  The
-    GPL-compatible licenses make it possible to combine Python with
-    other software that is released under the GPL; the others don't.
-(2) According to Richard Stallman, 1.6.1 is not GPL-compatible,
-    because its license has a choice of law clause.  According to
-    CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1
-    is "not incompatible" with the GPL.
-Thanks to the many outside volunteers who have worked under Guido's
-direction to make these releases possible.
-B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON
-===============================================================
-Python software and documentation are licensed under the
-Python Software Foundation License Version 2.
-Starting with Python 3.8.6, examples, recipes, and other code in
-the documentation are dual licensed under the PSF License Version 2
-and the Zero-Clause BSD license.
-Some software incorporated into Python is under different licenses.
-The licenses are listed with code falling under that license.
-PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
---------------------------------------------
-1. This LICENSE AGREEMENT is between the Python Software Foundation
-("PSF"), and the Individual or Organization ("Licensee") accessing and
-otherwise using this software ("Python") in source or binary form and
-its associated documentation.
-2. Subject to the terms and conditions of this License Agreement, PSF hereby
-grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
-analyze, test, perform and/or display publicly, prepare derivative works,
-distribute, and otherwise use Python alone or in any derivative version,
-provided, however, that PSF's License Agreement and PSF's notice of copyright,
-i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
-2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation;
-All Rights Reserved" are retained in Python alone or in any derivative version
-prepared by Licensee.
-3. In the event Licensee prepares a derivative work that is based on
-or incorporates Python or any part thereof, and wants to make
-the derivative work available to others as provided herein, then
-Licensee hereby agrees to include in any such work a brief summary of
-the changes made to Python.
-4. PSF is making Python available to Licensee on an "AS IS"
-basis.  PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
-IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
-DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
-FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
-INFRINGE ANY THIRD PARTY RIGHTS.
-5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
-FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
-A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
-OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
-6. This License Agreement will automatically terminate upon a material
-breach of its terms and conditions.
-7. Nothing in this License Agreement shall be deemed to create any
-relationship of agency, partnership, or joint venture between PSF and
-Licensee.  This License Agreement does not grant permission to use PSF
-trademarks or trade name in a trademark sense to endorse or promote
-products or services of Licensee, or any third party.
-8. By copying, installing or otherwise using Python, Licensee
-agrees to be bound by the terms and conditions of this License
-Agreement.
-Open Source Software：
---------------------------------------------------------------------
-1. icetk
-File：https://github.com/THUDM/icetk

README.md CHANGED Viewed

@@ -1,84 +1,65 @@
 ---
-license: mit
 language: en
 tags:
-- LLM
-- ChatGLM6B
 ---
 ## Breakings!
-**We know what you want, and here you go!**
-- Newly released lyraChatGLM model, suitable for Ampere (A100/A10) as well as Volta (V100)
-- lyraChatGLM has been further optimized, reaching **9000 tokens/s** on A100 and **3900 tokens/s** on V100, about **5.5x** faster than the up-to-date official version (2023/6/1).
 - The memory usage was optimized too, now we can set batch_size up to **256** on A100!
-- INT8 weight only PTQ is supported
-**Note that the code was fully updated too, you need to use the new API, see `Uses` below**
-If you like our work and consider to join us, feel free to drop a line to benbinwu@tencent.com.
-P.S. Recently we have received a lot of inquiries on accelerating customized models. Actually, we **do not have a plan** to release the conversion tool at this moment, nor do we think it would be possible to apply your customized models based on our current release.
-****
 ## Model Card for lyraChatGLM
 lyraChatGLM is currently the **fastest ChatGLM-6B** available. To the best of our knowledge, it is the **first accelerated version of ChatGLM-6B**.
-The inference speed of lyraChatGLM has achieved **300x** acceleration upon the early original version. We are still working hard to further improve the performance.
-Among its main features are (updated on 2023-06-20):
 - weights: original ChatGLM-6B weights released by THUDM.
 - device: Nvidia GPU with Ampere architecture or Volta architecture (A100, A10, V100...).
-- batch_size: compiled with dynamic batch size, maximum depends on device.
-- We now support cuda version of both 11.X and 12.X
-- lyraChatGLM has been further optimized, with faster model load speed from few minutes to less than 10s for non-int8 mode, and around 1 min for int8 mode!
-## Speed
 - original version(fixed batch infer): commit id 1d240ba
 ### test on A100 40G
-1. The maximum batch size and maximum speed table for each version of the model.
 |version|max_batch_size|max_speed|
 |:-:|:-:|:-:|
 |original|1|30 tokens/s|
-|original(fixed batch infer)|192|1638.52 tokens/s|
-|lyraChatGLM(current)|256|9082.60 tokens/s|
-2. The speed table for the same batch size.
-|version|1 batch_size|8 batch_size| 64 batch_size | 128 batch_size |
-|:-:|:-:|:-:|:-:|:-:|
-|original|30 tokens/s| - | - | - |
-|original(fixed batch infer)|34.48 tokens/s|356.29 tokens/s|1638.52 tokens/s|1338.45 tokens/s|
-|lyraChatGLM(current)|110.05 tokens/s|843.60 tokens/s|4926.92 tokens/s|7235.04 tokens/s|
 ### test on V100
-1. The maximum batch size and maximum speed table for each version of the model.
 |version|max_batch_size|max_speed|
 |:-:|:-:|:-:|
 |original|1|17.83 tokens/s|
-|original(fixed batch infer)|128|992.20 tokens/s|
-|lyraChatGLM(current)|192|3958.39 tokens/s|
-2. The speed table for the same batch size.
-|version|1 batch_size|8 batch_size| 64 batch_size | 128 batch_size |
-|:-:|:-:|:-:|:-:|:-:|
-|original|17.83 tokens/s| - | - | - |
-|original(fixed batch infer)|17.83 tokens/s|228.95 tokens/s|889.7 tokens/s|922.20 tokens/s|
-|lyraChatGLM(current)|59.33 tokens/s|514.15 tokens/s|2849.88 tokens/s|3958.39 tokens/s|
 ## Model Sources
 - **Repository:** https://huggingface.co/THUDM/chatglm-6b
-## Docker Environment Recommendation
-- For Cuda 11.X: we recommend ```nvcr.io/nvidia/pytorch:22.12-py3```
-- For Cuda 12.0: we recommend ```nvcr.io/nvidia/pytorch:23.02-py3```
-```bash
-docker pull nvcr.io/nvidia/pytorch:23.02-py3
-docker run --rm -it --gpus all -v ./:/lyraChatGLM nvcr.io/nvidia/pytorch:23.02-py3
-pip install -r requirements.txt
-python demo.py
 ```
 ## Uses
@@ -86,15 +67,14 @@ python demo.py
 ```python
 from lyraChatGLM import LyraChatGLM6B
-model_path = "./models/1-gpu-fp16.bin"
 tokenizer_path = "./models"
 data_type = "fp16"
-int8_mode = 0   # 1 for INT8 WEIGHT ONLY PTQ
 max_output_length = 150
 arch = "Ampere" # Ampere or Volta
-cuda_version = 12
-model = LyraChatGLM6B(model_path, tokenizer_path, data_type, int8_mode, arch, cuda_version)
 prompt = "列出3个不同的机器学习算法，并说明它们的适用范围."
 test_batch_size = 256
@@ -120,29 +100,11 @@ print(output_texts)
 3. 支持向量机(Support Vector Machine):支持向量机是一种监督学习方法,通常用于分类问题。它可以处理高维数据,并且具有较高的准确性。适用于需要对高维数据进行分类或回归的问题,例如图像识别、自然语言处理等。
-## INT8
-**Int8 usage**:
-Our current version supports INT8 weight only PTQ. To enable this mode, simply modify the `int8_mode` to `1` in the demo.py file.
-**In this mode, gpu memory can be further reduced by about half and the speed can be doubled.**
-This solves the issue mentioned in https://github.com/THUDM/ChatGLM-6B/issues/1042.
-However, the speed gain is best achieved with a batch size of no more than 128. If you don't use A100 GPU, you can adjust the
-batch size to reduce it and get the benefits. We recommend a batch size of 64.This mode is very suitable for GPUs with
-limited VRAM or scenarios where it is difficult to use larger batch sizes in real-time services.
-It should be noted that although we have aligned the accuracy in our test cases, there may be slight differences
-in accuracy in some untested scenarios with int8. Please be aware of this.
 ## Citation
 ``` bibtex
 @Misc{lyraChatGLM2023,
   author =       {Kangjian Wu, Zhengtao Wang, Yibo Lu, Bin Wu},
-  title =        {lyraChatGLM: Accelerating ChatGLM to 9000+ tokens/s},
   howpublished = {\url{https://huggingface.co/TMElyralab/lyraChatGLM}},
   year =         {2023}
 }
@@ -150,4 +112,4 @@ in accuracy in some untested scenarios with int8. Please be aware of this.
 ## Report bug
 - start a discussion to report any bugs!--> https://huggingface.co/TMElyralab/lyraChatGLM/discussions
-- report bug with a `[bug]` mark in the title.

 ---
+license: creativeml-openrail-m
 language: en
 tags:
+  - LLM
+  - ChatGLM6B
 ---
 ## Breakings!
+**We know what you want, and here they are!**
+- Newly released lyraChatGLM model, suitable for Ampere (A100/A10) as well as Volta (V100)
+- lyraChatGLM has been further optimized and now reaches **9000 tokens/s** on A100 and **3900 tokens/s** on V100, about **5.5x** faster than the original version (2023/6/1).
 - The memory usage was optimized too, now we can set batch_size up to **256** on A100!
+**Note that the code was fully updated too; you need to use the new API, see `Uses` below.**
 ## Model Card for lyraChatGLM
 lyraChatGLM is currently the **fastest ChatGLM-6B** available. To the best of our knowledge, it is the **first accelerated version of ChatGLM-6B**.
+The inference speed of lyraChatGLM has achieved **300x** acceleration over the early original version. We are still working hard to further improve the performance.
+Among its main features are:
 - weights: original ChatGLM-6B weights released by THUDM.
 - device: Nvidia GPU with Ampere architecture or Volta architecture (A100, A10, V100...).
+- batch_size: compiled with dynamic batch size, maximum depends on device.
+## Speed
 - original version (fixed batch infer): commit id 1d240ba
 ### test on A100 40G
 |version|max_batch_size|max_speed|
 |:-:|:-:|:-:|
 |original|1|30 tokens/s|
+|original (fixed batch infer)|192|1638.52 tokens/s|
+|lyraChatGLM(current)|256|9082.60+ tokens/s|
 ### test on V100
 |version|max_batch_size|max_speed|
 |:-:|:-:|:-:|
 |original|1|17.83 tokens/s|
+|original (fixed batch infer)|128|992.20 tokens/s|
+|lyraChatGLM(current)|192|3911.45+ tokens/s|
 ## Model Sources
 - **Repository:** https://huggingface.co/THUDM/chatglm-6b
+## Docker Environment
+- **Docker image available** at <https://hub.docker.com/repository/docker/bigmoyan/lyrallm/general>; pull the image with:
+```
+docker pull bigmoyan/lyrallm:v0.1
 ```
 ## Uses
 ```python
 from lyraChatGLM import LyraChatGLM6B
+model_path = "./models/1-gpu-fp16.h5"
 tokenizer_path = "./models"
 data_type = "fp16"
+int8_mode = 0
 max_output_length = 150
 arch = "Ampere" # Ampere or Volta
+model = LyraChatGLM6B(model_path, tokenizer_path, data_type, int8_mode, arch)
 prompt = "列出3个不同的机器学习算法，并说明它们的适用范围."
 test_batch_size = 256
 3. 支持向量机(Support Vector Machine):支持向量机是一种监督学习方法,通常用于分类问题。它可以处理高维数据,并且具有较高的准确性。适用于需要对高维数据进行分类或回归的问题,例如图像识别、自然语言处理等。
 ## Citation
 ``` bibtex
 @Misc{lyraChatGLM2023,
   author =       {Kangjian Wu, Zhengtao Wang, Yibo Lu, Bin Wu},
+  title =        {lyraChatGLM: Accelerating ChatGLM by 5.5x+},
   howpublished = {\url{https://huggingface.co/TMElyralab/lyraChatGLM}},
   year =         {2023}
 }
 ## Report bug
 - start a discussion to report any bugs!--> https://huggingface.co/TMElyralab/lyraChatGLM/discussions
+- report bug with a `[bug]` mark in the title.

demo.py CHANGED Viewed

@@ -1,22 +1,20 @@
 from lyraChatGLM import LyraChatGLM6B
-import numpy as np
-model_path = "./models/1-gpu-fp16.bin"
 tokenizer_path = "./models"
-inference_data_type = "fp16"
 int8_mode = 0
 max_output_length = 150
-arch = "Volta" # Ampere or Volta
-cuda_version = 11 # cuda version, we currently support 11 and 12
-model = LyraChatGLM6B(model_path, tokenizer_path, inference_data_type, int8_mode, arch, cuda_version)
 prompt = "今天天气大概 25度，有点小雨，吹着风，我想去户外散步，应该穿什么样的衣服裤子鞋子搭配。"
-# test_batch_size = 256
 prompts = [prompt, ]
-# # If you want to get different output in same batch, you can set do_sample to True
 output_texts = model.generate(prompts, output_length=max_output_length,top_k=30, top_p=0.85, temperature=0.35, repetition_penalty=1.2, do_sample=False)
-print(output_texts)

 from lyraChatGLM import LyraChatGLM6B
+model_path = "./models/1-gpu-fp16.h5"
 tokenizer_path = "./models"
+data_type = "fp16"
 int8_mode = 0
 max_output_length = 150
+arch = "Ampere" # Ampere or Volta
+model = LyraChatGLM6B(model_path, tokenizer_path, data_type, int8_mode, arch)
 prompt = "今天天气大概 25度，有点小雨，吹着风，我想去户外散步，应该穿什么样的衣服裤子鞋子搭配。"
+test_batch_size = 256
 prompts = [prompt, ]
+# If you want to get different output in same batch, you can set do_sample to True
 output_texts = model.generate(prompts, output_length=max_output_length,top_k=30, top_p=0.85, temperature=0.35, repetition_penalty=1.2, do_sample=False)
+print(output_texts)

lyraChatGLM/config.py CHANGED Viewed

@@ -14,7 +14,7 @@ class ChatGLM6BParam:
     tensor_para_size: int = 1
     pipeline_para_size: int = 1
     remove_padding: bool = True
-    shared_contexts_ratio: float = 0.0
     layernorm_eps: float = 1e-5
     weights_data_type: str = "fp16"

     tensor_para_size: int = 1
     pipeline_para_size: int = 1
     remove_padding: bool = True
+    shared_contexts_ratio: float = 1.0
     layernorm_eps: float = 1e-5
     weights_data_type: str = "fp16"

lyraChatGLM/ftlib/{libth_transformer_sm80_cu11.so → libth_transformer_sm70.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:60a06f87ca10c5d556f965a5178aac50cbcbcec0265a7bcf18751e6ef73a807c
-size 200894104

 version https://git-lfs.github.com/spec/v1
+oid sha256:74ba35dfae0d02b89594bad9458c15fba2b57fb2d96b698cbd94d78368f3f246
+size 114138600

lyraChatGLM/ftlib/libth_transformer_sm70_cu12.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2d9829541f5edccf8d59e275e1259404168750e3419902fc4c88f789baad3f20
-size 114203064

lyraChatGLM/ftlib/{libth_transformer_sm80_cu12.so → libth_transformer_sm80.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:146841b4ef362048507a576d20cb1e5bb02e0d67f3fcfce351ce25f00989dfbd
-size 200980552

 version https://git-lfs.github.com/spec/v1
+oid sha256:c814d3d493d25d64925261cac48aaf8e1a33722fba4ce3eb8bc7abdcc51f37cf
+size 200886848

lyraChatGLM/lyra_glm.py CHANGED Viewed

@@ -10,15 +10,15 @@ import transformers
 from .config import CHATGLM_6B_PARAM
 from .model import ChatGLM6BModel
 class LyraChatGLM6B:
-    def __init__(self, model_path, tokenizer_path=None, dtype='fp16', int8_mode=0, arch="Ampere", cuda_version="11") -> None:
         self.model_path = model_path
         self.tokenizer_path = tokenizer_path
         self.dtype = dtype
         self.arch=arch
-        # if dtype != 'int8':
-        #     int8_mode = 0
-        self.cuda_version = cuda_version
         self.int8_mode = int8_mode
         self.model, self.tokenizer = self.load_model_and_tokenizer()
@@ -81,9 +81,7 @@ class LyraChatGLM6B:
             max_seq_len=0,  # for position seq embedding
             pipeline_para_size=CHATGLM_6B_PARAM.pipeline_para_size,
             shared_contexts_ratio=CHATGLM_6B_PARAM.shared_contexts_ratio,
-            int8_mode=self.int8_mode,
-            model_path=self.model_path,
-            cuda_version=self.cuda_version,
         ))
         print('[INFO] Load Our Highly Optimized LyraChatGLM6B model')
@@ -106,6 +104,8 @@ class LyraChatGLM6B:
         print(f'Loading tokenizer from {self.model_path}')
         model = ChatGLM6BModel(arch=self.arch,**model_args)
         return model, tokenizer
@@ -134,10 +134,7 @@ class LyraChatGLM6B:
         ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
         ones_float = torch.ones(size=[batch_size], dtype=torch.float32)
-        # input_token_ids = self.tokenizer(prompts, return_tensors="pt", padding=True).input_ids.int()
-        raw_input_token_ids = self.tokenizer(prompts, padding=True)
-        input_token_ids = torch.tensor (raw_input_token_ids["input_ids"],dtype=torch.int32)
         input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
         mask_positions = torch.IntTensor([seq.index(130001) for seq in input_token_ids.tolist()])

 from .config import CHATGLM_6B_PARAM
 from .model import ChatGLM6BModel
 class LyraChatGLM6B:
+    def __init__(self, model_path, tokenizer_path=None, dtype='fp16', int8_mode=0, arch="Ampere") -> None:
         self.model_path = model_path
         self.tokenizer_path = tokenizer_path
         self.dtype = dtype
         self.arch=arch
+        if dtype != 'int8':
+            int8_mode = 0
         self.int8_mode = int8_mode
         self.model, self.tokenizer = self.load_model_and_tokenizer()
             max_seq_len=0,  # for position seq embedding
             pipeline_para_size=CHATGLM_6B_PARAM.pipeline_para_size,
             shared_contexts_ratio=CHATGLM_6B_PARAM.shared_contexts_ratio,
+            int8_mode=self.int8_mode
         ))
         print('[INFO] Load Our Highly Optimized LyraChatGLM6B model')
         print(f'Loading tokenizer from {self.model_path}')
         model = ChatGLM6BModel(arch=self.arch,**model_args)
+        if not model.load(ckpt_path=self.model_path):
+            print('[WARNING] Skip model loading since no checkpoints are found')
         return model, tokenizer
         ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
         ones_float = torch.ones(size=[batch_size], dtype=torch.float32)
+        input_token_ids = self.tokenizer(prompts, return_tensors="pt", padding=True).input_ids.int()
         input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
         mask_positions = torch.IntTensor([seq.index(130001) for seq in input_token_ids.tolist()])

lyraChatGLM/model.py CHANGED Viewed

@@ -8,6 +8,402 @@ import torch
 import torch.distributed as dist
 import torch.nn as nn
 class ChatGLM6BModel(nn.Module):
     def __init__(self,
                  head_num, size_per_head,
@@ -19,8 +415,6 @@ class ChatGLM6BModel(nn.Module):
                  tensor_para_size: int,
                  pipeline_para_size: int,
                  inference_data_type: str,
-                 model_path,
-                 cuda_version,
                  inter_size: int = 0,
                  # glm_variant_params
                  layernorm_eps: float = 1e-5,
@@ -49,7 +443,6 @@ class ChatGLM6BModel(nn.Module):
         self.layer_num = layer_num
         self.inter_size = inter_size if inter_size != 0 else 4 * self.head_num * self.size_per_head
         self.arch = arch
-        self.model_path = model_path
         # gpt_variant_params
         self.layernorm_eps = layernorm_eps
         self.layernorm_type = layernorm_type
@@ -79,28 +472,62 @@ class ChatGLM6BModel(nn.Module):
         assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
         assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."
-        self.device = 0
         # Load the C++ model into Pytorch model.
-        sm = "sm80"
         if arch == "Ampere":
-            sm = "sm80"
         elif arch == "Volta":
-            sm = "sm70"
-        else:
-            raise Exception(f"unsupported arch: {arch}")
-        cu = 'cu11'
-        if cuda_version == 11:
-            cu = 'cu11'
-        elif cuda_version == 12:
-            cu = 'cu12'
-        else:
-            raise Exception(f"unsupported cuda version: {cuda_version}")
-        lib_path = pathlib.Path(__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"
-        torch.classes.load_library(os.path.abspath(lib_path))
         self.model = torch.classes.FasterTransformer.GlmOp(
             self.head_num, self.size_per_head, self.inter_size,
@@ -122,9 +549,9 @@ class ChatGLM6BModel(nn.Module):
             self.has_adapters,
             self.adapter_inter_size,
             self.use_attention_linear_bias,
-            self.model_path,
-            self.weights_data_type,
-            inference_data_type,
             self.shared_contexts_ratio)
         self.build_model = True
@@ -146,7 +573,10 @@ class ChatGLM6BModel(nn.Module):
                 bad_words_list: typing.Optional[torch.IntTensor] = None,
                 return_output_length: bool = False,
                 return_cum_log_probs: int = 0):
         input_len = start_ids.size(1)
         assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."

 import torch.distributed as dist
 import torch.nn as nn
+str_type_map = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
+class ChatGLM6BWeights:
+    def __init__(
+            self, head_num, size_per_head, layer_num, vocab_size, max_seq_len, tensor_para_size, pipeline_para_size,
+            weights_data_type: typing.Union[str, np.dtype],
+            inference_data_type: str, has_adapters: bool = False, adapter_inter_size: int = 0, gpt_with_moe: bool = False,
+            has_positional_encoding: bool = False, has_pre_decoder_layernorm: bool = False,
+            has_post_decoder_layernorm: bool = True, int8_mode: int = 0, inter_size: int = 0):
+        assert(head_num % tensor_para_size == 0)
+        if int8_mode == 1:
+            torch_infer_dtype = str_type_map[inference_data_type]
+            assert torch_infer_dtype == torch.float16 or torch_infer_dtype == torch.bfloat16, "Weight only quant only supported for infer type fp16 or bf16."
+            quant = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix
+            self.weight_transpose_calibrate_quantize = lambda x: quant(x, torch.int8)
+        else:
+            assert int8_mode == 0, "Invalid int8 mode for GPT. Must be 0 or 1"
+        self.head_num = head_num
+        self.size_per_head = size_per_head
+        self.layer_num = layer_num
+        self.vocab_size = vocab_size
+        self.max_seq_len = max_seq_len
+        self.tensor_para_size = tensor_para_size
+        self.pipeline_para_size = pipeline_para_size
+        self.layers_per_device = layer_num // pipeline_para_size
+        self.has_adapters = has_adapters
+        self.adapter_inter_size = adapter_inter_size
+        self.gpt_with_moe = gpt_with_moe
+        self.has_positional_encoding = has_positional_encoding
+        self.has_pre_decoder_layernorm = has_pre_decoder_layernorm
+        self.has_post_decoder_layernorm = has_post_decoder_layernorm
+        local_head_num = head_num // tensor_para_size
+        global_head_num = head_num
+        local_hidden_units = local_head_num * size_per_head
+        global_hidden_units = global_head_num * size_per_head
+        local_inter_size = local_hidden_units * 4
+        if inter_size != 0:
+            assert inter_size % tensor_para_size == 0, f"inter_size({inter_size}) \% tensor_para_size({tensor_para_size}) must be 0"
+            local_inter_size = inter_size // tensor_para_size
+        local_adapter_inter_size = self.adapter_inter_size // tensor_para_size
+        self.local_head_num = local_head_num
+        self.global_head_num = global_head_num
+        self.local_hidden_units = local_hidden_units
+        self.global_hidden_units = global_hidden_units
+        self.local_inter_size = local_inter_size
+        self.int8_mode = int8_mode
+        self.share_embed = False
+        if isinstance(weights_data_type, str):
+            try:
+                weights_data_type = {
+                    "fp16": np.float16,
+                    "fp32": np.float32,
+                    "float16": np.float16,
+                    "float32": np.float32,
+                }[weights_data_type]
+            except KeyError:
+                raise ValueError(f"Don't know how to interpret weights_data_type: {weights_data_type}")
+        assert weights_data_type in [np.float32, np.float16]
+        self.weights_data_type = weights_data_type
+        self.inference_data_type = inference_data_type
+        self.w = []
+        self.int8_w = []
+        self.scale = []
+        # Transformer blocks
+        self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[self.inference_data_type])]
+                      * layer_num)   # self_layernorm_gamma
+        self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[self.inference_data_type])]
+                      * layer_num)   # self_layernorm_beta
+        self.w.extend([torch.zeros(global_hidden_units, local_hidden_units * 3,
+                      dtype=str_type_map[self.inference_data_type])] * layer_num)   # self_kernel
+        self.w.extend([torch.zeros(local_hidden_units * 3, dtype=str_type_map[self.inference_data_type])]
+                      * layer_num)   # self_bias
+        self.w.extend(
+            [torch.zeros(local_hidden_units, global_hidden_units, dtype=str_type_map[self.inference_data_type])] *
+            layer_num)  # self_output_kernel
+        self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[self.inference_data_type])]
+                      * layer_num)   # self_output_bias
+        self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[self.inference_data_type])]
+                      * layer_num)   # ffn_layernorm_gamma
+        self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[self.inference_data_type])]
+                      * layer_num)   # ffn_layernorm_beta
+        self.w.extend(
+            [torch.zeros(global_hidden_units, local_inter_size, dtype=str_type_map[self.inference_data_type])] *
+            layer_num)  # ffn_kernel1
+        self.w.extend([torch.zeros(local_inter_size, dtype=str_type_map[self.inference_data_type])]
+                      * layer_num)   # ffn_bias1
+        self.w.extend(
+            [torch.zeros(local_inter_size, global_hidden_units, dtype=str_type_map[self.inference_data_type])] *
+            layer_num)  # ffn_kernel2
+        self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[self.inference_data_type])]
+                      * layer_num)   # ffn_bias2
+        optional_adapter_offset = 0
+        # After Transformer blocks
+        if self.has_pre_decoder_layernorm:
+            self.w.append(torch.zeros(global_hidden_units, dtype=str_type_map[
+                self.inference_data_type]))   # embedding layernorm gamma
+            self.w.append(torch.zeros(global_hidden_units, dtype=str_type_map[
+                self.inference_data_type]))   # embedding layernorm beta
+            optional_adapter_offset += 2
+        if self.has_post_decoder_layernorm:
+            self.w.append(torch.zeros(global_hidden_units, dtype=str_type_map[
+                self.inference_data_type]))   # final layernorm gamma
+            self.w.append(torch.zeros(global_hidden_units, dtype=str_type_map[
+                self.inference_data_type]))   # final layernorm beta
+            optional_adapter_offset += 2
+        if self.has_positional_encoding:
+            self.w.append(torch.zeros(max_seq_len, global_hidden_units, dtype=str_type_map[
+                self.inference_data_type]))   # position_encoding_table
+            optional_adapter_offset += 1
+        self.pre_embed_idx = len(self.w)
+        self.w.append(torch.zeros(vocab_size, global_hidden_units,
+                      dtype=str_type_map[self.inference_data_type]))   # embedding_table
+        self.post_embed_idx = len(self.w)
+        self.w.append(torch.zeros(vocab_size, global_hidden_units, dtype=str_type_map[
+            self.inference_data_type]))   # post embedding_kernel
+        self.adapter_offset = 2 + optional_adapter_offset
+        self.w.extend([torch.empty(0, dtype=str_type_map[self.inference_data_type])] * layer_num)   # gating_weight
+        self.adapter_offset += layer_num
+        # adapters
+        if self.has_adapters:
+            self.w.extend([torch.zeros(global_hidden_units, local_adapter_inter_size,
+                          dtype=str_type_map[self.inference_data_type])] * layer_num)   # adaptor1_kernel1
+            self.w.extend([torch.zeros(local_adapter_inter_size, dtype=str_type_map[
+                self.inference_data_type])] * layer_num)   # adaptor1_bias1
+            self.w.extend([torch.zeros(local_adapter_inter_size, global_hidden_units,
+                          dtype=str_type_map[self.inference_data_type])] * layer_num)   # adaptor1_kernel2
+            self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[
+                self.inference_data_type])] * layer_num)   # adaptor1_bias2
+            self.w.extend([torch.zeros(global_hidden_units, local_adapter_inter_size,
+                          dtype=str_type_map[self.inference_data_type])] * layer_num)   # adaptor2_kernel1
+            self.w.extend([torch.zeros(local_adapter_inter_size, dtype=str_type_map[
+                self.inference_data_type])] * layer_num)   # adaptor2_bias1
+            self.w.extend([torch.zeros(local_adapter_inter_size, global_hidden_units,
+                          dtype=str_type_map[self.inference_data_type])] * layer_num)   # adaptor2_kernel2
+            self.w.extend([torch.zeros(global_hidden_units, dtype=str_type_map[
+                self.inference_data_type])] * layer_num)   # adaptor2_bias2
+        # Initialization
+        # self._map(lambda w: torch.nn.init.normal_(w, mean=0., std=1.))
+        if (self.int8_mode != 0):
+            self.int8_w.extend([torch.zeros(global_hidden_units, local_hidden_units *
+                               3, dtype=torch.int8)] * layer_num)   # self_int8_kernel
+            self.scale.extend([torch.zeros(local_hidden_units * 3, dtype=torch.float)] * layer_num)   # self_scale
+            self.int8_w.extend([torch.zeros(local_hidden_units, global_hidden_units, dtype=torch.int8)]
+                               * layer_num)   # self_output_int8_kernel
+            self.scale.extend([torch.zeros(global_hidden_units, dtype=torch.float)] * layer_num)   # self_output_scale
+            self.int8_w.extend([torch.zeros(global_hidden_units, local_inter_size,
+                               dtype=torch.int8)] * layer_num)   # ffn_int8_kernel1
+            self.scale.extend([torch.zeros(local_inter_size, dtype=torch.float)] * layer_num)   # ffn_scale1
+            self.int8_w.extend([torch.zeros(local_inter_size, global_hidden_units,
+                               dtype=torch.int8)] * layer_num)   # ffn_int8_kernel2
+            self.scale.extend([torch.zeros(global_hidden_units, dtype=torch.float)] * layer_num)   # ffn_scale2
+            if self.has_adapters:
+                self.int8_w.extend([torch.zeros(global_hidden_units, local_adapter_inter_size,
+                                   dtype=torch.int8)] * layer_num)   # adaptor1_int8_kernel1
+                self.scale.extend([torch.zeros(local_adapter_inter_size, dtype=torch.float)]
+                                  * layer_num)   # adaptor1_scale1
+                self.int8_w.extend([torch.zeros(local_adapter_inter_size, global_hidden_units,
+                                   dtype=torch.int8)] * layer_num)   # adaptor1_int8_kernel2
+                self.scale.extend([torch.zeros(global_hidden_units, dtype=torch.float)] * layer_num)   # adaptor1_scale2
+                self.int8_w.extend([torch.zeros(global_hidden_units, local_adapter_inter_size,
+                                   dtype=torch.int8)] * layer_num)   # adaptor2_int8_kernel1
+                self.scale.extend([torch.zeros(local_adapter_inter_size, dtype=torch.float)]
+                                  * layer_num)   # adaptor2_scale1
+                self.int8_w.extend([torch.zeros(local_adapter_inter_size, global_hidden_units,
+                                   dtype=torch.int8)] * layer_num)   # adaptor2_int8_kernel2
+                self.scale.extend([torch.zeros(global_hidden_units, dtype=torch.float)] * layer_num)   # adaptor2_scale2
+    def __getitem__(self, idx):
+        return self.w[idx]
+    def __setitem__(self, idx, val):
+        self.w[idx] = val
+    def __len__(self):
+        return len(self.w)
+    def _map(self, func):
+        assert(self.pre_embed_idx < self.post_embed_idx,
+               "Pre decoder embedding index should be lower than post decoder embedding index.")
+        for i in range(len(self.w)):
+            if isinstance(self.w[i], list):
+                for j in range(len(self.w[i])):
+                    self.w[i][j] = func(self.w[i][j])
+            else:
+                if self.share_embed and i == self.post_embed_idx:
+                    # If sharing the pre and post embedding, any mapping to
+                    # the pre decoder weight will give the same output to the
+                    # post decoder weight, so we just copy here.
+                    self.w[self.post_embed_idx] = self.w[self.pre_embed_idx]
+                else:
+                    self.w[i] = func(self.w[i])
+    def _map_int8(self, func):
+        for i in range(len(self.int8_w)):
+            if isinstance(self.int8_w[i], list):
+                for j in range(len(self.int8_w[i])):
+                    self.int8_w[i][j] = func(self.int8_w[i][j])
+            else:
+                self.int8_w[i] = func(self.int8_w[i])
+        for i in range(len(self.scale)):
+            if isinstance(self.scale[i], list):
+                for j in range(len(self.scale[i])):
+                    self.scale[i][j] = func(self.scale[i][j])
+            else:
+                self.scale[i] = func(self.scale[i])
+    def _map_int8_scales(self, func):
+        for i in range(len(self.scale)):
+            if isinstance(self.scale[i], list):
+                for j in range(len(self.scale[i])):
+                    self.scale[i][j] = func(self.scale[i][j])
+            else:
+                self.scale[i] = func(self.scale[i])
+    def load(self, ckpt_path, tp_rank, pipeline_para_rank):
+        if not os.path.exists(ckpt_path):
+            raise FileNotFoundError(f"Failed to find {ckpt_path}")
+        w = []
+        type_map = {np.float32: torch.float32, np.float16: torch.float16}
+        # Load
+        def is_load(i): return i >= self.layers_per_device * \
+            pipeline_para_rank and i < self.layers_per_device * (pipeline_para_rank + 1)
+        h5f = h5py.File(ckpt_path, "r")
+        def load_to_torch(key, is_load: bool):
+            if is_load:
+                npdata = h5f[key]["weights"][:]
+                return torch.from_numpy(npdata).to(str_type_map[self.inference_data_type])
+            else:
+                return torch.empty(0).to(str_type_map[self.inference_data_type])
+        w.extend([load_to_torch(f"model.layers.{i}.input_layernorm.weight", is_load(i))
+                 for i in range(self.layer_num)])
+        w.extend([load_to_torch(f"model.layers.{i}.input_layernorm.bias", is_load(i))
+                 for i in range(self.layer_num)])
+        w.extend(
+            [load_to_torch(
+                f"model.layers.{i}.attention.query_key_value.weight.{tp_rank}", is_load(i))
+             for i in range(self.layer_num)])
+        w.extend([
+            load_to_torch(
+                f"model.layers.{i}.attention.query_key_value.bias.{tp_rank}", is_load(i))
+            for i in range(self.layer_num)])
+        w.extend([load_to_torch(f"model.layers.{i}.attention.dense.weight.{tp_rank}",
+                 is_load(i)) for i in range(self.layer_num)])
+        w.extend([load_to_torch(f"model.layers.{i}.attention.dense.bias", is_load(i))
+                 for i in range(self.layer_num)])
+        w.extend([load_to_torch(f"model.layers.{i}.post_attention_layernorm.weight",
+                 is_load(i)) for i in range(self.layer_num)])
+        w.extend([load_to_torch(f"model.layers.{i}.post_attention_layernorm.bias",
+                 is_load(i)) for i in range(self.layer_num)])
+        w.extend(
+            [load_to_torch(f"model.layers.{i}.mlp.dense_h_to_4h.weight.{tp_rank}", is_load(i))
+             for i in range(self.layer_num)])
+        w.extend(
+            [load_to_torch(f"model.layers.{i}.mlp.dense_h_to_4h.bias.{tp_rank}", is_load(i))
+             for i in range(self.layer_num)])
+        w.extend(
+            [load_to_torch(f"model.layers.{i}.mlp.dense_4h_to_h.weight.{tp_rank}", is_load(i))
+             for i in range(self.layer_num)])
+        w.extend([load_to_torch(f"model.layers.{i}.mlp.dense_4h_to_h.bias", is_load(i)) for i in range(self.layer_num)])
+        if self.has_pre_decoder_layernorm:
+            w.append(load_to_torch(f"model.pre_decoder_layernorm.weight", True))
+            w.append(load_to_torch(f"model.pre_decoder_layernorm.bias", True))
+        if self.has_post_decoder_layernorm:
+            w.append(load_to_torch(f"model.final_layernorm.weight", True))
+            w.append(load_to_torch(f"model.final_layernorm.bias", True))
+        if self.has_positional_encoding:
+            wpe = load_to_torch(f"model.wpe", True).reshape(-1, self.global_hidden_units)
+            assert self.max_seq_len <= wpe.size(0), (
+                f"max_seq_len ({self.max_seq_len} must not exceed "
+                f"the value of maximum sequence length during training ({wpe.size(0)})."
+            )
+            w.append(wpe)
+        w.append(load_to_torch(f"model.wte", True))
+        self.share_embed = True
+        w.append(torch.empty(0).to(str_type_map[self.inference_data_type]))
+        gate_list = []
+        for i in range(self.layer_num):
+            gate_list.append(load_to_torch(f"model.layers.{i}.mlp.moe.gate.wg.weight", False))
+        w.extend(gate_list)
+        if self.has_adapters:
+            w.extend(
+                [load_to_torch(
+                    f"model.layers.{i}.after_attention_adapter.dense_h_to_4h.weight.{tp_rank}", is_load(i))
+                 for i in range(self.layer_num)])
+            w.extend([
+                load_to_torch(
+                    f"model.layers.{i}.after_attention_adapter.dense_h_to_4h.bias.{tp_rank}", is_load(i))
+                for i in range(self.layer_num)])
+            w.extend(
+                [load_to_torch(
+                    f"model.layers.{i}.after_attention_adapter.dense_4h_to_h.weight.{tp_rank}", is_load(i))
+                 for i in range(self.layer_num)])
+            w.extend(
+                [load_to_torch(f"model.layers.{i}.after_attention_adapter.dense_4h_to_h.bias", is_load(i))
+                 for i in range(self.layer_num)])
+            w.extend(
+                [load_to_torch(f"model.layers.{i}.after_ffn_adapter.dense_h_to_4h.weight.{tp_rank}", is_load(i))
+                 for i in range(self.layer_num)])
+            w.extend(
+                [load_to_torch(f"model.layers.{i}.after_ffn_adapter.dense_h_to_4h.bias.{tp_rank}", is_load(i))
+                 for i in range(self.layer_num)])
+            w.extend(
+                [load_to_torch(f"model.layers.{i}.after_ffn_adapter.dense_4h_to_h.weight.{tp_rank}", is_load(i))
+                 for i in range(self.layer_num)])
+            w.extend([load_to_torch(
+                f"model.layers.{i}.after_ffn_adapter.dense_4h_to_h.bias", is_load(i)) for i in range(self.layer_num)])
+        assert len(self.w) == len(w)
+        # Reshape
+        try:
+            for i in range(len(w)):
+                if w[i].nelement() == self.w[i].nelement():
+                    self.w[i] = w[i].reshape(self.w[i].shape)
+                else:
+                    self.w[i] = w[i]
+        except RuntimeError:
+            raise RuntimeError(
+                f"head_num, size_per_head, vocab_size, and max_seq_len must be the same as the ones during training "
+                f"(idx: {i} expected shape: {self.w[i].shape} got shape: {w[i].shape})."
+            )
+        # transpose calibrate quantize the kernel
+        layer_num = self.layer_num
+        if self.int8_mode != 0:
+            for i in range(layer_num):
+                self.int8_w[i + 0 * layer_num], self.scale[i + 0 *
+                                                           layer_num] = self.weight_transpose_calibrate_quantize(self.w[2 * layer_num + i])
+                self.int8_w[i + 1 * layer_num], self.scale[i + 1 *
+                                                           layer_num] = self.weight_transpose_calibrate_quantize(self.w[4 * layer_num + i])
+                self.int8_w[i + 2 * layer_num], self.scale[i + 2 *
+                                                           layer_num] = self.weight_transpose_calibrate_quantize(self.w[8 * layer_num + i])
+                self.int8_w[i + 3 * layer_num], self.scale[i + 3 *
+                                                           layer_num] = self.weight_transpose_calibrate_quantize(self.w[10 * layer_num + i])
+                # We clear the original weights since they are no longer needed
+                if self.int8_mode == 1:
+                    self.w[2 * layer_num + i] = torch.empty(0).to(str_type_map[self.inference_data_type])
+                    self.w[4 * layer_num + i] = torch.empty(0).to(str_type_map[self.inference_data_type])
+                    self.w[8 * layer_num + i] = torch.empty(0).to(str_type_map[self.inference_data_type])
+                    self.w[10 * layer_num + i] = torch.empty(0).to(str_type_map[self.inference_data_type])
+                if self.has_adapters:
+                    self.int8_w[i + 4 * layer_num], self.scale[i + 4 * layer_num] = self.weight_transpose_calibrate_quantize(
+                        self.w[12 * layer_num + i + self.adapter_offset])
+                    self.int8_w[i + 5 * layer_num], self.scale[i + 5 * layer_num] = self.weight_transpose_calibrate_quantize(
+                        self.w[14 * layer_num + i + self.adapter_offset])
+                    self.int8_w[i + 6 * layer_num], self.scale[i + 6 * layer_num] = self.weight_transpose_calibrate_quantize(
+                        self.w[16 * layer_num + i + self.adapter_offset])
+                    self.int8_w[i + 7 * layer_num], self.scale[i + 7 * layer_num] = self.weight_transpose_calibrate_quantize(
+                        self.w[18 * layer_num + i + self.adapter_offset])
+                    # Similar to above:
+                    if self.int8_mode == 1:
+                        self.w[12 * layer_num + i + self.adapter_offset] = torch.empty(
+                            0).to(str_type_map[self.inference_data_type])
+                        self.w[14 * layer_num + i + self.adapter_offset] = torch.empty(
+                            0).to(str_type_map[self.inference_data_type])
+                        self.w[16 * layer_num + i + self.adapter_offset] = torch.empty(
+                            0).to(str_type_map[self.inference_data_type])
+                        self.w[18 * layer_num + i + self.adapter_offset] = torch.empty(
+                            0).to(str_type_map[self.inference_data_type])
+        return True
 class ChatGLM6BModel(nn.Module):
     def __init__(self,
                  head_num, size_per_head,
                  tensor_para_size: int,
                  pipeline_para_size: int,
                  inference_data_type: str,
                  inter_size: int = 0,
                  # glm_variant_params
                  layernorm_eps: float = 1e-5,
         self.layer_num = layer_num
         self.inter_size = inter_size if inter_size != 0 else 4 * self.head_num * self.size_per_head
         self.arch = arch
         # gpt_variant_params
         self.layernorm_eps = layernorm_eps
         self.layernorm_type = layernorm_type
         assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
         assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."
         # Load the C++ model into Pytorch model.
         if arch == "Ampere":
+            lib_path = pathlib.Path(__file__).parent / "ftlib" / "libth_transformer_sm80.so"
         elif arch == "Volta":
+            lib_path = pathlib.Path(__file__).parent / "ftlib" / "libth_transformer_sm70.so"
+        torch.classes.load_library(os.path.abspath(lib_path))
+        # Prepare weights
+        self.weights = ChatGLM6BWeights(head_num, size_per_head, layer_num, vocab_size,
+                                        max_seq_len, tensor_para_size, pipeline_para_size,
+                                        weights_data_type=weights_data_type,
+                                        inference_data_type=inference_data_type,
+                                        gpt_with_moe=self.gpt_with_moe,
+                                        has_positional_encoding=self.has_positional_encoding,
+                                        has_pre_decoder_layernorm=self.has_pre_decoder_layernorm,
+                                        has_post_decoder_layernorm=self.has_post_decoder_layernorm,
+                                        has_adapters=self.has_adapters,
+                                        adapter_inter_size=self.adapter_inter_size,
+                                        int8_mode=int8_mode,
+                                        inter_size=inter_size)
+        # Prepare for tensor/pipeline parallel
+        try:
+            dist.init_process_group(backend='mpi')
+        except:
+            print("[INFO] WARNING: Have initialized the process group")
+        self.rank = dist.get_rank()
+        self.device_count = torch.cuda.device_count()
+        self.device = self.rank % self.device_count
+        torch.cuda.set_device(self.device)
+        world_size = dist.get_world_size()
+        assert world_size == tensor_para_size * pipeline_para_size, "tensor_para_size * pipeline_para_size must be equal to world_size."
+        self.tensor_para_rank = self.rank % self.tensor_para_size
+        self.pipeline_para_rank = self.rank // self.tensor_para_size
+    def load(self, ckpt_path):
+        """Load this rank's checkpoint weights, then rebuild the model on the GPU.
+
+        Delegates to ``self.weights.load`` using this process's tensor-parallel
+        and pipeline-parallel ranks, calls ``self.cuda()``, and finally releases
+        cached CUDA memory.
+
+        Args:
+            ckpt_path: Checkpoint location, forwarded unchanged to
+                ``self.weights.load``.
+
+        Returns:
+            Whatever ``self.weights.load`` returned.
+        """
+        load_status = self.weights.load(
+            ckpt_path,
+            tp_rank=self.tensor_para_rank,
+            pipeline_para_rank=self.pipeline_para_rank,
+        )
+        self.cuda()
+        # Drop cached allocations left over from model weight preprocessing.
+        torch.cuda.empty_cache()
+        return load_status
+    def sparse(self):
+        """Set ``use_sparse_gemm`` to True unless it is already truthy."""
+        if self.use_sparse_gemm:
+            # Already enabled; leave the current value untouched.
+            return
+        self.use_sparse_gemm = True
+    def cuda(self):
+        """Move the weights to ``self.device`` and (re)create the GlmOp model.
+
+        Every tensor in ``self.weights`` is mapped through ``w.cuda(self.device)``;
+        when ``self.int8_mode`` is non-zero the int8 weights are mapped the same
+        way. Any previously built ``self.model`` is deleted before a new
+        ``torch.classes.FasterTransformer.GlmOp`` is constructed, after which
+        ``self.build_model`` is set to True.
+
+        NOTE(review): the GlmOp argument list shown here looks shortened by
+        collapsed diff context -- confirm against the full file before editing.
+        """
+        self.weights._map(lambda w: w.cuda(self.device))
+        if self.int8_mode != 0:
+            self.weights._map_int8(lambda w: w.cuda(self.device))
+        # Discard the model from an earlier build before constructing a new one.
+        if self.build_model:
+            del self.model
+            self.build_model = False
         self.model = torch.classes.FasterTransformer.GlmOp(
             self.head_num, self.size_per_head, self.inter_size,
             self.has_adapters,
             self.adapter_inter_size,
             self.use_attention_linear_bias,
+            self.weights.w,
+            self.weights.int8_w,
+            self.weights.scale,
             self.shared_contexts_ratio)
         self.build_model = True
                 bad_words_list: typing.Optional[torch.IntTensor] = None,
                 return_output_length: bool = False,
                 return_cum_log_probs: int = 0):
+        if not self.build_model:
+            # for the cases we don't load model
+            self.cuda()
+            torch.cuda.empty_cache()  # clean cache for model weight preprocessing
         input_len = start_ids.size(1)
         assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."

models/1-gpu-fp16.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9bab22c98c57766bc31410c819858fa704490ca76dc04df7331d188c56fba1b1
-size 12346572800

lyraChatGLM/ftlib/libth_transformer_sm70_cu11.so → models/1-gpu-fp16.h5 RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0826346c748380e8e9fdd7e1f7130bad0f2485a65a8ecd4beb33d19e85c4d79e
-size 114280392

 version https://git-lfs.github.com/spec/v1
+oid sha256:3012c698d6084bf154f78bd9c0734ba8026670a16ac3f3944b41476472f1561a
+size 12347066528

requirements.txt CHANGED Viewed

@@ -5,5 +5,4 @@ huggingface_hub
 numpy
 setuptools
 torch
-h5py
 protobuf==3.20.3

 numpy
 setuptools
 torch
 protobuf==3.20.3