ChatGLM-6B

Runtime error

File size: 5,992 Bytes

from transformers import AutoModel, AutoTokenizer, LlamaTokenizer, LlamaForCausalLM
import gradio as gr
import torch
import os
import io
import sys
import platform
import intel_extension_for_pytorch as ipex
import intel_extension_for_pytorch._C as ipex_core
from cpuinfo import get_cpu_info
from contextlib import redirect_stdout


# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Paths whose inode numbers are compared by in_chroot().
ROOT = '/'
SELF_ROOT = '/proc/self/root'

# The tokenizer and model are loaded once, at import time, and shared by
# every call to predict().
tokenizer = LlamaTokenizer.from_pretrained(
    "lmsys/vicuna-7b-v1.3", trust_remote_code=True
)
# Move the weights to DEVICE and switch to inference (eval) mode in one
# chain; both `.to()` and `.eval()` return the module itself.
model = (
    LlamaForCausalLM.from_pretrained(
        "lmsys/vicuna-7b-v1.3", trust_remote_code=True
    )
    .to(DEVICE)
    .eval()
)


def in_chroot():
    '''
    Report whether the process appears to be running inside a chroot.

    The inode number of ROOT is compared with the inode number of
    SELF_ROOT. The result is True when the root inode is not one of the
    well-known root inode numbers, or when the two inode numbers differ.

    Terminates the process via sys.exit() if either path does not exist.
    '''
    try:
        # Stat ROOT first, then SELF_ROOT.
        root_inode, self_inode = [
            os.stat(path).st_ino for path in (ROOT, SELF_ROOT)
        ]
    except FileNotFoundError as e:
        sys.exit(f"ERROR: Failed to stat: {e}")

    # Most filesystems use inode 2 for their root directory; XFS uses 128.
    # Any other value is treated as evidence of a chroot.
    well_known_root_inodes = (2, 128)
    if root_inode not in well_known_root_inodes:
        return True

    return root_inode != self_inode


def get_features():
    '''
    Build a dictionary describing which platform features were detected.

    key: human-readable feature name.
    value: Boolean, True when the feature was detected.

    Most entries are derived from the CPU flag list reported by
    get_cpu_info(); the two "AMX IPEX" entries come from the ISA levels
    reported by the IPEX core module.
    '''

    cpu_flags = get_cpu_info()["flags"]

    # (display label, CPU flag name) pairs, in display order.
    flag_backed = (
        ('VM', 'hypervisor'),
        ('TDX TD', 'tdx_guest'),
        ('AMX available', 'amx_tile'),
        ('AMX-BF16 available', 'amx_bf16'),
        ('AMX-INT8 available', 'amx_int8'),
        ('AVX-VNNI available', 'avx_vnni'),
        ('AVX512-VNNI available', 'avx512_vnni'),
        ('AVX512-FP16 available', 'avx512_fp16'),
        ('AVX512-BF16 available', 'avx512_bf16'),
    )

    features = {label: flag in cpu_flags for label, flag in flag_backed}

    # IPEX reports ISA levels as strings; 'AMX' is the level of interest.
    features['AMX IPEX available'] = (
        ipex_core._get_highest_cpu_support_isa_level() == 'AMX'
    )
    features['AMX IPEX enabled'] = ipex_core._get_current_isa_level() == 'AMX'

    return features


def get_debug_details():
    '''
    Build the markdown shown in the collapsible "debug details" section.

    The returned text is a `<details>` element wrapping a table of
    CPU, Python, PyTorch, IPEX and oneDNN information, plus the result of
    the chroot check.
    '''

    # ipex.version() writes its report to stdout rather than returning it,
    # so temporarily point stdout at an in-memory buffer.
    with redirect_stdout(io.StringIO()) as captured:
        ipex.version()

    # Collapse the multi-line report so it fits in a single table cell.
    ipex_version_details = ", ".join(captured.getvalue().split("\n"))

    current_isa = ipex_core._get_current_isa_level()
    max_isa = ipex_core._get_highest_cpu_support_isa_level()

    aten_capability = os.getenv('ATEN_CPU_CAPABILITY')
    onednn_max_isa = os.getenv('ONEDNN_MAX_CPU_ISA')

    chrooted = in_chroot()

    cpu = get_cpu_info()
    cpu_flags = cpu["flags"]

    # A gradio Accordion could be used instead of `<details>`, but the
    # markdown version is more visually compact.
    return f"""
    <details>
      <summary>Click to show debug details</summary>

      | Feature | Value |
      |-|-|
      | Arch | `{cpu['arch']}` |
      | CPU | `{cpu['brand_raw']}` |
      | CPU flags | `{cpu_flags}` |
      | Python version | `{sys.version}` (implementation: `{platform.python_implementation()}`) |
      | Python version details | `{sys.version_info}` |
      | PyTorch version | `{torch.__version__}` |
      | IPEX version | `{ipex.ipex_version}` |
      | IPEX CPU detected | `{ipex_core._has_cpu()}` |
      | IPEX XPU detected | `{ipex_core._has_xpu()}` |
      | IPEX version details | `{ipex_version_details}` |
      | IPEX env var `ATEN_CPU_CAPABILITY` | `{aten_capability}` |
      | IPEX current ISA level | `{current_isa}` |
      | IPEX max ISA level | `{max_isa}` |
      | oneDNN env var `ONEDNN_MAX_CPU_ISA` | `{onednn_max_isa}` |
      | in chroot | `{chrooted}` |

    </details>
    """


def predict(input, history=None):
    '''
    Generate the next reply and return the updated conversation.

    input: text of the new user message.
    history: token ids of the conversation so far, in the nested-list
        form previously returned by this function, or None / an empty
        list when the conversation is just starting.

    Returns a tuple `(response, history)`:
        response: list of (user_text, bot_text) tuples for gr.Chatbot.
        history: token ids of the whole conversation including the newly
            generated reply, as a nested list.
    '''
    # Terminate the user turn with the tokenizer's EOS token so turns can
    # be separated again after decoding.
    new_user_input_ids = tokenizer.encode(
        input + tokenizer.eos_token, return_tensors='pt'
    )

    # Only prepend earlier turns when there are any, rather than relying on
    # torch.cat's special handling of empty tensors.
    if history:
        bot_input_ids = torch.cat(
            [torch.LongTensor(history), new_user_input_ids], dim=-1
        )
    else:
        bot_input_ids = new_user_input_ids

    # The inputs must live on the same device as the model weights,
    # otherwise generation fails when the model is on the GPU.
    bot_input_ids = bot_input_ids.to(model.device)

    # NOTE(review): max_length caps prompt + reply at 1000 tokens in total,
    # so long conversations leave little or no room for a reply.
    history = model.generate(
        bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id
    ).tolist()

    text = tokenizer.decode(history[0])

    # Drop BOS markers (if the tokenizer defines one) so they are not shown
    # in the chat window.
    if tokenizer.bos_token:
        text = text.replace(tokenizer.bos_token, "")

    # Split on the tokenizer's own EOS token; a hard-coded "<|endoftext|>"
    # only matches GPT-2 style tokenizers and would never split here.
    turns = [turn.strip() for turn in text.split(tokenizer.eos_token)]

    # Pair consecutive turns as (user, bot); a trailing unpaired segment
    # (e.g. the empty string after the final EOS) is ignored.
    response = [
        (turns[i], turns[i + 1]) for i in range(0, len(turns) - 1, 2)
    ]
    return response, history


# Build the web UI: a chat area wired to predict(), a read-only summary of
# the detected platform features, and a collapsible debug section.
with gr.Blocks() as demo:
    gr.Markdown(
        '''## Confidential HuggingFace Runner
    '''
    )
    # Per-session conversation history (token ids) passed to and returned
    # from predict().
    state = gr.State([])
    # NOTE(review): `.style()` is kept as in the original; newer gradio
    # releases may expect these options as constructor arguments instead -
    # confirm against the installed gradio version.
    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=400)
    with gr.Row():
        with gr.Column(scale=4):
            txt = gr.Textbox(
                show_label=False, placeholder="Enter text and press enter"
            ).style(container=False)
        with gr.Column(scale=1):
            button = gr.Button("Generate")
    # Pressing enter in the textbox and clicking the button do the same thing.
    txt.submit(predict, [txt, state], [chatbot, state])
    button.click(predict, [txt, state], [chatbot, state])

    with gr.Row():
        features_dict = get_features()

        # Pass a real list of choices rather than a dict view.
        all_features = list(features_dict)

        # Get a list of feature names that are actually set/available
        set_features = [name for name, found in features_dict.items() if found]

        gr.CheckboxGroup(
            all_features,
            label="Features",
            # Make the boxes read-only
            interactive=False,
            # Specify which features were detected
            value=set_features,
            info="Features detected from environment",
        )

    with gr.Row():
        debug_details = get_debug_details()
        gr.Markdown(debug_details)

demo.queue().launch(share=True, server_name="0.0.0.0")