Spaces:

tafxle
/

Bloom_chat

Runtime error

File size: 2,143 Bytes

59afef3
09c0a40
d8e2347
09c0a40
d8e2347
09c0a40
 
d30d41e
d8e2347
d45c55b
 
d8e2347
09c0a40
 
 
 
 
 
d8e2347
 
09c0a40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8e2347
09c0a40
 
 
d8e2347
 
 
 
d30d41e
 
d8e2347
 
 
 
 
 
 
d30d41e
09c0a40
d8e2347
09c0a40
d30d41e

import torch
import transformers
import time
from huggingface_hub import hf_hub_download
import streamlit as st


@st.cache(allow_output_mutation=True)
def load_model():
    """Fetch the model checkpoint and the tokenizer used by this app.

    Downloads "model.pt" from the "OpenDungeon/gpt-j-8bit-ffbgem" hub repo,
    deserializes it with ``torch.load``, and loads the "EleutherAI/gpt-j-6B"
    tokenizer.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint_path = hf_hub_download("OpenDungeon/gpt-j-8bit-ffbgem", "model.pt")
    # NOTE(review): torch.load unpickles the file, so this fully trusts the
    # contents of the remote repo.
    model_obj = torch.load(checkpoint_path)
    tok = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
    return tok, model_obj


def PrintContinuation(prompt, local_model, single_hook=None, batch=1, limit_tokens=50, text_tokenizer=None):
    """Sample ``batch`` continuations of ``prompt``, one token per step.

    Each step runs ``local_model.forward`` with the cached ``past_key_values``
    from the previous step, adds 10 to the ten largest logits of every row,
    and samples the next token from the softmax of the result.  After the
    first step only the newly sampled tokens are fed back to the model.

    Args:
        prompt: Text that every variant starts from.
        local_model: Object whose ``forward`` accepts the tokenizer output plus
            ``use_cache`` / ``past_key_values`` and returns an object exposing
            ``logits`` and ``past_key_values``.
        single_hook: Optional callable invoked after every step with the
            decoded token of the first variant (used for live streaming).
        batch: Number of variants sampled in parallel.
        limit_tokens: Index of the last generation step; steps
            ``0..limit_tokens`` are executed, i.e. ``limit_tokens + 1`` tokens
            per variant.  A negative value generates nothing.
        text_tokenizer: Tokenizer to use; defaults to the module-level
            ``tokenizer``.

    Returns:
        ``(texts, seconds_per_step)`` where ``texts`` holds one generated
        string per variant and ``seconds_per_step`` is the mean wall-clock
        time of a step.  The first five steps are excluded as warm-up when
        more than five steps run; otherwise every step is timed.  It is ``0``
        when no step was executed.
    """
    if text_tokenizer is None:
        # Fall back to the tokenizer the script creates at module level.
        text_tokenizer = tokenizer

    total_steps = limit_tokens + 1
    # Previously the timer only started at step 5, so a short run reached the
    # timing code with start_time unset.  Skip the warm-up when the run is
    # too short to have one.
    warmup_steps = 5 if total_steps > 5 else 0

    past_key_values = None  # attention cache carried from step to step
    input_dict = text_tokenizer([prompt] * batch, return_tensors='pt', padding=False)
    pieces = [[] for _ in range(batch)]
    start_time = None

    with torch.inference_mode():
        for step in range(total_steps):
            if step == warmup_steps:
                start_time = time.perf_counter()

            outputs = local_model.forward(**input_dict, use_cache=True, past_key_values=past_key_values)
            last_logits = outputs.logits[:, -1]

            # Favour the ten most likely tokens of every row before sampling.
            top_k = min(10, last_logits.shape[-1])
            for j in range(last_logits.shape[0]):
                last_logits[j, last_logits[j].topk(k=top_k).indices] += 10

            past_key_values = outputs.past_key_values
            token_ix = torch.multinomial(last_logits.softmax(-1), 1)
            for stream, ix in zip(pieces, token_ix):
                stream.append(text_tokenizer.decode(ix))

            if single_hook is not None:
                single_hook(text_tokenizer.decode(token_ix[0]))

            # The cache already covers the history; feed only the new tokens.
            input_dict = dict(input_ids=token_ix)

    batch_time = 0
    if start_time is not None:
        batch_time = (time.perf_counter() - start_time) / (total_steps - warmup_steps)
    return ["".join(stream) for stream in pieces], batch_time


# Load the tokenizer and model, then render the input widgets.
tokenizer, model = load_model()
text = st.text_area("Prefix", value="DM: You enter the room.")
# At least one variant is required: generation decodes the first stream for
# the live preview, which fails on an empty batch.
batch = st.number_input("Variants", min_value=1, value=5)

# Placeholder that shows the streamed preview first and the final page later.
t = st.empty()
# Text streamed so far for the first variant.
firstline = ""

def PrintSome(text):
    """Append ``text`` to the streamed preview and redraw it.

    Extends the module-level ``firstline`` with ``text`` and re-renders the
    module-level placeholder ``t`` with the accumulated text followed by
    "...".
    """
    global firstline  # rebound below; ``t`` is only read, so it needs no declaration
    firstline = firstline + text
    preview = f"{firstline}..."
    t.markdown(preview)

# Generate the variants; PrintSome streams the first one into the placeholder
# while tokens are being sampled.
choices, batch_time = PrintContinuation(text, model, PrintSome, batch, 50)

# One markdown section per generated variant, numbered from 1.
final_page = "".join(
    f"## choice №{number}  \n{choice}  \n______  \n"
    for number, choice in enumerate(choices, start=1)
)
# Append the timing summary; assigning here instead would throw away every
# choice section built above.
final_page += f"Seconds per batch: {batch_time}, Batch: {batch}"

# Replace the streamed preview with the complete result page.
t.markdown(final_page)