"""Minimal Gradio chat demo backed by the ``bigscience/bloom-1b7`` checkpoint."""

import gradio as gr
import torch
# BloomModel is no longer used below (it has no language-model head, so it
# cannot drive ``generate``); the import is kept so the module still exposes it.
from transformers import BloomForCausalLM, BloomModel, BloomTokenizerFast  # noqa: F401

mname = "bigscience/bloom-1b7"

# Maximum number of prompt tokens fed to the model on each turn.
MAX_CONTEXT_TOKENS = 256
# Number of tokens the model may add per reply. Set explicitly because the
# default generation length budget also counts the prompt tokens.
MAX_NEW_TOKENS = 50

tokenizer = BloomTokenizerFast.from_pretrained(mname, use_cache=True)
model = BloomForCausalLM.from_pretrained(mname, use_cache=True)


def take_last_tokens(inputs, note_history, history, max_tokens=MAX_CONTEXT_TOKENS):
    """Keep only the last ``max_tokens`` tokens of the encoded prompt.

    Args:
        inputs: Tokenizer output holding 2-D ``input_ids`` and
            ``attention_mask`` tensors; updated in place when truncated.
        note_history: One-element list whose item is the prompt text.
        history: List of previous chat turns.
        max_tokens: Size of the token window to keep (default 256).

    Returns:
        The tuple ``(inputs, note_history, history)``. When the prompt was
        longer than ``max_tokens``, the tensors are cut to their trailing
        ``max_tokens`` columns, the first two space-separated words are
        dropped from the prompt text and the oldest turn is dropped from
        ``history``. Otherwise all three are returned unchanged.
    """
    if inputs['input_ids'].shape[1] > max_tokens:
        # Slice along the sequence axis instead of round-tripping via lists.
        inputs['input_ids'] = inputs['input_ids'][:, -max_tokens:]
        inputs['attention_mask'] = inputs['attention_mask'][:, -max_tokens:]
        # Rough text-side trimming so the bookkeeping shrinks with the tensors.
        note_history = [' '.join(note_history[0].split(' ')[2:])]
        history = history[1:]
    return inputs, note_history, history


def add_note_to_history(note, note_history):
    """Return a one-element list with ``note`` appended to the joined history.

    Args:
        note: Text to add at the end.
        note_history: List of strings already in the history; not modified.

    Returns:
        ``[text]`` where ``text`` is every item of ``note_history`` followed
        by ``note``, joined with single spaces.
    """
    return [' '.join([*note_history, note])]


def chat(message, history):
    """Generate a reply to ``message`` given the previous turns.

    Args:
        message: The new user message.
        history: List of ``(user, bot)`` pairs from earlier turns, or a falsy
            value on the first call.

    Returns:
        ``(history, history)``: the updated list of turns, once for the
        displayed output and once for the Gradio state.
    """
    history = history or []

    # Flatten the previous turns and the new message into a single prompt.
    past_turns = [f"{turn[0]} {turn[1]}" for turn in history]
    prompt = add_note_to_history(message, past_turns)

    inputs = tokenizer(prompt, return_tensors="pt")
    inputs, prompt, history = take_last_tokens(inputs, prompt, history)

    reply_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # The generated sequence starts with the prompt tokens; decode only what
    # the model added so the reply does not echo the whole conversation.
    prompt_length = inputs['input_ids'].shape[1]
    response = tokenizer.decode(
        reply_ids[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Store the actual (user, bot) pair rather than the last two words of the
    # concatenated transcript.
    history.append((message, response))
    return history, history


gr.Interface(
    fn=chat,
    theme="huggingface",
    css=".footer {display:none !important}",
    inputs=["text", "state"],
    outputs=["message", "state"],
    title="Bloom 1b7 chat",
    allow_flagging="never",
).launch()