import random

import torch
from quart import Quart, request  # "request" is used in the handler below but was missing from the import
from transformers import AutoTokenizer, AutoModelForCausalLM

app = Quart(__name__)

# Load the OpenBuddy OpenLLaMA 3B model and its tokenizer once at startup.
tokenizer = AutoTokenizer.from_pretrained("OpenBuddy/openbuddy-openllama-3b-v10-bf16")
model = AutoModelForCausalLM.from_pretrained("OpenBuddy/openbuddy-openllama-3b-v10-bf16")
model.eval()  # inference only

# The system prompt lives in a separate file one directory up.
with open('../system.prompt', 'r', encoding='utf-8') as f:
    prompt = f.read()
# Route decorator restored; the "/echo" path is an assumption inferred from the handler name.
@app.post("/echo")
async def echo():
    data = await request.get_json()
    # Cap the requested generation length at 500 tokens.
    if data.get("max_tokens") is not None and data["max_tokens"] > 500:
        data["max_tokens"] = 500
    user_prompt = prompt + "\n\nUser: " + data['request'] + "\nAssistant: "
    # Encode the full conversation prompt (the original encoded the bare system prompt by mistake).
    input_ids = tokenizer.encode(user_prompt, return_tensors='pt')
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            # Sampling settings are deliberately randomized per request;
            # temperature only takes effect when do_sample is True.
            do_sample=random.choice([True, False]),
            temperature=random.randint(7, 20) / 10.0,
            max_new_tokens=data.get("max_tokens") or random.randint(200, 500),  # was random.randomint (typo)
            eos_token_id=tokenizer.eos_token_id,
        )
    # generate() has no return_full_text option (that belongs to pipeline()),
    # so strip the prompt tokens manually to return only the completion.
    completion_ids = output_ids[0][input_ids.shape[-1]:]
    return {"output": tokenizer.decode(completion_ids, skip_special_tokens=True)}
@app.get("/")  # root route restored; path assumed
async def get():
    return "Better to run this on your own container."

if __name__ == "__main__":
    app.run()  # Quart's built-in development server