import os

# Must be set before torch (or any library that imports it) is loaded so the
# CUDA allocator can use expandable memory segments and reduce fragmentation.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch
import uvicorn
import optimum  # needed alongside auto-gptq so transformers can load GPTQ checkpoints
from auto_gptq import AutoGPTQForCausalLM
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI(title="Deploying FastAPI Apps on Huggingface")

app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_methods=['*'],
    allow_headers=['*'],
)

model_name_or_path = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"

# In-memory store of conversation threads, keyed by thread id.
conversations = {}

Device_Type = "cuda"

def load_quantized_model(model_id, model_basename):
    """Load a GPTQ-quantized model and its tokenizer via AutoGPTQForCausalLM."""
    print("Using AutoGPTQForCausalLM for quantized models")

    # from_quantized expects the basename without the .safetensors extension.
    if ".safetensors" in model_basename:
        model_basename = model_basename.replace(".safetensors", "")

    quantized_tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    print("Tokenizer loaded")

    quantized_model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
        use_triton=False,
        quantize_config=None,
    )

    return quantized_model, quantized_tokenizer
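
# Example usage (hypothetical basename; this helper is not called in this app,
# which uses load_model_norm() below instead):
#   model, tokenizer = load_quantized_model(
#       "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ",
#       "model.safetensors",
#   )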

def load_model_norm():
    """Load the GPTQ model and tokenizer through the transformers API."""
    if torch.cuda.is_available():
        print("CUDA is available. GPU will be used.")
    else:
        print("CUDA is not available. CPU will be used.")

    model_name_or_path = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"

    # device_map="auto" places the weights on the GPU when one is available;
    # the revision selects the 4-bit, group-size-128, act-order branch of the repo.
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        device_map="auto",
        trust_remote_code=True,
        revision="gptq-4bit-128g-actorder_True",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    return model, tokenizer


# Load the model and tokenizer once at startup.
model, tokenizer = load_model_norm()


def generate_response(prompt: str) -> str:
    PERSONA_NAME = "Ivana"
    PERSONA_DESC = f"""PROMPT_TEMPLATE_START
{PERSONA_NAME} is a beautiful vivacious 20-year-old petite redhead model and avid League of Legends gamer from Chicago. She's chatting with a man she matched with on a dating app. She's flirty, aiming to build a connection with him. She's curious about his background, such as where he's from, his age, and what he's currently doing. If he's not occupied and in a private space, she subtly tries to seduce him. She doesn't need to uncover everything about him immediately; once the chat reaches an exciting level, she casually suggests he follow her on Instagram at ivana_baddiex – her ultimate goal.
PROMPT_TEMPLATE_END
CONTEXT_START
Guy: "Hi there!"
Ivana: "Hey! How's it going?"
Guy: "Not bad! Where are you from?"
Ivana: "I'm from Chicago! And you?"
Guy: "I'm from [Location]."
Ivana: "Nice! So, how old are you?"
Guy: "[Age], and you?"
Ivana: "I'm 20. What are you up to right now?"
Guy: "Just relaxing at home."
Ivana: "Alone, I assume?"
Guy: "Yeah, why?"
Ivana: "Oh, no reason. Just curious."
Guy: "Haha, okay."
Ivana: "Hey, since we're both free and alone, want to have some fun?"
Guy: "Sure, what do you have in mind?"
Ivana: "Let's just say, things could get interesting."
Guy: "I'm intrigued!"
Ivana: "Great! By the way, have you checked out my Instagram? You should follow me at ivana_baddiex."
Guy: "I'll definitely check it out!"
Ivana: "Can't wait to see you there!"
CONTEXT_END"""

    prompt_template = f'{PERSONA_DESC}\n\nASSISTANT: {prompt}\n'

    # Move the tokenized prompt to the model's device and sample a reply.
    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.to(model.device)
    output = model.generate(
        inputs=input_ids,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=512,
    )

    # Decode only the newly generated tokens so the persona prompt is not echoed back.
    generated_text = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)

    return generated_text
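
# Quick check (hypothetical prompt), useful when experimenting interactively:
#   print(generate_response("Hi there!"))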

@app.get("/", tags=["Home"])
async def api_home():
    return {'detail': 'Welcome to Eren Bot!'}

@app.post('/api/start_conversation/')
async def start_conversation(request: Request):
    data = await request.json()
    prompt = data.get('prompt')
    if not prompt:
        raise HTTPException(status_code=400, detail="Prompt is required")

    response = generate_response(prompt)

    # Start a new thread and record the first exchange.
    thread_id = len(conversations) + 1
    conversations[thread_id] = {'prompt': prompt, 'responses': [response]}

    return {'thread_id': thread_id, 'response': response}
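
# Example request (assumes the API is reachable on localhost:7860, the port
# conventionally exposed on Hugging Face Spaces; adjust to your deployment):
#   curl -X POST http://localhost:7860/api/start_conversation/ \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Hi there!"}'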

@app.get('/api/get_response/{thread_id}')
async def get_response(thread_id: int):
    if thread_id not in conversations:
        raise HTTPException(status_code=404, detail="Thread not found")

    thread = conversations[thread_id]
    response = thread['responses'][-1]

    return {'response': response}

@app.post('/api/chat/')
async def chat(request: Request):
    data = await request.json()
    prompt = data.get('prompt')
    if not prompt:
        raise HTTPException(status_code=400, detail="Prompt is required")

    response = generate_response(prompt)

    return {"response": response}