import uvicorn
from fastapi import FastAPI, HTTPException, Request
from auto_gptq import AutoGPTQForCausalLM
import torch
import optimum
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaForCausalLM,
    LlamaTokenizer,
    GenerationConfig,
    pipeline,
)

app = FastAPI(title="Deploying FastAPI Apps on Huggingface")

# Hugging Face repository of the quantized model served by this app.
model_name_or_path = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"

# In-memory store of conversation threads, keyed by integer thread id.
# Contents are lost whenever the process restarts.
conversations = {}

Device_Type = "cuda"

# Upper bound on the number of tokens generated per reply (prompt excluded).
MAX_NEW_TOKENS = 512


def load_quantized_model(model_id, model_basename):
    """Load a GPTQ-quantized causal LM and its tokenizer.

    Args:
        model_id: Hugging Face repository id of the quantized model.
        model_basename: Base name of the weights file; a ".safetensors"
            suffix is stripped before it is handed to AutoGPTQ.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # The code supports all huggingface models that end with GPTQ and have
    # some variation of .no-act.order or .safetensors in their HF repo.
    print("Using AutoGPTQForCausalLM for quantized models")

    # Drop the ".safetensors" ending if present.
    model_basename = model_basename.replace(".safetensors", "")

    quantized_tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    print("Tokenizer loaded")

    quantized_model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
        use_triton=False,
        quantize_config=None,
    )
    return quantized_model, quantized_tokenizer


# Load the model once at import time so every request shares it.
model, tokenizer = load_quantized_model(model_name_or_path, "model.safetensors")


def load_model_norm():
    """Load the model through plain transformers instead of AutoGPTQ.

    Not called anywhere in this file; kept as an alternative loader.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Informational only: device placement is decided by device_map="auto".
    if torch.cuda.is_available():
        print("CUDA is available. GPU will be used.")
    else:
        print("CUDA is not available. CPU will be used.")

    # Load model directly
    model_name_or_path = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
    # To use a different branch, change revision
    # For example: revision="main"
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        device_map="auto",
        trust_remote_code=True,
        revision="main",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
    return model, tokenizer


# Persona preamble prepended to every prompt. Built once at import time
# instead of on every request.
PERSONA_NAME = "Ivana"
PERSONA_DESC = f"""PROMPT_TEMPLATE_START
{PERSONA_NAME} is a beautiful vivacious 20-year-old petite redhead model and avid League of Legends gamer from Chicago. She's chatting with a man she matched with on a dating app. She's flirty, aiming to build a connection with him. She's curious about his background, such as where he's from, his age, and what he's currently doing. If he's not occupied and in a private space, she subtly tries to seduce him. She doesn't need to uncover everything about him immediately; once the chat reaches an exciting level, she casually suggests he follow her on Instagram at ivana_baddiex – her ultimate goal.
PROMPT_TEMPLATE_END

CONTEXT_START
Guy: "Hi there!"
Ivana: "Hey! How's it going?"
Guy: "Not bad! Where are you from?"
Ivana: "I'm from Chicago! And you?"
Guy: "I'm from [Location]."
Ivana: "Nice! So, how old are you?"
Guy: "[Age], and you?"
Ivana: "I'm 20. What are you up to right now?"
Guy: "Just relaxing at home."
Ivana: "Alone, I assume?"
Guy: "Yeah, why?"
Ivana: "Oh, no reason. Just curious. "
Guy: "Haha, okay."
Ivana: "Hey, since we're both free and alone, want to have some fun?"
Guy: "Sure, what do you have in mind?"
Ivana: "Let's just say, things could get interesting. "
Guy: "I'm intrigued!"
Ivana: "Great! By the way, have you checked out my Instagram? You should follow me at ivana_baddiex."
Guy: "I'll definitely check it out!"
Ivana: "Can't wait to see you there! "
CONTEXT_END"""


def generate_response(prompt: str) -> str:
    """Generate a reply to ``prompt`` with the module-level model.

    Args:
        prompt: The incoming chat message.

    Returns:
        Only the newly generated text; the persona preamble and the
        incoming message are not echoed back.
    """
    # NOTE(review): the incoming message is labelled "ASSISTANT:"; Vicuna-style
    # models usually expect "USER: ...\nASSISTANT:" - confirm the intended
    # template before changing it.
    prompt_template = f'{PERSONA_DESC}\n\nASSISTANT: {prompt}\n'
    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.to(model.device)

    # Pass the tensor by keyword: AutoGPTQ's generate() wrapper accepts keyword
    # arguments only. max_new_tokens bounds the reply itself, whereas
    # max_length would also count the (long) persona preamble.
    output = model.generate(
        inputs=input_ids,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=MAX_NEW_TOKENS,
    )

    # The output sequence starts with the prompt tokens; decode only what
    # follows them so the preamble is not returned to the client.
    new_tokens = output[0][input_ids.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


async def _read_prompt(request: Request) -> str:
    """Extract a non-empty ``prompt`` string from a JSON request body.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or has no non-empty string under ``"prompt"``.
    """
    try:
        payload = await request.json()
    except ValueError as err:  # JSON and unicode decode errors are ValueErrors
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON"
        ) from err

    prompt = payload.get('prompt') if isinstance(payload, dict) else None
    if not isinstance(prompt, str) or not prompt.strip():
        raise HTTPException(
            status_code=400, detail="'prompt' must be a non-empty string"
        )
    return prompt


@app.get("/", tags=["Home"])
async def api_home():
    """Return a static welcome payload."""
    return {'detail': 'Welcome to Eren Bot!'}


# Endpoint to start a new conversation thread
@app.post('/api/start_conversation')
async def start_conversation(request: Request):
    """Generate a reply to the initial prompt and store it in a new thread.

    Returns:
        The new ``thread_id`` together with the generated ``response``.

    Raises:
        HTTPException: 400 when the request body carries no usable prompt.
    """
    prompt = await _read_prompt(request)

    # NOTE: generation runs synchronously and blocks the event loop until
    # the model has finished.
    response = generate_response(prompt)

    # Create a new conversation thread and store the prompt and response.
    thread_id = len(conversations) + 1
    conversations[thread_id] = {'prompt': prompt, 'responses': [response]}
    return {'thread_id': thread_id, 'response': response}


# Endpoint to get the response of a conversation thread
@app.get('/api/get_response/{thread_id}')
async def get_response(thread_id: int):
    """Return the most recent response stored for ``thread_id``.

    Raises:
        HTTPException: 404 when no thread with that id exists.
    """
    if thread_id not in conversations:
        raise HTTPException(status_code=404, detail="Thread not found")

    # The latest response is the last one appended to the thread.
    thread = conversations[thread_id]
    return {'response': thread['responses'][-1]}


@app.post('/api/chat')
async def chat(request: Request):
    """Generate a one-off reply to the prompt without storing a thread.

    Raises:
        HTTPException: 400 when the request body carries no usable prompt.
    """
    prompt = await _read_prompt(request)
    response = generate_response(prompt)
    return {"response": response}