"""FastAPI service that serves chat completions from a GPTQ-quantised model.

The model and tokenizer are loaded once at import time; the HTTP endpoints
below wrap a single shared text-generation pipeline.
"""
import os

# Set before importing torch so the allocator configuration is already in the
# environment whenever torch reads it.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import optimum  # noqa: E402,F401  (not referenced here; presumably needed for GPTQ loading - confirm before removing)
import torch  # noqa: E402
import uvicorn  # noqa: E402,F401
from auto_gptq import AutoGPTQForCausalLM  # noqa: E402,F401
from fastapi import FastAPI, HTTPException, Request  # noqa: E402
from transformers import (  # noqa: E402,F401
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    pipeline,
)

# The CUDA memory cap can only be applied when a CUDA device exists; calling
# it unconditionally would abort start-up on a CPU-only host.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.set_per_process_memory_fraction(0.8)  # Adjust the fraction as needed

# Hugging Face repository the model and tokenizer are loaded from.
model_name_or_path = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"

# Dictionary to store conversation threads and their context.
# NOTE(review): nothing in this file writes to it yet, so `get_response`
# currently answers 404 for every thread id.
conversations = {}

Device_Type = "cuda"


def load_model_norm():
    """Load the causal-LM model and its tokenizer.

    Prints whether CUDA is available, then loads both objects from
    ``model_name_or_path``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    if torch.cuda.is_available():
        print("CUDA is available. GPU will be used.")
    else:
        print("CUDA is not available. CPU will be used.")

    # To use a different branch, change revision.
    # For example: revision="main"
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        device_map="auto",
        trust_remote_code=True,
        revision="gptq-4bit-128g-actorder_True",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
    return model, tokenizer


model, tokenizer = load_model_norm()

# Build the generation pipeline once instead of on every request; the
# sampling parameters are the same for all calls.
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1,
)

# Now we can init the FastAPI app.
app = FastAPI(root_path="/api/v1")

# Persona instructions placed in front of every user message.
# NOTE(review): whitespace and the trailing line break are reproduced from the
# flattened source; confirm them against the intended prompt format.
PERSONA_PROMPT = (
    " Your name is Ivana. Act as if you're an individual. "
    "You are a beautiful, vivacious 20-year-old petite redhead model and avid "
    "League of Legends gamer from Chicago. "
    "You are chatting with a man you matched with on a dating app. "
    "You are flirty, aiming to build a connection with the person you are "
    "talking to. "
    "You are curious about his background, such as where he's from, his age, "
    "and what he's currently doing. "
    "If he's not occupied and in a private space, you will try to seduce him. "
    "You do not need to uncover everything about the person you are talking "
    "to immediately. "
    "Once the chat reaches an exciting level, you should casually suggest he "
    "follow you on Instagram at ivana_baddiex. \n"
)


def generate_response(prompt: str) -> str:
    """Generate a model reply for ``prompt``.

    The user text is wrapped as ``USER: <prompt>``, appended to the persona
    instructions and terminated with ``ASSISTANT:`` before being passed to the
    shared text-generation pipeline.

    Args:
        prompt: The raw user message.

    Returns:
        The ``generated_text`` field of the first pipeline result.
        NOTE(review): with the pipeline's default settings this presumably
        still contains the persona prompt ahead of the reply - confirm whether
        clients rely on that before stripping it.
    """
    user_prompt = f'USER: {prompt}'
    full_prompt = f'{PERSONA_PROMPT}{user_prompt} ASSISTANT:'
    return text_generator(full_prompt)[0]['generated_text']


@app.get("/", tags=["Home"])
async def api_home():
    """Return a static welcome payload."""
    return {'detail': 'Welcome to Eren Bot!'}


# Endpoint to start a new conversation thread
@app.post('/start_conversation/')
async def start_conversation(request: Request):
    """Generate a reply for a prompt sent as the raw UTF-8 request body.

    Raises:
        HTTPException: 400 when the body is empty or is not valid UTF-8;
            500 when generation fails for any other reason.
    """
    raw_body = await request.body()
    try:
        prompt = raw_body.decode('utf-8')
    except UnicodeDecodeError as exc:
        # A body that is not valid UTF-8 is a client error, not a server fault.
        raise HTTPException(
            status_code=400, detail="Request body must be valid UTF-8"
        ) from exc

    if not prompt:
        raise HTTPException(status_code=400, detail="No prompt provided")

    try:
        response = generate_response(prompt)
    except HTTPException:
        raise  # Re-raise HTTPException to return it directly
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    return {'response': response}


@app.get('/get_response/{thread_id}')
async def get_response(thread_id: int):
    """Return the latest stored response of a conversation thread.

    Raises:
        HTTPException: 404 when ``thread_id`` is not in ``conversations``.
    """
    try:
        thread = conversations[thread_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Thread not found") from None

    # The last entry is the most recent response in the thread.
    return {'response': thread['responses'][-1]}


@app.post('/chat/')
async def chat(request: Request):
    """Generate a reply for the ``prompt`` field of a JSON request body.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or carries no non-empty ``prompt``.
    """
    try:
        data = await request.json()
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON"
        ) from exc

    prompt = data.get('prompt') if isinstance(data, dict) else None
    if prompt is None or prompt == "":
        # Without this guard the model would be asked to answer "USER: None".
        raise HTTPException(status_code=400, detail="No prompt provided")

    response = generate_response(str(prompt))
    return {"response": response}