|
import uvicorn |
|
from fastapi import FastAPI, HTTPException, Request |
|
from auto_gptq import AutoGPTQForCausalLM |
|
import os |
|
import torch |
|
import optimum |
|
from transformers import (AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer, GenerationConfig, pipeline,) |
|
from fastapi.middleware.cors import CORSMiddleware |
|
from pyngrok import ngrok |
|
|
|
# Ask PyTorch's CUDA caching allocator to use expandable segments.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Only touch the CUDA runtime when a GPU is actually present.
# set_per_process_memory_fraction() initialises CUDA and raises on a CPU-only
# host, which would abort the import before load_model_norm() gets the chance
# to report its CPU fallback.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # Cap this process at 80% of the device's memory.
    torch.cuda.set_per_process_memory_fraction(0.8)
|
|
|
# ASGI application object; the route handlers below register themselves on it.
app = FastAPI(title="Deploying FastAPI Apps on Huggingface")

# Wildcard CORS policy: every origin, HTTP method and request header is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_methods=['*'],
    allow_headers=['*'],
)
|
|
|
# Take the ngrok token from the environment so the real secret never has to be
# written into this file; the original placeholder remains the fallback, so
# behaviour is unchanged when the variable is not set.
ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN", "Your token here"))

# Stop any ngrok process left over from a previous run before opening a tunnel.
ngrok.kill()

# Open a tunnel to local port 7860 and print its public URL.
ngrok_tunnel = ngrok.connect(7860)
print(ngrok_tunnel.public_url)
|
|
|
|
|
# Hugging Face Hub repository of the quantized model.
model_name_or_path: str = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"

# In-memory conversation store, filled by start_conversation():
# thread_id -> {'prompt': <first prompt>, 'responses': [<generated text>, ...]}.
conversations: dict = dict()

# Device label; not referenced by any other code visible in this file.
Device_Type: str = "cuda"
|
|
|
|
|
def load_model_norm(model_id=None, revision="gptq-4bit-128g-actorder_True"):
    """Load the causal language model and its tokenizer.

    Args:
        model_id: Repository (or local path) to load. When omitted, the
            module-level ``model_name_or_path`` is used, so the model ID is
            defined in a single place instead of being duplicated here.
        revision: Repository revision to load. The same revision is used for
            the tokenizer so both always come from the same snapshot.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    if model_id is None:
        model_id = model_name_or_path

    if torch.cuda.is_available():
        print("CUDA is available. GPU will be used.")
    else:
        print("CUDA is not available. CPU will be used.")

    # NOTE(review): trust_remote_code=True lets the repository run its own
    # Python code while loading; only point this at repositories you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        trust_remote_code=True,
        revision=revision,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        use_fast=True,
        revision=revision,
    )

    return model, tokenizer
|
|
|
|
|
# Load the model and tokenizer once at import time; generate_response() reads
# these module-level names on every call.
model, tokenizer = load_model_norm()
|
|
|
|
|
|
|
|
|
# Shared text-generation pipeline, created on first use by
# _get_generation_pipeline() and reused by every later call.
_generation_pipeline = None


def _get_generation_pipeline():
    """Return the shared text-generation pipeline, building it on first use."""
    global _generation_pipeline
    if _generation_pipeline is None:
        _generation_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            top_k=40,
            repetition_penalty=1.1,
            # Return only the newly generated text, not the prompt followed
            # by the generation.
            return_full_text=False,
        )
    return _generation_pipeline


def _build_prompt(prompt):
    """Return the full model input: the persona/context text plus *prompt*."""
    PERSONA_NAME = "Ivana"
    PERSONA_DESC = f"""PROMPT_TEMPLATE_START
{PERSONA_NAME} is a beautiful vivacious 20-year-old petite redhead model and avid League of Legends gamer from Chicago. She's chatting with a man she matched with on a dating app. She's flirty, aiming to build a connection with him. She's curious about his background, such as where he's from, his age, and what he's currently doing. If he's not occupied and in a private space, she subtly tries to seduce him. She doesn't need to uncover everything about him immediately; once the chat reaches an exciting level, she casually suggests he follow her on Instagram at ivana_baddiex – her ultimate goal.
PROMPT_TEMPLATE_END
CONTEXT_START
Guy: "Hi there!"
Ivana: "Hey! How's it going?"
Guy: "Not bad! Where are you from?"
Ivana: "I'm from Chicago! And you?"
Guy: "I'm from [Location]."
Ivana: "Nice! So, how old are you?"
Guy: "[Age], and you?"
Ivana: "I'm 20. What are you up to right now?"
Guy: "Just relaxing at home."
Ivana: "Alone, I assume?"
Guy: "Yeah, why?"
Ivana: "Oh, no reason. Just curious. "
Guy: "Haha, okay."
Ivana: "Hey, since we're both free and alone, want to have some fun?"
Guy: "Sure, what do you have in mind?"
Ivana: "Let's just say, things could get interesting. "
Guy: "I'm intrigued!"
Ivana: "Great! By the way, have you checked out my Instagram? You should follow me at ivana_baddiex."
Guy: "I'll definitely check it out!"
Ivana: "Can't wait to see you there! "
CONTEXT_END"""

    return f'{PERSONA_DESC}\n\nASSISTANT: {prompt}\n'


def generate_response(prompt: str) -> str:
    """Generate a reply to *prompt* with the module-level model.

    The pipeline is built once and reused instead of being reconstructed on
    every call. It is configured with ``return_full_text=False`` so the
    returned string contains only the generated continuation, not the
    persona/context prompt that precedes it.

    Args:
        prompt: The incoming message to answer.

    Returns:
        The text generated by the model for this prompt.
    """
    prompt_template = _build_prompt(prompt)
    outputs = _get_generation_pipeline()(prompt_template)
    return outputs[0]['generated_text']
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/", tags=["Home"])
async def api_home():
    """Landing route: reply with a static welcome payload."""
    welcome_payload = {'detail': 'Welcome to Eren Bot!'}
    return welcome_payload
|
|
|
|
|
|
|
@app.post('/api/start_conversation/')
async def start_conversation(request: Request):
    """Open a new conversation thread and return the first generated reply.

    Expects a JSON object body with a non-empty string field ``prompt``.

    Returns:
        ``{'thread_id': <int>, 'response': <generated text>}``.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or has no usable ``prompt``.
    """
    try:
        data = await request.json()
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON"
        ) from exc

    # Guard against non-object bodies (e.g. a JSON list) before calling .get().
    prompt = data.get('prompt') if isinstance(data, dict) else None
    if not isinstance(prompt, str) or not prompt.strip():
        raise HTTPException(
            status_code=400, detail="'prompt' must be a non-empty string"
        )

    response = generate_response(prompt)

    # Thread ids are sequential; entries are never removed from the store in
    # this file, so len() + 1 does not collide with an existing id.
    thread_id = len(conversations) + 1
    conversations[thread_id] = {'prompt': prompt, 'responses': [response]}

    return {'thread_id': thread_id, 'response': response}
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get('/api/get_response/{thread_id}')
async def get_response(thread_id: int):
    """Return the most recent stored response for *thread_id*.

    Raises:
        HTTPException: 404 when no conversation exists under that id.
    """
    try:
        stored_thread = conversations[thread_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Thread not found") from None

    latest_reply = stored_thread['responses'][-1]
    return {'response': latest_reply}
|
|
|
|
|
|
|
|
|
@app.post('/api/chat/')
async def chat(request: Request):
    """Generate a single reply without storing a conversation thread.

    Expects a JSON object body with a non-empty string field ``prompt``.

    Returns:
        ``{"response": <generated text>}``.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or has no usable ``prompt``.
    """
    try:
        data = await request.json()
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON"
        ) from exc

    # Guard against non-object bodies (e.g. a JSON list) before calling .get().
    prompt = data.get('prompt') if isinstance(data, dict) else None
    if not isinstance(prompt, str) or not prompt.strip():
        raise HTTPException(
            status_code=400, detail="'prompt' must be a non-empty string"
        )

    response = generate_response(prompt)

    return {"response": response}
|
|
|
|
|
|
|
|