import os

import torch
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
# Loading GPTQ-quantized checkpoints through transformers requires the
# optimum and auto-gptq packages to be installed.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Let the CUDA allocator grow segments instead of fragmenting memory, and cap
# this process at 80% of GPU memory. Only touch CUDA if it is actually available.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.set_per_process_memory_fraction(0.8)  # Adjust the fraction as needed

app = FastAPI(title="Deploying FastAPI Apps on Huggingface")
app.add_middleware(CORSMiddleware, allow_origins=['*'], allow_methods=['*'], allow_headers=['*'])

# GPTQ-quantized model repository to load
model_name_or_path = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
# In-memory store of conversation threads and their context
conversations = {}


def load_model_norm():
    """Load the GPTQ model and tokenizer, letting accelerate place weights via device_map."""
    if torch.cuda.is_available():
        print("CUDA is available. GPU will be used.")
    else:
        print("CUDA is not available. CPU will be used.")
    # To use a different branch of the repository, change `revision` (for example: revision="main").
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        device_map="auto",
        trust_remote_code=True,
        revision="gptq-4bit-128g-actorder_True",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    return model, tokenizer


model, tokenizer = load_model_norm()


# Function to generate a response using the model
def generate_response(prompt: str) -> str:
    PERSONA_NAME = "Ivana"
    PERSONA_DESC = f"""PROMPT_TEMPLATE_START
                    {PERSONA_NAME} is a beautiful vivacious 20-year-old petite redhead model and avid League of Legends gamer from Chicago. She's chatting with a man she matched with on a dating app. She's flirty, aiming to build a connection with him. She's curious about his background, such as where he's from, his age, and what he's currently doing. If he's not occupied and in a private space, she subtly tries to seduce him. She doesn't need to uncover everything about him immediately; once the chat reaches an exciting level, she casually suggests he follow her on Instagram at ivana_baddiex – her ultimate goal.
                    PROMPT_TEMPLATE_END
                    CONTEXT_START
                    Guy: "Hi there!"
                    Ivana: "Hey! How's it going?"
                    Guy: "Not bad! Where are you from?"
                    Ivana: "I'm from Chicago! And you?"
                    Guy: "I'm from [Location]."
                    Ivana: "Nice! So, how old are you?"
                    Guy: "[Age], and you?"
                    Ivana: "I'm 20.  What are you up to right now?"
                    Guy: "Just relaxing at home."
                    Ivana: "Alone, I assume?"
                    Guy: "Yeah, why?"
                    Ivana: "Oh, no reason. Just curious. "
                    Guy: "Haha, okay."
                    Ivana: "Hey, since we're both free and alone, want to have some fun?"
                    Guy: "Sure, what do you have in mind?"
                    Ivana: "Let's just say, things could get interesting. "
                    Guy: "I'm intrigued!"
                    Ivana: "Great! By the way, have you checked out my Instagram? You should follow me at ivana_baddiex."
                    Guy: "I'll definitely check it out!"
                    Ivana: "Can't wait to see you there! "
                    CONTEXT_END"""

    # Vicuna-style turn format: the user's message follows "USER:" and the model
    # completes the text after "ASSISTANT:".
    prompt_template = f'{PERSONA_DESC}\n\nUSER: {prompt}\nASSISTANT:'

    # Build a text-generation pipeline for this request (constructing it once at
    # startup would avoid the per-request overhead).
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1,
    )
    # Return only the newly generated text rather than echoing the full prompt.
    generated_text = pipe(prompt_template, return_full_text=False)[0]['generated_text']

    return generated_text.strip()


@app.get("/", tags=["Home"])
async def api_home():
    return {'detail': 'Welcome to Eren Bot!'}


# Endpoint to start a new conversation thread
@app.post('/api/start_conversation/')
async def start_conversation(request: Request):
    data = await request.json()
    prompt = data.get('prompt')
    if not prompt:
        raise HTTPException(status_code=400, detail="Prompt is required")

    # Generate a response for the initial prompt
    response = generate_response(prompt)

    # Create a new conversation thread and store the prompt and response
    thread_id = len(conversations) + 1
    conversations[thread_id] = {'prompt': prompt, 'responses': [response]}

    return {'thread_id': thread_id, 'response': response}


# Endpoint to get the response of a conversation thread
@app.get('/api/get_response/{thread_id}')
async def get_response(thread_id: int):
    if thread_id not in conversations:
        raise HTTPException(status_code=404, detail="Thread not found")

    # Retrieve the conversation thread
    thread = conversations[thread_id]

    # Get the latest response in the conversation
    response = thread['responses'][-1]

    return {'response': response}
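

# Example requests against the conversation endpoints (illustrative only; assumes
# the server is reachable at http://localhost:7860 -- adjust host/port to your deployment):
#
#   curl -X POST http://localhost:7860/api/start_conversation/ \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Hi there!"}'
#   # -> {"thread_id": 1, "response": "..."}
#
#   curl http://localhost:7860/api/get_response/1
#   # -> {"response": "..."}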




@app.post('/api/chat/')
async def chat(request: Request):
    data = await request.json()
    prompt = data.get('prompt')
    if not prompt:
        raise HTTPException(status_code=400, detail="Prompt is required")

    # Generate a response based on the prompt
    response = generate_response(prompt)

    return {"response": response}