Vitrous committed on
Commit
2ee547c
·
verified ·
1 Parent(s): 26ff497

Upload 2 files

Browse files

The app.py file contains the logic and all the server-side code that exposes the model's inference endpoints.
The requirements.txt file lists all the dependencies of the Python project.

Files changed (2) hide show
  1. app.py +126 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uvicorn
2
+ from fastapi import FastAPI, HTTPException, Request
3
+ from auto_gptq import AutoGPTQForCausalLM
4
+ import torch
5
+ import optimum
6
+ from transformers import (AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer, GenerationConfig, pipeline,)
7
+
8
# Decide the compute device once so that the startup message and
# ``Device_Type`` always agree (previously ``Device_Type`` was hard-coded to
# "cuda" even when the check below reported that only the CPU is available).
Device_Type = "cuda" if torch.cuda.is_available() else "cpu"

if Device_Type == "cuda":
    print("CUDA is available. GPU will be used.")
else:
    print("CUDA is not available. CPU will be used.")

# Location of the model files.
# NOTE(review): presumably the directory holding the quantized weights and
# tokenizer; the same path is passed to load_quantized_model further down.
model_name_or_path = "/kaggle/input/vicuna/"

# In-memory store of conversation threads and their context, keyed by
# thread id. Contents are lost when the process exits.
conversations = {}
17
+
18
+
19
def load_quantized_model(model_id, model_basename):
    """Load a GPTQ-quantized causal language model and its tokenizer.

    Args:
        model_id: Model location, forwarded unchanged to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoGPTQForCausalLM.from_quantized``.
        model_basename: Name of the weights file. Every occurrence of
            ``".safetensors"`` is stripped before the name is forwarded.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # NOTE(review): per the original author, this is meant for Hugging Face
    # repos whose name ends in GPTQ and that ship .no-act.order or
    # .safetensors weight files -- not verified here.
    print("Using AutoGPTQForCausalLM for quantized models")

    # str.replace is a no-op when the substring is absent, so no guard is
    # needed around it.
    basename = model_basename.replace(".safetensors", "")

    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    print("Tokenizer loaded")

    gptq_options = dict(
        model_basename=basename,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
        use_triton=False,
        quantize_config=None,
    )
    net = AutoGPTQForCausalLM.from_quantized(model_id, **gptq_options)

    return net, tok
34
+
35
+
36
# Load the quantized model and its tokenizer once, at import time, so that
# every request handler shares the same instances. The path comes from
# ``model_name_or_path`` instead of a second copy of the same literal, so the
# two can no longer drift apart.
model, tokenizer = load_quantized_model(
    model_id=model_name_or_path,
    model_basename="model.safetensors",
)
38
+
39
+
40
# The application object must exist before the route decorators below are
# evaluated (they run at import time); defining it after them raises
# NameError as soon as the module is loaded.
app = FastAPI()


@app.get("/")
async def read_root():
    """Return the static welcome message."""
    return {"message": "Welcome to Eren Bot!"}


# Endpoint to start a new conversation thread
@app.post('/start_conversation')
async def start_conversation(request: Request):
    """Start a new conversation thread from a JSON body containing ``prompt``.

    Args:
        request: Incoming request whose JSON body is an object with a
            string ``prompt`` field.

    Returns:
        A dict with the new ``thread_id`` and the generated ``response``.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or ``prompt`` is missing, not a string, or blank.
    """
    try:
        data = await request.json()
    except ValueError as err:  # json.JSONDecodeError subclasses ValueError
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON"
        ) from err

    prompt = data.get('prompt') if isinstance(data, dict) else None
    if not isinstance(prompt, str) or not prompt.strip():
        raise HTTPException(
            status_code=400, detail="'prompt' must be a non-empty string"
        )

    # Generate a response for the initial prompt.
    # NOTE: generate_response is synchronous, so the event loop is blocked
    # while the model is generating.
    response = generate_response(prompt)

    # Create a new conversation thread and store the prompt and response.
    # Threads are never removed, so len() + 1 yields a fresh id.
    thread_id = len(conversations) + 1
    conversations[thread_id] = {'prompt': prompt, 'responses': [response]}

    return {'thread_id': thread_id, 'response': response}


# Endpoint to get the response of a conversation thread
@app.get('/get_response/{thread_id}')
async def get_response(thread_id: int):
    """Return the latest stored response of a conversation thread.

    Args:
        thread_id: Id returned by ``/start_conversation``.

    Returns:
        A dict with the most recent ``response`` of the thread.

    Raises:
        HTTPException: 404 when no thread with ``thread_id`` exists.
    """
    if thread_id not in conversations:
        raise HTTPException(status_code=404, detail="Thread not found")

    # Retrieve the conversation thread
    thread = conversations[thread_id]

    # Get the latest response in the conversation
    response = thread['responses'][-1]

    return {'response': response}


# Function to generate a response using the model
def generate_response(prompt: str) -> str:
    """Generate the model's reply to ``prompt`` using the persona template.

    Args:
        prompt: Text inserted after ``ASSISTANT:`` at the end of the
            persona template.

    Returns:
        The newly generated text only; the persona template that was fed to
        the model is not echoed back, and special tokens are dropped.
    """
    PERSONA_NAME = "Ivana"
    PERSONA_DESC = f"""PROMPT_TEMPLATE_START
{PERSONA_NAME} is a beautiful vivacious 20-year-old petite redhead model and avid League of Legends gamer from Chicago. She's chatting with a man she matched with on a dating app. She's flirty, aiming to build a connection with him. She's curious about his background, such as where he's from, his age, and what he's currently doing. If he's not occupied and in a private space, she subtly tries to seduce him. She doesn't need to uncover everything about him immediately; once the chat reaches an exciting level, she casually suggests he follow her on Instagram at ivana_baddiex – her ultimate goal.
PROMPT_TEMPLATE_END

CONTEXT_START
Guy: "Hi there!"
Ivana: "Hey! How's it going?"
Guy: "Not bad! Where are you from?"
Ivana: "I'm from Chicago! And you?"
Guy: "I'm from [Location]."
Ivana: "Nice! So, how old are you?"
Guy: "[Age], and you?"
Ivana: "I'm 20. What are you up to right now?"
Guy: "Just relaxing at home."
Ivana: "Alone, I assume?"
Guy: "Yeah, why?"
Ivana: "Oh, no reason. Just curious. "
Guy: "Haha, okay."
Ivana: "Hey, since we're both free and alone, want to have some fun?"
Guy: "Sure, what do you have in mind?"
Ivana: "Let's just say, things could get interesting. "
Guy: "I'm intrigued!"
Ivana: "Great! By the way, have you checked out my Instagram? You should follow me at ivana_baddiex."
Guy: "I'll definitely check it out!"
Ivana: "Can't wait to see you there! "
CONTEXT_END"""

    # The tokenizer expects ``str`` input, so the template must not be
    # encoded to bytes before it is tokenized.
    prompt_template = f'{PERSONA_DESC}\n\nASSISTANT: {prompt}\n'

    # Mirror the startup CUDA check instead of calling ``.cuda()``
    # unconditionally, which fails on a CPU-only host.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.to(device)

    output = model.generate(
        inputs=input_ids,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=512,
    )

    # The generated sequence starts with the prompt tokens; keep only what
    # the model added so the persona template is not returned to the caller.
    new_tokens = output[0][input_ids.shape[-1]:]
    generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return generated_text


# Run the FastAPI app
async def run_app():
    """Serve ``app`` on 0.0.0.0:8000 from within the running event loop.

    ``uvicorn.run`` is a blocking call that starts its own event loop and
    returns ``None``, so it can be neither awaited nor called from inside
    ``asyncio.run``. ``uvicorn.Server.serve`` is the awaitable equivalent.
    """
    config = uvicorn.Config(app, host="0.0.0.0", port=8000)
    server = uvicorn.Server(config)
    await server.serve()


if __name__ == '__main__':
    import asyncio

    asyncio.run(run_app())
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ torch~=2.2.0
2
+ langchain~=0.1.6
3
+ constants~=0.6.0
4
+ fastapi~=0.109.2
5
+ pydantic~=2.6.1
6
+ uvicorn~=0.27.1
7
+ einops~=0.7.0
8
+ optimum~=1.16.2
9
+ transformers~=4.38.0.dev0