Hjgugugjhuhjggg committed on
Commit 62287b7 · verified · 1 Parent(s): f660b72

Create app.py

Files changed (1)
  1. app.py +183 -0
app.py ADDED
@@ -0,0 +1,183 @@
+ from pydantic import BaseModel
+ from llama_cpp import Llama  # the GGUF Llama loader lives in llama_cpp, not llama_cpp_agent
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ import re
+ import httpx
+ import asyncio
+ import gradio as gr
+ import os
+ from dotenv import load_dotenv
+ from fastapi import FastAPI, Request
+ from fastapi.responses import JSONResponse
+ import uvicorn
+ from threading import Thread
+
+ load_dotenv()
+
+ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+
+ global_data = {
+     'models': {},
+     'tokens': {
+         'eos': 'eos_token',
+         'pad': 'pad_token',
+         'padding': 'padding_token',
+         'unk': 'unk_token',
+         'bos': 'bos_token',
+         'sep': 'sep_token',
+         'cls': 'cls_token',
+         'mask': 'mask_token'
+     },
+     'model_metadata': {},
+     'max_tokens': 256,
+     'tokenizers': {},
+     'model_params': {},
+     'model_size': {},
+     'model_ftype': {},
+     'n_ctx_train': {},
+     'n_embd': {},
+     'n_layer': {},
+     'n_head': {},
+     'n_head_kv': {},
+     'n_rot': {},
+     'n_swa': {},
+     'n_embd_head_k': {},
+     'n_embd_head_v': {},
+     'n_gqa': {},
+     'n_embd_k_gqa': {},
+     'n_embd_v_gqa': {},
+     'f_norm_eps': {},
+     'f_norm_rms_eps': {},
+     'f_clamp_kqv': {},
+     'f_max_alibi_bias': {},
+     'f_logit_scale': {},
+     'n_ff': {},
+     'n_expert': {},
+     'n_expert_used': {},
+     'causal_attn': {},
+     'pooling_type': {},
+     'rope_type': {},
+     'rope_scaling': {},
+     'freq_base_train': {},
+     'freq_scale_train': {},
+     'n_ctx_orig_yarn': {},
+     'rope_finetuned': {},
+     'ssm_d_conv': {},
+     'ssm_d_inner': {},
+     'ssm_d_state': {},
+     'ssm_dt_rank': {},
+     'ssm_dt_b_c_rms': {},
+     'vocab_type': {},
+     'model_type': {}
+ }
+
+ model_configs = [
+     {"repo_id": "Hjgugugjhuhjggg/testing_semifinal-Q2_K-GGUF", "filename": "testing_semifinal-q2_k.gguf", "name": "testing"}
+ ]
+
+ class ModelManager:
+     def __init__(self):
+         self.models = {}
+
+     def load_model(self, model_config):
+         if model_config['name'] not in self.models:
+             try:
+                 self.models[model_config['name']] = Llama.from_pretrained(
+                     repo_id=model_config['repo_id'],
+                     filename=model_config['filename'],
+                     use_auth_token=HUGGINGFACE_TOKEN,
+                     n_threads=8,
+                     n_gpu_layers=0  # CPU only; llama-cpp-python has no use_gpu flag
+                 )
+             except Exception as e:
+                 print(f"Failed to load model {model_config['name']}: {e}")
+
+     def load_all_models(self):
+         with ThreadPoolExecutor() as executor:
+             for config in model_configs:
+                 executor.submit(self.load_model, config)
+         return self.models
+
+ model_manager = ModelManager()
+ global_data['models'] = model_manager.load_all_models()
+
+ class ChatRequest(BaseModel):
+     message: str
+
+ def normalize_input(input_text):
+     return input_text.strip()
+
+ def remove_duplicates(text):
+     lines = text.split('\n')
+     unique_lines = []
+     seen_lines = set()
+     for line in lines:
+         if line not in seen_lines:
+             unique_lines.append(line)
+             seen_lines.add(line)
+     return '\n'.join(unique_lines)
+
+ _response_cache = {}
+
+ def cache_response(func):
+     # Simple in-memory memoization; the original called gptcache.get/gptcache.set,
+     # which are not part of gptcache's public top-level API.
+     def wrapper(*args, **kwargs):
+         cache_key = f"{args}-{kwargs}"
+         if cache_key in _response_cache:
+             return _response_cache[cache_key]
+         response = func(*args, **kwargs)
+         _response_cache[cache_key] = response
+         return response
+     return wrapper
+
+ @cache_response
+ def generate_model_response(model, inputs):
+     try:
+         # Honor the completion limit configured in global_data.
+         response = model(inputs, max_tokens=global_data['max_tokens'])
+         return remove_duplicates(response['choices'][0]['text'])
+     except Exception as e:
+         print(f"Inference error: {e}")
+         return ""
+
+ def remove_repetitive_responses(responses):
+     unique_responses = {}
+     for response in responses:
+         if response['model'] not in unique_responses:
+             unique_responses[response['model']] = response['response']
+     return unique_responses
+
+ async def process_message(message):
+     inputs = normalize_input(message)
+     with ThreadPoolExecutor() as executor:
+         futures = [
+             executor.submit(generate_model_response, model, inputs)
+             for model in global_data['models'].values()
+         ]
+         # Pair each model name with its own future; zipping names against
+         # as_completed() would mismatch names and results.
+         responses = [
+             {'model': model_name, 'response': future.result()}
+             for model_name, future in zip(global_data['models'].keys(), futures)
+         ]
+     unique_responses = remove_repetitive_responses(responses)
+     formatted_response = ""
+     for model, response in unique_responses.items():
+         formatted_response += f"**{model}:**\n{response}\n\n"
+     return formatted_response
+
+ app = FastAPI()
+
+ @app.post("/generate")
+ async def generate(request: ChatRequest):
+     response = await process_message(request.message)
+     return JSONResponse(content={"response": response})
+
+ def run_uvicorn():
+     uvicorn.run(app, host="0.0.0.0", port=7860)
+
+ iface = gr.Interface(
+     fn=process_message,
+     inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
+     outputs=gr.Markdown(),
+     title="Multi-Model LLM API (CPU Optimized)",
+     description="Enter a message and get responses from multiple LLMs using CPU."
+ )
+
+ def run_gradio():
+     # Port 7860 is already bound by uvicorn above, so Gradio gets its own port.
+     iface.launch(server_port=7861, prevent_thread_lock=True)
+
+ if __name__ == "__main__":
+     Thread(target=run_uvicorn).start()
+     Thread(target=run_gradio).start()