File size: 8,524 Bytes
9883ddb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from langchain import LLMChain
from langchain.llms import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import uvicorn
from dotenv import load_dotenv
import io
import requests
import asyncio
import time
# Load environment variables.
load_dotenv()
# Initialize the FastAPI application.
app = FastAPI()
# Model configuration.
# Each row is (Hugging Face repo id, GGUF filename inside that repo, display name).
_MODEL_TABLE = (
    ("Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "gpt2-xl-q2_k.gguf", "GPT-2 XL"),
    ("Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "meta-llama-3.1-8b-instruct-q2_k.gguf", "Meta Llama 3.1-8B Instruct"),
    ("Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "gemma-2-9b-it-q2_k.gguf", "Gemma 2-9B IT"),
    ("Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "gemma-2-27b-q2_k.gguf", "Gemma 2-27B"),
    ("Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "phi-3-mini-128k-instruct-q2_k.gguf", "Phi-3 Mini 128K Instruct"),
    ("Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "meta-llama-3.1-8b-q2_k.gguf", "Meta Llama 3.1-8B"),
    ("Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "qwen2-7b-instruct-q2_k.gguf", "Qwen2 7B Instruct"),
    ("Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "starcoder2-3b-q2_k.gguf", "Starcoder2 3B"),
    ("Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "qwen2-1.5b-instruct-q2_k.gguf", "Qwen2 1.5B Instruct"),
    ("Ffftdtd5dtft/starcoder2-15b-Q2_K-GGUF", "starcoder2-15b-q2_k.gguf", "Starcoder2 15B"),
    ("Ffftdtd5dtft/gemma-2-2b-it-Q2_K-GGUF", "gemma-2-2b-it-q2_k.gguf", "Gemma 2-2B IT"),
    ("Ffftdtd5dtft/sarvam-2b-v0.5-Q2_K-GGUF", "sarvam-2b-v0.5-q2_k.gguf", "Sarvam 2B v0.5"),
    ("Ffftdtd5dtft/WizardLM-13B-Uncensored-Q2_K-GGUF", "wizardlm-13b-uncensored-q2_k.gguf", "WizardLM 13B Uncensored"),
    ("Ffftdtd5dtft/Qwen2-Math-72B-Instruct-Q2_K-GGUF", "qwen2-math-72b-instruct-q2_k.gguf", "Qwen2 Math 72B Instruct"),
    ("Ffftdtd5dtft/WizardLM-7B-Uncensored-Q2_K-GGUF", "wizardlm-7b-uncensored-q2_k.gguf", "WizardLM 7B Uncensored"),
    ("Ffftdtd5dtft/Qwen2-Math-7B-Instruct-Q2_K-GGUF", "qwen2-math-7b-instruct-q2_k.gguf", "Qwen2 Math 7B Instruct"),
)
# Expand the table into the list of dicts the rest of the module consumes.
model_configs = [
    {"repo_id": repo_id, "filename": filename, "name": name}
    for repo_id, filename, name in _MODEL_TABLE
]
# Class that manages the models.
class ModelManager:
    """Download the configured model files and keep the loaded models in memory."""

    def __init__(self):
        # Successfully loaded models, one entry per model:
        # {"model_data": {...}, "name": <display name>}.
        self.models = []
        # Initialised empty; nothing in this class writes to it.
        self.configs = {}

    async def download_model_to_memory(self, model_config):
        """Download one model file into an in-memory buffer.

        Args:
            model_config: Mapping with the keys ``repo_id``, ``filename``
                and ``name``.

        Returns:
            An ``io.BytesIO`` wrapping the downloaded bytes.

        Raises:
            Exception: If the server answers with a status other than 200.
        """
        print(f"Descargando modelo: {model_config['name']}...")
        url = f"https://huggingface.co/{model_config['repo_id']}/resolve/main/{model_config['filename']}"
        # requests.get blocks; running it on the default executor keeps the
        # event loop free so the downloads started by load_all_models overlap.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(None, requests.get, url)
        if response.status_code != 200:
            raise Exception(f"Error al descargar el modelo: {response.status_code}")
        return io.BytesIO(response.content)

    @staticmethod
    async def _simulate_part_load(part):
        """Simulate loading one part of a model with a short delay."""
        await asyncio.sleep(0.1)

    async def load_model(self, model_config):
        """Download and load one model, then record it in ``self.models``.

        Failures are reported on stdout and swallowed so that one broken
        model does not abort the loading of the others.

        Args:
            model_config: Mapping with the keys ``repo_id``, ``filename``
                and ``name``.
        """
        try:
            start_time = time.time()
            model_file = await self.download_model_to_memory(model_config)
            print(f"Cargando modelo: {model_config['name']}...")

            # Simulated split load: only taken when more than one second has
            # already elapsed since the download started.
            if time.time() - start_time > 1:
                print(f"Modelo {model_config['name']} tard贸 m谩s de 1 segundo en cargarse, dividiendo la carga...")
                await asyncio.gather(*(self._simulate_part_load(part) for part in range(5)))

            # Load exactly once, off the event loop.
            # NOTE(review): from_pretrained receives an in-memory BytesIO here;
            # confirm the imported Llama class accepts a file object.
            loop = asyncio.get_running_loop()
            model = await loop.run_in_executor(
                None,
                lambda: Llama.from_pretrained(model_file),
            )
            tokenizer = model.tokenizer

            # Keep the tokenizer and its special tokens in RAM next to the model.
            model_data = {
                'model': model,
                'tokenizer': tokenizer,
                'pad_token': tokenizer.pad_token,
                'pad_token_id': tokenizer.pad_token_id,
                'eos_token': tokenizer.eos_token,
                'eos_token_id': tokenizer.eos_token_id,
                'bos_token': tokenizer.bos_token,
                'bos_token_id': tokenizer.bos_token_id,
                'unk_token': tokenizer.unk_token,
                'unk_token_id': tokenizer.unk_token_id,
            }
            self.models.append({"model_data": model_data, "name": model_config['name']})
        except Exception as e:
            # Deliberately best-effort: report the failure and carry on.
            print(f"Error al cargar el modelo: {e}")

    async def load_all_models(self):
        """Load every entry of ``model_configs`` concurrently and report the elapsed time."""
        print("Iniciando carga de modelos...")
        start_time = time.time()
        tasks = [self.load_model(config) for config in model_configs]
        await asyncio.gather(*tasks)
        end_time = time.time()
        print(f"Todos los modelos han sido cargados en {end_time - start_time:.2f} segundos.")
# Instantiate ModelManager; the models themselves are loaded on application startup.
model_manager = ModelManager()
@app.on_event("startup")
async def startup_event():
    # Startup hook: awaits the loading of all configured models.
    await model_manager.load_all_models()
# Request model for the chat endpoint: the user message plus sampling
# parameters (top_k, top_p, temperature) with their defaults.
class ChatRequest(BaseModel):
    message: str
    top_k: int = 50
    top_p: float = 0.95
    temperature: float = 0.7
# Token limit for responses. It is passed as max_length when generating and is
# also used as the chunk length (in characters) by split_long_response.
TOKEN_LIMIT = 1000 # Limit allowed per response
# Function that generates a chat response with one loaded model.
async def generate_chat_response(request, model_data):
    """Generate a reply to ``request.message`` with a single loaded model.

    Args:
        request: Object exposing ``message``, ``top_k``, ``top_p`` and
            ``temperature``.
        model_data: Entry of the loaded-model list, with the keys ``name``
            and ``model_data`` (whose ``model`` value is the callable used
            for generation).

    Returns:
        A dict with the keys ``response``, ``literal`` and ``model_name``.
        If anything fails, ``response`` holds a fixed error message instead
        of generated text.
    """
    # Bind the fallback before entering the try block: the error handler
    # reports ``literal`` and must not hit an unbound name when the
    # normalization step itself is what failed.
    user_input = request.message
    try:
        user_input = normalize_input(request.message)
        llm = model_data['model_data']['model']
        # Generation is blocking, so it runs on the default executor.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: llm(user_input, max_length=TOKEN_LIMIT, do_sample=True, top_k=request.top_k, top_p=request.top_p, temperature=request.temperature),
        )
        generated_text = response['generated_text']
        # Break long responses into chunks.
        split_response = split_long_response(generated_text)
        return {"response": split_response, "literal": user_input, "model_name": model_data['name']}
    except Exception as e:
        # Best-effort: one failing model must not break the whole request.
        print(f"Error al generar la respuesta: {e}")
        return {"response": "Error al generar la respuesta", "literal": user_input, "model_name": model_data['name']}
def split_long_response(response, limit=None):
    """Split *response* into newline-separated chunks of at most *limit* characters.

    The limit is measured in characters, not tokens. Each chunk is stripped
    of leading and trailing whitespace before the chunks are joined.

    Args:
        response: Text to split.
        limit: Maximum chunk length in characters. Defaults to the
            module-level ``TOKEN_LIMIT``.

    Returns:
        The chunks joined with ``'\\n'``; an empty string for empty input.

    Raises:
        ValueError: If *limit* is not positive.
    """
    if limit is None:
        limit = TOKEN_LIMIT
    if limit <= 0:
        # A non-positive chunk size can never consume the text.
        raise ValueError("limit must be a positive integer")
    return '\n'.join(
        response[start:start + limit].strip()
        for start in range(0, len(response), limit)
    )
def remove_duplicates(text):
    """Drop repeated lines from *text*, keeping the first occurrence of each."""
    already_seen = set()
    kept_lines = []
    for line in text.splitlines():
        if line in already_seen:
            continue
        already_seen.add(line)
        kept_lines.append(line)
    return '\n'.join(kept_lines)
def remove_repetitive_responses(responses):
    """Return the responses whose de-duplicated text has not been seen before.

    Each kept entry has its ``'response'`` value replaced, in place, by the
    text returned from ``remove_duplicates``. Entries whose cleaned text
    matches an earlier one are skipped and left unmodified.
    """
    emitted_texts = set()
    kept = []
    for entry in responses:
        cleaned = remove_duplicates(entry['response'])
        if cleaned in emitted_texts:
            continue
        emitted_texts.add(cleaned)
        entry['response'] = cleaned
        kept.append(entry)
    return kept
@app.post("/chat")
async def chat(request: ChatRequest):
    # Query the loaded models one after another, in list order, then drop
    # repeated answers before returning.
    replies = [
        await generate_chat_response(request, loaded_model)
        for loaded_model in model_manager.models
    ]
    return {"results": remove_repetitive_responses(replies)}
# Run the FastAPI application with uvicorn when this file is executed as a script.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
|