Yjhhh committed on
Commit 3e1e0dc
1 Parent(s): c0c79d0

Update app.py

Files changed (1)
  1. app.py +133 -145
app.py CHANGED
@@ -1,20 +1,15 @@
- from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
- from langchain import LLMChain
- from langchain.llms import LlamaCpp
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from tqdm import tqdm
  import uvicorn
- from dotenv import load_dotenv
- import io
  import requests
  import asyncio
  import time

- # Load environment variables
- load_dotenv()
-
- # Initialize FastAPI application
  app = FastAPI()

  # Model configuration
@@ -32,153 +27,146 @@ model_configs = [
      {"repo_id": "Ffftdtd5dtft/gemma-2-2b-it-Q2_K-GGUF", "filename": "gemma-2-2b-it-q2_k.gguf", "name": "Gemma 2-2B IT"},
      {"repo_id": "Ffftdtd5dtft/sarvam-2b-v0.5-Q2_K-GGUF", "filename": "sarvam-2b-v0.5-q2_k.gguf", "name": "Sarvam 2B v0.5"},
      {"repo_id": "Ffftdtd5dtft/WizardLM-13B-Uncensored-Q2_K-GGUF", "filename": "wizardlm-13b-uncensored-q2_k.gguf", "name": "WizardLM 13B Uncensored"},
      {"repo_id": "Ffftdtd5dtft/WizardLM-7B-Uncensored-Q2_K-GGUF", "filename": "wizardlm-7b-uncensored-q2_k.gguf", "name": "WizardLM 7B Uncensored"},
      {"repo_id": "Ffftdtd5dtft/Qwen2-Math-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-math-7b-instruct-q2_k.gguf", "name": "Qwen2 Math 7B Instruct"}
  ]

- # Class to manage the models
  class ModelManager:
      def __init__(self):
-         self.models = []
-         self.configs = {}

      async def download_model_to_memory(self, model_config):
-         print(f"Descargando modelo: {model_config['name']}...")
          url = f"https://huggingface.co/{model_config['repo_id']}/resolve/main/{model_config['filename']}"
-         response = requests.get(url)
-         if response.status_code == 200:
-             model_file = io.BytesIO(response.content)
-             return model_file
-         else:
-             raise Exception(f"Error al descargar el modelo: {response.status_code}")
-
-     async def load_model(self, model_config):
          try:
              start_time = time.time()
-             model_file = await self.download_model_to_memory(model_config)
-             print(f"Cargando modelo: {model_config['name']}...")
-
-             # Simulated load splitting if loading takes longer than 1 second
-             async def load_part(part):
-                 # This function simulates loading one part of the model
-                 await asyncio.sleep(0.1)  # Simulates a small loading delay
-
-             # The load is split into parts if it exceeds 1 second
-             if time.time() - start_time > 1:
-                 print(f"Modelo {model_config['name']} tardó más de 1 segundo en cargarse, dividiendo la carga...")
-                 await asyncio.gather(*(load_part(part) for part in range(5)))  # Simulated split into 5 parts
-             else:
-                 model = await asyncio.get_event_loop().run_in_executor(
-                     None,
-                     lambda: Llama.from_pretrained(model_file)
-                 )
-
-             model = await asyncio.get_event_loop().run_in_executor(
-                 None,
-                 lambda: Llama.from_pretrained(model_file)
-             )
-             tokenizer = model.tokenizer
-
-             # Store the tokens and tokenizer in RAM
-             model_data = {
-                 'model': model,
-                 'tokenizer': tokenizer,
-                 'pad_token': tokenizer.pad_token,
-                 'pad_token_id': tokenizer.pad_token_id,
-                 'eos_token': tokenizer.eos_token,
-                 'eos_token_id': tokenizer.eos_token_id,
-                 'bos_token': tokenizer.bos_token,
-                 'bos_token_id': tokenizer.bos_token_id,
-                 'unk_token': tokenizer.unk_token,
-                 'unk_token_id': tokenizer.unk_token_id
-             }
-
-             self.models.append({"model_data": model_data, "name": model_config['name']})
-         except Exception as e:
-             print(f"Error al cargar el modelo: {e}")
-
-     async def load_all_models(self):
-         print("Iniciando carga de modelos...")
-         start_time = time.time()
-         tasks = [self.load_model(config) for config in model_configs]
-         await asyncio.gather(*tasks)
-         end_time = time.time()
-         print(f"Todos los modelos han sido cargados en {end_time - start_time:.2f} segundos.")
-
- # Instantiate the ModelManager and load the models
- model_manager = ModelManager()

- @app.on_event("startup")
- async def startup_event():
-     await model_manager.load_all_models()
-
- # Global model for the chat request
- class ChatRequest(BaseModel):
-     message: str
-     top_k: int = 50
-     top_p: float = 0.95
-     temperature: float = 0.7
-
- # Token limit for responses
- TOKEN_LIMIT = 1000  # Defines the token limit allowed per response

- # Function to generate chat responses
- async def generate_chat_response(request, model_data):
      try:
-         user_input = normalize_input(request.message)
-         llm = model_data['model_data']['model']
-         tokenizer = model_data['model_data']['tokenizer']
-
-         # Generate the response quickly
-         response = await asyncio.get_event_loop().run_in_executor(
-             None,
-             lambda: llm(user_input, max_length=TOKEN_LIMIT, do_sample=True, top_k=request.top_k, top_p=request.top_p, temperature=request.temperature)
-         )
-         generated_text = response['generated_text']
-         # Split a long response
-         split_response = split_long_response(generated_text)
-         return {"response": split_response, "literal": user_input, "model_name": model_data['name']}
      except Exception as e:
-         print(f"Error al generar la respuesta: {e}")
-         return {"response": "Error al generar la respuesta", "literal": user_input, "model_name": model_data['name']}
-
- def split_long_response(response):
-     """ Splits the response into smaller parts if it exceeds the token limit. """
-     parts = []
-     while len(response) > TOKEN_LIMIT:
-         part = response[:TOKEN_LIMIT]
-         response = response[TOKEN_LIMIT:]
-         parts.append(part.strip())
-     if response:
-         parts.append(response.strip())
-     return '\n'.join(parts)
-
- def remove_duplicates(text):
-     """ Removes duplicate lines from the text. """
-     lines = text.splitlines()
-     unique_lines = list(dict.fromkeys(lines))
-     return '\n'.join(unique_lines)
-
- def remove_repetitive_responses(responses):
-     unique_responses = []
-     seen_responses = set()
-     for response in responses:
-         normalized_response = remove_duplicates(response['response'])
-         if normalized_response not in seen_responses:
-             seen_responses.add(normalized_response)
-             response['response'] = normalized_response
-             unique_responses.append(response)
-     return unique_responses
-
- @app.post("/chat")
- async def chat(request: ChatRequest):
-     results = []
-     for model_data in model_manager.models:
-         response = await generate_chat_response(request, model_data)
-         results.append(response)
-     unique_results = remove_repetitive_responses(results)
-     return {"results": unique_results}
-
- # Run the FastAPI application
- if __name__ == "__main__":
      uvicorn.run(app, host="0.0.0.0", port=7860)
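Note on the removed version above: split_long_response slices by characters even though TOKEN_LIMIT is described as a token limit, and generate_chat_response calls normalize_input and Llama, neither of which is defined or imported in that file. A minimal sketch of a genuinely token-based split follows, assuming llama-cpp-python's Llama.tokenize/detokenize methods; chunk_by_tokens is a hypothetical helper, not code from this commit.

# Sketch only: token-based response splitting with llama-cpp-python.
from llama_cpp import Llama

TOKEN_LIMIT = 1000  # same limit as the original, but counted in tokens

def chunk_by_tokens(llm: Llama, text: str, limit: int = TOKEN_LIMIT) -> str:
    """Split text into chunks of at most `limit` tokens and rejoin them with newlines."""
    tokens = llm.tokenize(text.encode("utf-8"), add_bos=False)
    parts = []
    for i in range(0, len(tokens), limit):
        chunk = tokens[i:i + limit]
        # detokenize returns bytes; chunk boundaries may split multi-byte characters
        parts.append(llm.detokenize(chunk).decode("utf-8", errors="ignore").strip())
    return "\n".join(parts)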
+ from fastapi import FastAPI, HTTPException, Request
  from pydantic import BaseModel
  import uvicorn
  import requests
  import asyncio
+ import os
+ import io
  import time
+ from typing import List, Dict, Any
+ from llama_cpp import Llama  # Adjust according to the library you are using
+ from tqdm import tqdm

  app = FastAPI()

  # Model configuration

      {"repo_id": "Ffftdtd5dtft/gemma-2-2b-it-Q2_K-GGUF", "filename": "gemma-2-2b-it-q2_k.gguf", "name": "Gemma 2-2B IT"},
      {"repo_id": "Ffftdtd5dtft/sarvam-2b-v0.5-Q2_K-GGUF", "filename": "sarvam-2b-v0.5-q2_k.gguf", "name": "Sarvam 2B v0.5"},
      {"repo_id": "Ffftdtd5dtft/WizardLM-13B-Uncensored-Q2_K-GGUF", "filename": "wizardlm-13b-uncensored-q2_k.gguf", "name": "WizardLM 13B Uncensored"},
+     {"repo_id": "Ffftdtd5dtft/Qwen2-Math-72B-Instruct-Q2_K-GGUF", "filename": "qwen2-math-72b-instruct-q2_k.gguf", "name": "Qwen2 Math 72B Instruct"},
      {"repo_id": "Ffftdtd5dtft/WizardLM-7B-Uncensored-Q2_K-GGUF", "filename": "wizardlm-7b-uncensored-q2_k.gguf", "name": "WizardLM 7B Uncensored"},
      {"repo_id": "Ffftdtd5dtft/Qwen2-Math-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-math-7b-instruct-q2_k.gguf", "name": "Qwen2 Math 7B Instruct"}
  ]

  class ModelManager:
      def __init__(self):
+         self.models = {}
+         self.model_parts = {}
+         self.load_lock = asyncio.Lock()
+         self.index_lock = asyncio.Lock()
+         self.part_size = 1024 * 1024  # Size of each part in bytes (1 MB)

      async def download_model_to_memory(self, model_config):
          url = f"https://huggingface.co/{model_config['repo_id']}/resolve/main/{model_config['filename']}"
+         print(f"Descargando modelo desde {url}")
          try:
              start_time = time.time()
+             response = requests.get(url)
+             response.raise_for_status()
+             end_time = time.time()
+             download_duration = end_time - start_time
+             print(f"Descarga completa para {model_config['name']} en {download_duration:.2f} segundos")
+             return io.BytesIO(response.content)
+         except requests.RequestException as e:
+             raise HTTPException(status_code=500, detail=f"Error al descargar el modelo: {e}")
+
+     async def save_model_to_temp_file(self, model_config):
+         model_file = await self.download_model_to_memory(model_config)
+         temp_filename = f"/tmp/{model_config['filename']}"
+         print(f"Guardando el modelo en {temp_filename}")
+         with open(temp_filename, 'wb') as f:
+             f.write(model_file.getvalue())
+         print(f"Modelo guardado en {temp_filename}")
+         return temp_filename

+     async def load_model(self, model_config):
+         async with self.load_lock:
+             try:
+                 temp_filename = await self.save_model_to_temp_file(model_config)
+                 start_time = time.time()
+                 print(f"Cargando modelo desde {temp_filename}")
+                 llama = Llama(temp_filename)  # Adjust according to the correct library and class
+                 end_time = time.time()
+                 load_duration = end_time - start_time
+                 if load_duration > 0:
+                     print(f"Modelo {model_config['name']} tardó {load_duration:.2f} segundos en cargar, dividiendo automáticamente")
+                     await self.handle_large_model(temp_filename, model_config)
+                 else:
+                     print(f"Modelo {model_config['name']} cargado correctamente en {load_duration:.2f} segundos")
+
+                 tokenizer = llama.tokenizer
+                 model_data = {
+                     'model': llama,
+                     'tokenizer': tokenizer,
+                     'pad_token': tokenizer.pad_token,
+                     'pad_token_id': tokenizer.pad_token_id,
+                     'eos_token': tokenizer.eos_token,
+                     'eos_token_id': tokenizer.eos_token_id,
+                     'bos_token': tokenizer.bos_token,
+                     'bos_token_id': tokenizer.bos_token_id,
+                     'unk_token': tokenizer.unk_token,
+                     'unk_token_id': tokenizer.unk_token_id
+                 }
+
+                 self.models[model_config['name']] = model_data
+             except Exception as e:
+                 print(f"Error al cargar el modelo: {e}")
+
+     async def handle_large_model(self, model_filename, model_config):
+         total_size = os.path.getsize(model_filename)
+         num_parts = (total_size + self.part_size - 1) // self.part_size
+
+         print(f"Modelo {model_config['name']} dividido en {num_parts} partes")
+         with open(model_filename, 'rb') as file:
+             for i in tqdm(range(num_parts), desc=f"Indexando {model_config['name']}"):
+                 start = i * self.part_size
+                 end = min(start + self.part_size, total_size)
+                 file.seek(start)
+                 model_part = io.BytesIO(file.read(end - start))
+                 await self.index_model_part(model_part, i)
+
+     async def index_model_part(self, model_part, part_index):
+         async with self.index_lock:
+             part_name = f"part_{part_index}"
+             print(f"Indexando parte {part_index}")
+             llama_part = Llama(model_part)
+             self.model_parts[part_name] = llama_part
+             print(f"Parte {part_index} indexada")
+
+     async def generate_response(self, user_input):
+         results = []
+         for model_name, model_data in self.models.items():
+             print(f"Generando respuesta con el modelo {model_name}")
+             try:
+                 tokenizer = model_data['tokenizer']
+                 input_ids = tokenizer(user_input, return_tensors="pt").input_ids
+                 outputs = model_data['model'].generate(input_ids)
+                 generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+                 # Split the generated text into parts
+                 parts = []
+                 while len(generated_text) > 1000:
+                     part = generated_text[:1000]
+                     parts.append(part)
+                     generated_text = generated_text[1000:]
+                 parts.append(generated_text)
+
+                 results.append({
+                     'model_name': model_name,
+                     'generated_text_parts': parts
+                 })
+             except Exception as e:
+                 print(f"Error al generar respuesta con el modelo {model_name}: {e}")
+                 results.append({'model_name': model_name, 'error': str(e)})
+
+         return results
+
+ @app.post("/generate/")
+ async def generate(request: Request):
+     data = await request.json()
+     user_input = data.get('input', '')
+     if not user_input:
+         raise HTTPException(status_code=400, detail="Se requiere una entrada de usuario.")

      try:
+         model_manager = ModelManager()
+         tasks = [model_manager.load_model(config) for config in model_configs]
+         await asyncio.gather(*tasks)
+         responses = await model_manager.generate_response(user_input)
+         return {"responses": responses}
      except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ def start_uvicorn():
      uvicorn.run(app, host="0.0.0.0", port=7860)
+
+ if __name__ == "__main__":
+     loop = asyncio.get_event_loop()
+     model_manager = ModelManager()
+     tasks = [model_manager.load_model(config) for config in model_configs]
+     loop.run_until_complete(asyncio.gather(*tasks))
+     start_uvicorn()
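Note on the added version: it still mixes two APIs. llama_cpp.Llama loads from a file path rather than a BytesIO part, and it does not expose a transformers-style tokenizer with pad_token/eos_token attributes or a generate(input_ids) call of that shape. A minimal sketch of downloading one of the listed GGUF files and prompting it with huggingface_hub and llama-cpp-python follows; run_model, the n_ctx value, and the sampling parameters are illustrative assumptions, not code from this commit.

# Sketch only: fetch one GGUF file and query it with llama-cpp-python.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

def run_model(repo_id: str, filename: str, prompt: str) -> str:
    # hf_hub_download caches the file locally and returns its path
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    llm = Llama(model_path=model_path, n_ctx=2048)  # context size is an assumption
    # llama-cpp-python returns an OpenAI-style completion dict
    output = llm(prompt, max_tokens=256, top_k=50, top_p=0.95, temperature=0.7)
    return output["choices"][0]["text"]

if __name__ == "__main__":
    text = run_model(
        "Ffftdtd5dtft/gemma-2-2b-it-Q2_K-GGUF",
        "gemma-2-2b-it-q2_k.gguf",
        "Hello, how are you?",
    )
    print(text)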