Update app.py
app.py
CHANGED
@@ -1,4 +1,5 @@
 import os
+import platform
 from dotenv import load_dotenv
 import torch
 from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
@@ -8,12 +9,23 @@ import time
 import uvicorn
 from fastapi import FastAPI
 import threading
+import logging
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler("training.log"),
+        logging.StreamHandler()
+    ]
+)
 
 # Load the environment variables
 load_dotenv()
 huggingface_token = os.getenv('HUGGINGFACE_TOKEN')
 if huggingface_token is None:
-    raise ValueError("HUGGINGFACE_TOKEN …
+    raise ValueError("HUGGINGFACE_TOKEN not found in the environment variables.")
 
 # Log in to Hugging Face
 login(token=huggingface_token)
@@ -30,43 +42,72 @@ def load_and_train():
     tokenizer = GPT2Tokenizer.from_pretrained(model_name)
     model = GPT2LMHeadModel.from_pretrained(model_name)
 
+    # Assign the pad_token to the eos_token
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # Resize the model embeddings to include the pad_token
+    model.resize_token_embeddings(len(tokenizer))
+
+    # Check the device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    logging.info(f"Training on: {device}")
+
+    # Determine cache_dir
+    if platform.system() == "Linux":
+        cache_dir = '/dev/shm'
+    else:
+        cache_dir = './cache'
+
+    # Create the cache directory if it does not exist
+    os.makedirs(cache_dir, exist_ok=True)
+
     # Try to load the datasets with error handling
     try:
-        dataset_humanizado = load_dataset('daily_dialog', split='train', cache_dir=…
-        dataset_codigo = load_dataset('code_search_net', split='train', cache_dir=…
+        dataset_humanizado = load_dataset('daily_dialog', split='train', cache_dir=cache_dir, trust_remote_code=True)
+        dataset_codigo = load_dataset('code_search_net', split='train', cache_dir=cache_dir, trust_remote_code=True)
     except Exception as e:
-        …
-        # …
+        logging.error(f"Error loading the datasets: {e}")
+        # Try to load an alternative dataset
         time.sleep(60)  # Wait 60 seconds before retrying
         try:
-            dataset_humanizado = load_dataset('alternative_dataset', split='train', cache_dir=…
+            dataset_humanizado = load_dataset('alternative_dataset', split='train', cache_dir=cache_dir, trust_remote_code=True)
+            dataset_codigo = load_dataset('alternative_code_dataset', split='train', cache_dir=cache_dir, trust_remote_code=True)
         except Exception as e:
-            …
+            logging.error(f"Error loading the alternative dataset: {e}")
             return
 
-    …
-    …
+    logging.info("Daily Dialog columns: %s", dataset_humanizado.column_names)
+    logging.info("Code Search Net columns: %s", dataset_codigo.column_names)
 
     # Combine the datasets in memory
    combined_dataset = concatenate_datasets([dataset_humanizado, dataset_codigo])
 
-    …
+    logging.info("Combined dataset columns: %s", combined_dataset.column_names)
 
     # In-RAM tokenization function
     def tokenize_function(examples):
+        text = ""
         if 'dialog' in examples:
-            …
+            text = examples['dialog']
         elif 'docstring' in examples:
-            …
+            text = examples['docstring']
         elif 'code' in examples:
-            …
+            text = examples['code']
+        if text:
+            return tokenizer(text, truncation=True, padding='max_length', max_length=512)
         return {}
 
     # Tokenize and keep everything in RAM
-    tokenized_dataset = combined_dataset.map(…
+    tokenized_dataset = combined_dataset.map(
+        tokenize_function,
+        batched=True,
+        cache_dir=cache_dir
+    )
 
+    # Configure the training arguments
     training_args = TrainingArguments(
-        output_dir='…
+        output_dir=os.path.join(cache_dir, 'results'),  # Store temporarily in RAM
         per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
         num_train_epochs=1,
@@ -78,7 +119,6 @@ def load_and_train():
         warmup_ratio=0.1,
         evaluation_strategy="epoch",
         lr_scheduler_type="linear",
-        save_steps=500,  # Save less frequently to avoid disk writes
         save_strategy="epoch",  # Save only at the end of each epoch
     )
 
@@ -92,14 +132,15 @@ def load_and_train():
     try:
         trainer.train()
         # Upload the model to Hugging Face from RAM
-        model.push_to_hub('Yhhxhfh/nombre_de_tu_modelo', …
-        tokenizer.push_to_hub('Yhhxhfh/nombre_de_tu_modelo', …
-        …
+        model.push_to_hub('Yhhxhfh/nombre_de_tu_modelo', commit_message="Model update")
+        tokenizer.push_to_hub('Yhhxhfh/nombre_de_tu_modelo', commit_message="Tokenizer update")
+        logging.info("Model and tokenizer uploaded successfully.")
+        time.sleep(300)  # Wait 5 minutes before the next iteration
     except Exception as e:
-        …
+        logging.error(f"Error during training: {e}. Restarting the training process...")
         time.sleep(10)
 
 if __name__ == "__main__":
     # Run FastAPI in a separate thread
-    threading.Thread(target=lambda: uvicorn.run(app, host="0.0.0.0", port=7860)).start()
+    threading.Thread(target=lambda: uvicorn.run(app, host="0.0.0.0", port=7860), daemon=True).start()
     load_and_train()
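Note: load_dotenv() reads HUGGINGFACE_TOKEN from a .env file next to app.py. A minimal sketch of that file (the token value is a placeholder):

# .env (keep this file out of version control)
HUGGINGFACE_TOKEN=hf_your_token_here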
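A caveat on the combine step: concatenate_datasets expects both datasets to share the same features, and daily_dialog and code_search_net ship different column sets, so the call as committed is likely to raise. A minimal sketch of one way to align them first, assuming 'dialog' holds a list of utterances and 'whole_func_string' is the code column (check the column names logged above):

# Sketch: reduce each dataset to a single shared 'text' column
# before concatenating; the column names here are assumptions.
dataset_humanizado = dataset_humanizado.map(
    lambda ex: {'text': ' '.join(ex['dialog'])},
    remove_columns=dataset_humanizado.column_names,
)
dataset_codigo = dataset_codigo.map(
    lambda ex: {'text': ex['whole_func_string']},
    remove_columns=dataset_codigo.column_names,
)
combined_dataset = concatenate_datasets([dataset_humanizado, dataset_codigo])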
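Also worth noting: with batched=True the mapping function receives lists, so examples['dialog'] is a batch of dialogs (each itself a list of utterances in daily_dialog), and "if text:" tests a list rather than a string. A batch-aware sketch, under the same assumed column names:

def tokenize_function(examples):
    # Each value is a batch (a list); join dialogs into flat strings.
    if 'dialog' in examples:
        texts = [' '.join(d) for d in examples['dialog']]
    elif 'docstring' in examples:
        texts = list(examples['docstring'])
    elif 'code' in examples:
        texts = list(examples['code'])
    else:
        return {}
    return tokenizer(texts, truncation=True, padding='max_length', max_length=512)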
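One correction worth flagging: Dataset.map does not take a cache_dir keyword (that argument belongs to load_dataset), so the call as committed should raise a TypeError. Given the keep-everything-in-RAM intent, keep_in_memory=True is the matching map option; a minimal sketch:

# Sketch: map() has no cache_dir parameter; keep_in_memory=True
# keeps the tokenized copy in RAM instead of writing Arrow cache files.
tokenized_dataset = combined_dataset.map(
    tokenize_function,
    batched=True,
    keep_in_memory=True,
)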
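The daemon=True flag means the uvicorn thread no longer keeps the process alive on its own: once load_and_train() returns, or the main thread is stopped, the interpreter can exit instead of hanging on the server thread.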