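# Hugging Face Space script: fine-tunes GPT-2 on a mix of dialogue, code, and
# coding-prompt datasets, repeatedly pushes the result to the Hub, and serves
# a small FastAPI health endpoint while training runs.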
import os
from dotenv import load_dotenv
import torch
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import login
import time
import uvicorn
from fastapi import FastAPI
import threading
import spaces

load_dotenv()
login(token=os.getenv('HUGGINGFACE_TOKEN'))

app = FastAPI()

@app.get("/")
async def root():
    return {"message": "Model trained and running."}

# Request a GPU allocation when running on Hugging Face ZeroGPU hardware.
@spaces.GPU
def load_and_train():
    model_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # GPT-2 has no pad token by default; reuse the EOS token so padding works.
    tokenizer.pad_token = tokenizer.eos_token
    model = GPT2LMHeadModel.from_pretrained(model_name)

    dataset_humanizado = load_dataset('daily_dialog', split='train', trust_remote_code=True)
    # code_search_net needs a language config; 'python' is used here.
    dataset_codigo = load_dataset('code_search_net', 'python', split='train', trust_remote_code=True)
    # openai_humaneval only publishes a 'test' split.
    dataset_prompts = load_dataset('openai_humaneval', split='test', trust_remote_code=True)

    # The three datasets expose different columns, so map each one onto a single
    # 'text' column before concatenating (column names taken from each dataset card).
    dataset_humanizado = dataset_humanizado.map(lambda x: {'text': ' '.join(x['dialog'])}, remove_columns=dataset_humanizado.column_names)
    dataset_codigo = dataset_codigo.map(lambda x: {'text': x['whole_func_string']}, remove_columns=dataset_codigo.column_names)
    dataset_prompts = dataset_prompts.map(lambda x: {'text': x['prompt'] + x['canonical_solution']}, remove_columns=dataset_prompts.column_names)

    combined_dataset = concatenate_datasets([
        dataset_humanizado,
        dataset_codigo,
        dataset_prompts
    ])

    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

    tokenized_dataset = combined_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

    training_args = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=8,   # 100 is far too large for GPT-2 at 512 tokens on a single GPU
        per_device_eval_batch_size=8,
        num_train_epochs=1,              # 0 epochs would skip training entirely
        learning_rate=1e-5,
        logging_steps=500,               # must be a positive integer
        max_grad_norm=1.0,
        save_total_limit=1,
        seed=42,
        weight_decay=0.0,
        warmup_ratio=0.0,
        evaluation_strategy="no",
        optim="adamw_torch",
        lr_scheduler_type="constant",
    )

    # Causal-LM collator builds the labels from input_ids (mlm=False).
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    while True:
        try:
            trainer.train()
            model.push_to_hub('Yhhxhfh/nombre_de_tu_modelo', use_temp_dir=True, commit_message="Model update")
            tokenizer.push_to_hub('Yhhxhfh/nombre_de_tu_modelo', use_temp_dir=True, commit_message="Tokenizer update")
            time.sleep(5)
        except Exception as e:
            print(f"Error during training: {e}. Restarting the training loop...")
            time.sleep(10)

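# Run the API server in a background thread; the training loop blocks the main thread.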
if __name__ == "__main__":
    threading.Thread(target=lambda: uvicorn.run(app, host="0.0.0.0", port=7860)).start()
    load_and_train()