zeroMN-zeroSG / config.yaml
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
# Load the training datasets
datasets = [
"Johnson8187/Chinese_Multi-Emotion_Dialogue_Dataset",
"clapAI/MultiLingualSentiment",
"shareAI/ShareGPT-Chinese-English-90k",
"wikimedia/wikipedia",
"google/code_x_glue_tt_text_to_text",
"silk-road/ChatHaruhi-54K-Role-Playing-Dialogue",
"yentinglin/TaiwanChat",
"liswei/rm-static-zhTW",
"yys/OpenOrca-Chinese",
"Fumika/Wikinews-multilingual",
"aqweteddy/Taiwan-Curlture-MCQ",
"Nexdata/Chinese_Mandarin_Multi-emotional_Synthesis_Corpus",
"Nexdata/Chinese_Mandarin_Entertainment_anchor_Style_Multi-emotional_Synthesis_Corpus",
"voices365/102_Hours_High_Quality_Chinese_Audio_Dataset_For_Speech_Synthesis_Female_Samples",
"voices365/Chinese_Female_001VoiceArtist_40Hours_High_Quality_Voice_Dataset",
"Nexdata/Mandarin_Spontaneous_Speech_Data",
"speechbrain/common_language",
"hello2mao/Chinese_Audio_Resource"
]
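# Hedged note: several corpora above are audio/speech datasets that do not
# share a common "text" column, and some require an explicit config name
# when loaded. A sketch for such a case (the snapshot name "20231101.zh"
# is an assumption, not part of the original script):
#
#     wiki = load_dataset("wikimedia/wikipedia", "20231101.zh", split="train")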
# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("zeroMN/zeroSG")
tokenizer = AutoTokenizer.from_pretrained("zeroMN/zeroSG")
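# Hedged note: the size of the classification head comes from the
# zeroMN/zeroSG checkpoint itself. If a different number of labels is
# needed, it can be set explicitly; the values below are illustrative,
# not from the original script:
#
#     model = AutoModelForSequenceClassification.from_pretrained(
#         "zeroMN/zeroSG", num_labels=2, ignore_mismatched_sizes=True
#     )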
# Build the dataset and data loader
class MyDataset(Dataset):
    def __init__(self, datasets):
        self.datasets = datasets
        self.data = []
        for dataset in datasets:
            data = load_dataset(dataset)
            self.data.extend(data["train"])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Assumes each example exposes a "text" field; not all listed datasets do.
        text = self.data[idx]["text"]
        inputs = tokenizer(
            text,
            add_special_tokens=True,
            max_length=512,
            truncation=True,          # truncate long texts to max_length
            padding='max_length',     # pad so batches stack into fixed-size tensors
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(0)  # placeholder label; replace with real per-dataset labels
        }
dataset = MyDataset(datasets)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)
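# Optional sanity check (a minimal sketch, assuming the pipeline above):
# confirm one batch stacks into fixed-size (batch_size, 512) tensors
# before committing to a full training run.
sample = next(iter(data_loader))
print(sample['input_ids'].shape, sample['attention_mask'].shape, sample['labels'].shape)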
# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# No separate criterion is needed: the model computes cross-entropy
# internally when `labels` are passed to the forward call.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
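# Hedged alternative: AdamW (decoupled weight decay) is the more common
# choice for transformer fine-tuning; the weight_decay value is illustrative:
#
#     optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)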
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss  # cross-entropy computed by the classification head
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}')
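# Follow-up sketch (not in the original script): persist the fine-tuned
# weights and tokenizer so they can be reloaded later with from_pretrained().
# The output directory name is an assumption.
model.save_pretrained('./zeroSG-finetuned')
tokenizer.save_pretrained('./zeroSG-finetuned')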