import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
# Load the datasets from the Hugging Face Hub
datasets = [
    "Johnson8187/Chinese_Multi-Emotion_Dialogue_Dataset",
    "clapAI/MultiLingualSentiment",
    "shareAI/ShareGPT-Chinese-English-90k",
    "wikimedia/wikipedia",
    "google/code_x_glue_tt_text_to_text",
    "silk-road/ChatHaruhi-54K-Role-Playing-Dialogue",
    "yentinglin/TaiwanChat",
    "liswei/rm-static-zhTW",
    "yys/OpenOrca-Chinese",
    "Fumika/Wikinews-multilingual",
    "aqweteddy/Taiwan-Curlture-MCQ",
    "Nexdata/Chinese_Mandarin_Multi-emotional_Synthesis_Corpus",
    "Nexdata/Chinese_Mandarin_Entertainment_anchor_Style_Multi-emotional_Synthesis_Corpus",
    "voices365/102_Hours_High_Quality_Chinese_Audio_Dataset_For_Speech_Synthesis_Female_Samples",
    "voices365/Chinese_Female_001VoiceArtist_40Hours_High_Quality_Voice_Dataset",
    "Nexdata/Mandarin_Spontaneous_Speech_Data",
    "speechbrain/common_language",
    "hello2mao/Chinese_Audio_Resource",
]
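# Note: several of the corpora above (the Nexdata, voices365, speechbrain, and
# hello2mao entries) are audio/speech datasets that do not expose a plain "text"
# column, so the Dataset class below will not work on them unchanged. Filter the
# list down to text corpora, or map each dataset's fields to a common "text"
# column, before pooling them.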
# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("zeroMN/zeroSG")
tokenizer = AutoTokenizer.from_pretrained("zeroMN/zeroSG")
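# Assumes the zeroMN/zeroSG checkpoint ships a sequence-classification head whose
# num_labels matches your label space; otherwise pass num_labels=... to
# from_pretrained to re-initialize the head for your task.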
# Wrap the pooled datasets in a PyTorch Dataset
class MyDataset(Dataset):
    def __init__(self, dataset_names):
        self.data = []
        for name in dataset_names:
            # Load each dataset and pool its training split into one flat list.
            data = load_dataset(name)
            self.data.extend(data["train"])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Assumes every pooled example exposes a "text" field.
        text = self.data[idx]["text"]
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding="max_length",  # fixed length so the default collate_fn can stack batches
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "labels": torch.tensor(0),  # placeholder; replace with real labels
        }
dataset = MyDataset(datasets)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)
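# Each batch yielded by data_loader is a dict of stacked tensors:
# input_ids and attention_mask of shape (32, 512), labels of shape (32,).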
# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        # The forward pass returns a SequenceClassifierOutput; compute the loss on its logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(data_loader):.4f}")
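Once training finishes, the fine-tuned weights and tokenizer can be persisted and reloaded for inference. A minimal sketch follows; the output directory ./zeroSG-finetuned and the sample sentence are illustrative placeholders, not part of the original script:

# Persist the fine-tuned model and tokenizer.
model.save_pretrained("./zeroSG-finetuned")
tokenizer.save_pretrained("./zeroSG-finetuned")

# Quick inference check on a single sentence.
model.eval()
with torch.no_grad():
    enc = tokenizer("今天心情很好", truncation=True, max_length=512, return_tensors="pt").to(device)
    logits = model(**enc).logits
    print("Predicted label id:", logits.argmax(dim=-1).item())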