import os

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

# Datasets to pull from the Hugging Face Hub.
# Note: this list mixes text and audio corpora; the Dataset class below
# assumes every record exposes a "text" field and a "train" split,
# so filter the list (or adapt __getitem__) for datasets that do not.
datasets = [
    "Johnson8187/Chinese_Multi-Emotion_Dialogue_Dataset",
    "clapAI/MultiLingualSentiment",
    "shareAI/ShareGPT-Chinese-English-90k",
    "wikimedia/wikipedia",
    "google/code_x_glue_tt_text_to_text",
    "silk-road/ChatHaruhi-54K-Role-Playing-Dialogue",
    "yentinglin/TaiwanChat",
    "liswei/rm-static-zhTW",
    "yys/OpenOrca-Chinese",
    "Fumika/Wikinews-multilingual",
    "aqweteddy/Taiwan-Curlture-MCQ",
    "Nexdata/Chinese_Mandarin_Multi-emotional_Synthesis_Corpus",
    "Nexdata/Chinese_Mandarin_Entertainment_anchor_Style_Multi-emotional_Synthesis_Corpus",
    "voices365/102_Hours_High_Quality_Chinese_Audio_Dataset_For_Speech_Synthesis_Female_Samples",
    "voices365/Chinese_Female_001VoiceArtist_40Hours_High_Quality_Voice_Dataset",
    "Nexdata/Mandarin_Spontaneous_Speech_Data",
    "speechbrain/common_language",
    "hello2mao/Chinese_Audio_Resource"
]

# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("zeroMN/zeroSG")
tokenizer = AutoTokenizer.from_pretrained("zeroMN/zeroSG")

# Build a combined dataset and a data loader
class MyDataset(Dataset):
    def __init__(self, dataset_names):
        self.dataset_names = dataset_names
        self.data = []
        for name in dataset_names:
            data = load_dataset(name)
            self.data.extend(data["train"])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["text"]
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding='max_length',  # fixed-length tensors so the default collate_fn can batch them
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(0)  # placeholder label; replace with real labels for supervised fine-tuning
        }

dataset = MyDataset(datasets)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # The model returns a SequenceClassifierOutput; compute the loss from its logits
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}')
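
# Optional: persist the fine-tuned weights and tokenizer so they can be reloaded
# later with from_pretrained(). Minimal sketch; the output directory name below
# is an assumption, not part of the original script.
output_dir = "./zeroSG-finetuned"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)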