import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
import os

# Datasets to pull from the Hugging Face Hub. Several entries are speech/audio
# corpora and some require an explicit config name; the Dataset class below
# skips datasets that fail to load and examples that lack a "text" column.
datasets = [
    "Johnson8187/Chinese_Multi-Emotion_Dialogue_Dataset",
    "clapAI/MultiLingualSentiment",
    "shareAI/ShareGPT-Chinese-English-90k",
    "wikimedia/wikipedia",
    "google/code_x_glue_tt_text_to_text",
    "silk-road/ChatHaruhi-54K-Role-Playing-Dialogue",
    "yentinglin/TaiwanChat",
    "liswei/rm-static-zhTW",
    "yys/OpenOrca-Chinese",
    "Fumika/Wikinews-multilingual",
    "aqweteddy/Taiwan-Curlture-MCQ",
    "Nexdata/Chinese_Mandarin_Multi-emotional_Synthesis_Corpus",
    "Nexdata/Chinese_Mandarin_Entertainment_anchor_Style_Multi-emotional_Synthesis_Corpus",
    "voices365/102_Hours_High_Quality_Chinese_Audio_Dataset_For_Speech_Synthesis_Female_Samples",
    "voices365/Chinese_Female_001VoiceArtist_40Hours_High_Quality_Voice_Dataset",
    "Nexdata/Mandarin_Spontaneous_Speech_Data",
    "speechbrain/common_language",
    "hello2mao/Chinese_Audio_Resource"
]

# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("zeroMN/zeroSG")
tokenizer = AutoTokenizer.from_pretrained("zeroMN/zeroSG")
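
# Note: the size of the classification head comes from the zeroMN/zeroSG config;
# if your task uses a different label set, pass num_labels=... to from_pretrained above.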

# Build a combined dataset and data loader
class MyDataset(Dataset):
    def __init__(self, datasets):
        self.datasets = datasets
        self.data = []
        for dataset in datasets:
            try:
                data = load_dataset(dataset)
            except Exception as e:
                # Some hub datasets need an explicit config name or authentication;
                # skip those rather than aborting the whole run.
                print(f"Skipping {dataset}: {e}")
                continue
            # Fall back to the first available split if there is no "train" split,
            # and keep only examples that expose a "text" column.
            split = "train" if "train" in data else next(iter(data))
            self.data.extend(ex for ex in data[split] if "text" in ex)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["text"]
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            truncation=True,        # cut sequences longer than 512 tokens
            padding='max_length',   # pad so examples stack into fixed-size batches
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(0)  # placeholder label; replace with real task labels
        }

dataset = MyDataset(datasets)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)
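
# (Optional) sanity check: pull one batch and print tensor shapes before training.
# This is illustrative only and can be removed.
sample_batch = next(iter(data_loader))
print(sample_batch['input_ids'].shape, sample_batch['attention_mask'].shape)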

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # The model returns a SequenceClassifierOutput; compute the loss from
        # its logits rather than from the output object itself.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader):.4f}')
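
# (Optional) persist the fine-tuned weights and tokenizer. The output directory
# name below is an arbitrary example, not part of the original script.
output_dir = "./zeroSG-finetuned"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)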