zeroMN committed on
Commit 074a4b0 · verified · 1 Parent(s): 60cb95e

Create train.py

Files changed (1)
  1. train.py +87 -0
train.py ADDED
@@ -0,0 +1,87 @@
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader

# Load the datasets
datasets = [
    "Johnson8187/Chinese_Multi-Emotion_Dialogue_Dataset",
    "clapAI/MultiLingualSentiment",
    "shareAI/ShareGPT-Chinese-English-90k",
    "wikimedia/wikipedia",
    "google/code_x_glue_tt_text_to_text",
    "silk-road/ChatHaruhi-54K-Role-Playing-Dialogue",
    "yentinglin/TaiwanChat",
    "liswei/rm-static-zhTW",
    "yys/OpenOrca-Chinese",
    "Fumika/Wikinews-multilingual",
    "aqweteddy/Taiwan-Curlture-MCQ",
    "Nexdata/Chinese_Mandarin_Multi-emotional_Synthesis_Corpus",
    "Nexdata/Chinese_Mandarin_Entertainment_anchor_Style_Multi-emotional_Synthesis_Corpus",
    "voices365/102_Hours_High_Quality_Chinese_Audio_Dataset_For_Speech_Synthesis_Female_Samples",
    "voices365/Chinese_Female_001VoiceArtist_40Hours_High_Quality_Voice_Dataset",
    "Nexdata/Mandarin_Spontaneous_Speech_Data",
    "speechbrain/common_language",
    "hello2mao/Chinese_Audio_Resource"
]
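
# Note: the list above mixes text and audio corpora, and some repos (e.g.
# wikimedia/wikipedia) require an explicit config name when loaded. A hedged
# sketch of a guard that keeps only rows with a text column follows;
# iter_text_rows is a hypothetical helper, not part of the original commit:
#
#     def iter_text_rows(name):
#         data = load_dataset(name)
#         for row in data["train"]:
#             if "text" in row:
#                 yield row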

# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("zeroMN/zeroSG")
tokenizer = AutoTokenizer.from_pretrained("zeroMN/zeroSG")
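
# If the checkpoint lacks a classification head sized for this task,
# from_pretrained also accepts num_labels; the value below is an assumption,
# not taken from the original script:
#
#     model = AutoModelForSequenceClassification.from_pretrained(
#         "zeroMN/zeroSG", num_labels=2
#     )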

# Create the combined dataset and data loader
class MyDataset(Dataset):
    def __init__(self, datasets):
        self.datasets = datasets
        self.data = []
        for dataset in datasets:
            data = load_dataset(dataset)
            self.data.extend(data["train"])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["text"]
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',  # pad to a fixed length so the default collate can stack examples
            truncation=True,       # truncate longer texts to max_length
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(0)  # placeholder for labels
        }

dataset = MyDataset(datasets)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)
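
# A quick sanity check on one batch (illustrative; the shapes assume the
# batch_size and max_length settings above):
#
#     batch = next(iter(data_loader))
#     print(batch['input_ids'].shape)       # torch.Size([32, 512])
#     print(batch['attention_mask'].shape)  # torch.Size([32, 512])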

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)  # the model returns an output object; the loss takes its logits

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}')
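
# Saving the fine-tuned weights is a natural last step; a minimal sketch
# (the output directory below is an assumption, not from the original
# commit):
#
#     model.save_pretrained('./zeroSG-finetuned')
#     tokenizer.save_pretrained('./zeroSG-finetuned')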