Sakalti committed on
Commit
d7c629e
1 Parent(s): 3ddf304

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -112
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  import pandas as pd
3
- from datasets import Dataset
4
- from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
5
  import torch
6
  import os
7
  import matplotlib.pyplot as plt
@@ -12,12 +12,14 @@ from datetime import datetime
12
  # Variables globales pour stocker les colonnes détectées
13
  columns = []
14
 
 
 
15
 
16
- # Fonction pour lire le fichier et détecter les colonnes
17
  def read_file(data_file):
18
  global columns
19
  try:
20
- # Charger les données
21
  file_extension = os.path.splitext(data_file.name)[1]
22
  if file_extension == '.csv':
23
  df = pd.read_csv(data_file.name)
@@ -26,30 +28,30 @@ def read_file(data_file):
26
  elif file_extension == '.xlsx':
27
  df = pd.read_excel(data_file.name)
28
  else:
29
- return "Invalid file format. Please upload a CSV, JSON, or Excel file."
30
 
31
- # Détecter les colonnes
32
  columns = df.columns.tolist()
33
  return columns
34
  except Exception as e:
35
- return f"An error occurred: {str(e)}"
36
 
37
 
38
- # Fonction pour valider les colonnes sélectionnées
39
  def validate_columns(prompt_col, description_col):
40
  if prompt_col not in columns or description_col not in columns:
41
  return False
42
  return True
43
 
44
 
45
- # Fonction pour entraîner le modèle
46
- def train_model(data_file, model_name, epochs, batch_size, learning_rate, output_dir, prompt_col, description_col):
47
  try:
48
- # Valider les colonnes sélectionnées
49
  if not validate_columns(prompt_col, description_col):
50
- return "Invalid column selection. Please ensure the columns exist in the dataset."
51
 
52
- # Charger les données
53
  file_extension = os.path.splitext(data_file.name)[1]
54
  if file_extension == '.csv':
55
  df = pd.read_csv(data_file.name)
@@ -58,23 +60,23 @@ def train_model(data_file, model_name, epochs, batch_size, learning_rate, output
58
  elif file_extension == '.xlsx':
59
  df = pd.read_excel(data_file.name)
60
 
61
- # Prévisualisation des données
62
  preview = df.head().to_string(index=False)
63
 
64
- # Préparer le texte d'entraînement
65
  df['text'] = df[prompt_col] + ': ' + df[description_col]
66
  dataset = Dataset.from_pandas(df[['text']])
67
 
68
- # Initialiser le tokenizer et le modèle GPT-2
69
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
70
  model = GPT2LMHeadModel.from_pretrained(model_name)
71
 
72
- # Ajouter un token de padding si nécessaire
73
  if tokenizer.pad_token is None:
74
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
75
  model.resize_token_embeddings(len(tokenizer))
76
 
77
- # Tokenizer les données
78
  def tokenize_function(examples):
79
  tokens = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)
80
  tokens['labels'] = tokens['input_ids'].copy()
@@ -82,7 +84,7 @@ def train_model(data_file, model_name, epochs, batch_size, learning_rate, output
82
 
83
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
84
 
85
- # Ajustement des hyperparamètres
86
  training_args = TrainingArguments(
87
  output_dir=output_dir,
88
  overwrite_output_dir=True,
@@ -102,7 +104,7 @@ def train_model(data_file, model_name, epochs, batch_size, learning_rate, output
102
  metric_for_best_model="eval_loss"
103
  )
104
 
105
- # Configuration du Trainer
106
  trainer = Trainer(
107
  model=model,
108
  args=training_args,
@@ -110,15 +112,15 @@ def train_model(data_file, model_name, epochs, batch_size, learning_rate, output
110
  eval_dataset=tokenized_datasets,
111
  )
112
 
113
- # Entraînement et évaluation
114
  trainer.train()
115
  eval_results = trainer.evaluate()
116
 
117
- # Sauvegarder le modèle fine-tuné
118
  model.save_pretrained(output_dir)
119
  tokenizer.save_pretrained(output_dir)
120
 
121
- # Générer un graphique des pertes d'entraînement et de validation
122
  train_loss = [x['loss'] for x in trainer.state.log_history if 'loss' in x]
123
  eval_loss = [x['eval_loss'] for x in trainer.state.log_history if 'eval_loss' in x]
124
  plt.plot(train_loss, label='Training Loss')
@@ -129,102 +131,66 @@ def train_model(data_file, model_name, epochs, batch_size, learning_rate, output
129
  plt.legend()
130
  plt.savefig(os.path.join(output_dir, 'training_eval_loss.png'))
131
 
132
- return f"Training completed successfully.\nPreview of data:\n{preview}", eval_results
 
 
 
133
  except Exception as e:
134
- return f"An error occurred: {str(e)}"
135
 
136
 
137
- # Fonction de génération de texte
138
def generate_text(prompt, temperature, top_k, top_p, max_length, repetition_penalty, use_comma, batch_size):
    """Sample continuations of ``prompt`` from the fine-tuned GPT-2 checkpoint.

    The tokenizer and model are loaded from ``./fine-tuned-gpt2`` on every call.

    Args:
        prompt: Text to continue.
        temperature: Sampling temperature (cast to float).
        top_k: Top-k cutoff (cast to int).
        top_p: Nucleus sampling threshold (cast to float).
        max_length: Maximum total length passed to ``generate`` (cast to int).
        repetition_penalty: Repetition penalty (cast to float).
        use_comma: When truthy, every '.' in the prompt is replaced by ','.
        batch_size: Number of sequences to return (cast to int).

    Returns:
        A list of decoded strings on success, or an error message string if
        anything raises.
    """
    try:
        model_name = "./fine-tuned-gpt2"
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        model = GPT2LMHeadModel.from_pretrained(model_name)

        if use_comma:
            prompt = prompt.replace('.', ',')

        inputs = tokenizer(prompt, return_tensors="pt", padding=True)
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=int(max_length),
            # Sampling must be switched on explicitly: without it decoding is
            # greedy, temperature/top_k/top_p have no effect, and asking for
            # more than one returned sequence is rejected.
            do_sample=True,
            temperature=float(temperature),
            top_k=int(top_k),
            top_p=float(top_p),
            repetition_penalty=float(repetition_penalty),
            num_return_sequences=int(batch_size),
            pad_token_id=tokenizer.eos_token_id,
        )

        return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"An error occurred: {str(e)}"
164
 
165
 
166
- # Fonction pour configurer les presets
167
def set_preset(preset):
    """Return the ``(epochs, batch_size, learning_rate)`` triple for a preset.

    Args:
        preset: One of "Default", "Fast Training" or "High Accuracy".

    Returns:
        A 3-tuple of hyperparameters. A missing or unrecognised preset falls
        back to the "Default" values, so callers that bind the result to
        three outputs always receive three values instead of ``None``.
    """
    presets = {
        "Default": (5, 8, 3e-5),
        "Fast Training": (3, 16, 5e-5),
        "High Accuracy": (10, 4, 1e-5),
    }
    try:
        return presets.get(preset, presets["Default"])
    except TypeError:
        # Unhashable input cannot name a preset; treat it as unrecognised.
        return presets["Default"]
174
-
175
 
176
- # Interface Gradio
177
  with gr.Blocks() as ui:
178
- gr.Markdown("# Fine-Tune GPT-2 UI Design Model")
179
-
180
- with gr.Tab("Train Model"):
181
- with gr.Row():
182
- data_file = gr.File(label="Upload Data File (CSV, JSON, Excel)")
183
- model_name = gr.Textbox(label="Model Name", value="gpt2")
184
- output_dir = gr.Textbox(label="Output Directory", value="./fine-tuned-gpt2")
185
-
186
- with gr.Row():
187
- preset = gr.Radio(["Default", "Fast Training", "High Accuracy"], label="Preset")
188
- epochs = gr.Number(label="Epochs", value=5)
189
- batch_size = gr.Number(label="Batch Size", value=8)
190
- learning_rate = gr.Number(label="Learning Rate", value=3e-5)
191
-
192
- preset.change(set_preset, preset, [epochs, batch_size, learning_rate])
193
-
194
- # Champs pour sélectionner les colonnes
195
- with gr.Row():
196
- prompt_col = gr.Dropdown(label="Prompt Column")
197
- description_col = gr.Dropdown(label="Description Column")
198
-
199
- # Détection des colonnes lors du téléchargement du fichier
200
- data_file.upload(read_file, inputs=data_file, outputs=[prompt_col, description_col])
201
-
202
- train_button = gr.Button("Train Model")
203
- train_output = gr.Textbox(label="Training Output")
204
- train_graph = gr.Image(label="Training and Validation Loss Graph")
205
-
206
- train_button.click(train_model,
207
- inputs=[data_file, model_name, epochs, batch_size, learning_rate, output_dir, prompt_col,
208
- description_col], outputs=[train_output, train_graph])
209
-
210
- with gr.Tab("Generate Text"):
211
- with gr.Row():
212
- with gr.Column():
213
- temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.7)
214
- top_k = gr.Slider(label="Top K", minimum=1, maximum=100, value=50)
215
- top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.9)
216
- max_length = gr.Slider(label="Max Length", minimum=10, maximum=1024, value=128)
217
- repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.2)
218
- use_comma = gr.Checkbox(label="Use Comma", value=True)
219
- batch_size = gr.Number(label="Batch Size", value=1, minimum=1)
220
-
221
- with gr.Column():
222
- prompt = gr.Textbox(label="Prompt")
223
- generate_button = gr.Button("Generate Text")
224
- generated_text = gr.Textbox(label="Generated Text", lines=20)
225
-
226
- generate_button.click(generate_text,
227
- inputs=[prompt, temperature, top_k, top_p, max_length, repetition_penalty, use_comma,
228
- batch_size], outputs=generated_text)
229
-
230
- ui.launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
+ from datasets import Dataset, load_dataset
4
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, HfApi
5
  import torch
6
  import os
7
  import matplotlib.pyplot as plt
 
12
  # Variables globales pour stocker les colonnes détectées
13
  columns = []
14
 
15
+ # Hugging Faceにアクセスするためのアクセストークン
16
+ hf_token = "YOUR_HUGGINGFACE_ACCESS_TOKEN"
17
 
18
+ # ファイル読み込み機能
19
  def read_file(data_file):
20
  global columns
21
  try:
22
+ # データを読み込む
23
  file_extension = os.path.splitext(data_file.name)[1]
24
  if file_extension == '.csv':
25
  df = pd.read_csv(data_file.name)
 
28
  elif file_extension == '.xlsx':
29
  df = pd.read_excel(data_file.name)
30
  else:
31
+ return "無効なファイル形式です。CSVJSON、またはExcelファイルをアップロードしてください。"
32
 
33
+ # 列を検出
34
  columns = df.columns.tolist()
35
  return columns
36
  except Exception as e:
37
+ return f"エラーが発生しました: {str(e)}"
38
 
39
 
40
+ # 列のバリデーション
41
  def validate_columns(prompt_col, description_col):
42
  if prompt_col not in columns or description_col not in columns:
43
  return False
44
  return True
45
 
46
 
47
+ # モデルの訓練
48
+ def train_model(data_file, model_name, epochs, batch_size, learning_rate, output_dir, prompt_col, description_col, hf_token):
49
  try:
50
+ # 列のバリデーション
51
  if not validate_columns(prompt_col, description_col):
52
+ return "選択された列が無効です。データセットに列が存在することを確認してください。"
53
 
54
+ # データの読み込み
55
  file_extension = os.path.splitext(data_file.name)[1]
56
  if file_extension == '.csv':
57
  df = pd.read_csv(data_file.name)
 
60
  elif file_extension == '.xlsx':
61
  df = pd.read_excel(data_file.name)
62
 
63
+ # データのプレビュー
64
  preview = df.head().to_string(index=False)
65
 
66
+ # トレーニングテキストの準備
67
  df['text'] = df[prompt_col] + ': ' + df[description_col]
68
  dataset = Dataset.from_pandas(df[['text']])
69
 
70
+ # GPT-2トークナイザーとモデルの初期化
71
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
72
  model = GPT2LMHeadModel.from_pretrained(model_name)
73
 
74
+ # パディングトークンの追加
75
  if tokenizer.pad_token is None:
76
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
77
  model.resize_token_embeddings(len(tokenizer))
78
 
79
+ # データのトークナイズ
80
  def tokenize_function(examples):
81
  tokens = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)
82
  tokens['labels'] = tokens['input_ids'].copy()
 
84
 
85
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
86
 
87
+ # ハイパーパラメータの設定
88
  training_args = TrainingArguments(
89
  output_dir=output_dir,
90
  overwrite_output_dir=True,
 
104
  metric_for_best_model="eval_loss"
105
  )
106
 
107
+ # Trainerの設定
108
  trainer = Trainer(
109
  model=model,
110
  args=training_args,
 
112
  eval_dataset=tokenized_datasets,
113
  )
114
 
115
+ # 訓練と評価
116
  trainer.train()
117
  eval_results = trainer.evaluate()
118
 
119
+ # Fine-tunedモデルの保存
120
  model.save_pretrained(output_dir)
121
  tokenizer.save_pretrained(output_dir)
122
 
123
+ # トレーニングと評価の損失グラフ生成
124
  train_loss = [x['loss'] for x in trainer.state.log_history if 'loss' in x]
125
  eval_loss = [x['eval_loss'] for x in trainer.state.log_history if 'eval_loss' in x]
126
  plt.plot(train_loss, label='Training Loss')
 
131
  plt.legend()
132
  plt.savefig(os.path.join(output_dir, 'training_eval_loss.png'))
133
 
134
+ # Hugging Faceにアップロード
135
+ upload_response = upload_model_to_huggingface(output_dir, model_name, hf_token)
136
+
137
+ return f"訓練が成功しました。\nデータプレビュー:\n{preview}", eval_results, upload_response
138
  except Exception as e:
139
+ return f"エラーが発生しました: {str(e)}"
140
 
141
 
142
+ # モデルをHugging Faceにアップロード
143
def upload_model_to_huggingface(output_dir, model_name, hf_token):
    """Upload the contents of ``output_dir`` to a Hugging Face Hub repository.

    Args:
        output_dir: Local folder whose files are uploaded.
        model_name: Name of the target repository; it is created when it does
            not exist yet.
        hf_token: Hub access token. An empty value is passed on as ``None`` so
            the client can use its own default credential lookup.

    Returns:
        A status message string (success with the repository URL, or the
        error text). Exceptions are caught and reported, never raised.
    """
    try:
        # NOTE(review): HfApi is documented as part of huggingface_hub --
        # confirm the file-level import actually resolves.
        # Authenticate the client once so that both the repo creation and the
        # upload carry the token; upload_folder takes `token`, not
        # `use_auth_token`.
        api = HfApi(token=hf_token or None)
        repo_url = api.create_repo(model_name, exist_ok=True)  # reuse the repo if it already exists

        # create_repo may place a bare name under the user's namespace; prefer
        # the id it reports so the upload and the URL point at the real repo.
        repo_id = getattr(repo_url, "repo_id", None) or model_name

        api.upload_folder(
            folder_path=output_dir,
            repo_id=repo_id,
            path_in_repo=".",
        )
        return f"モデルがHugging Faceに正常にアップロードされました。\nリポジトリURL: https://huggingface.co/{repo_id}"
    except Exception as e:
        # UI boundary: surface the failure as text instead of raising.
        return f"モデルのアップロード中にエラーが発生しました: {str(e)}"
156
 
157
 
158
+ # UI設定
159
def generate_text(prompt, temperature, top_k, top_p, max_length, repetition_penalty, use_comma, batch_size):
    """Placeholder for text generation.

    All arguments are currently ignored and a fixed placeholder string is
    returned; the real model-backed generation logic is still to be inserted.
    """
    placeholder = "生成されたテキスト"
    return placeholder
 
 
 
 
 
162
 
163
+ # UI設定
164
  with gr.Blocks() as ui:
165
+ with gr.Row():
166
+ data_file = gr.File(label="データファイル", file_types=[".csv", ".json", ".xlsx"])
167
+ model_name = gr.Textbox(label="モデル名", value="gpt2")
168
+ epochs = gr.Number(label="エポック数", value=3, minimum=1)
169
+ batch_size = gr.Number(label="バッチサイズ", value=4, minimum=1)
170
+ learning_rate = gr.Number(label="学習率", value=5e-5, minimum=1e-7, maximum=1e-2, step=1e-7)
171
+ output_dir = gr.Textbox(label="出力ディレクトリ", value="./output")
172
+ prompt_col = gr.Textbox(label="プロンプト列名", value="prompt")
173
+ description_col = gr.Textbox(label="説明列名", value="description")
174
+ hf_token = gr.Textbox(label="Hugging Face アクセストークン")
175
+
176
+ with gr.Row():
177
+ validate_button = gr.Button("列検証")
178
+ output = gr.Textbox(label="出力")
179
+
180
+ validate_button.click(
181
+ read_file,
182
+ inputs=[data_file],
183
+ outputs=[output]
184
+ )
185
+
186
+ with gr.Row():
187
+ train_button = gr.Button("訓練開始")
188
+ result_output = gr.Textbox(label="訓練結果", lines=20)
189
+
190
+ train_button.click(
191
+ train_model,
192
+ inputs=[data_file, model_name, epochs, batch_size, learning_rate, output_dir, prompt_col, description_col, hf_token],
193
+ outputs=[result_output]
194
+ )
195
+
196
+ ui.launch()