Spaces:

hchak
/

GPT-Fine-Tuning-Formatter

Running

App Files Files Community

GPT-Fine-Tuning-Formatter / main.py

hchak

Upload folder using huggingface_hub

84e0912 verified 6 months ago

raw

history blame contribute delete

3.84 kB

	import gradio as gr
	import json

	def validate_base_format(data):
	    """Validate JSONL text in which every record carries a ``messages`` list.

	    Each non-blank line must decode to a JSON object whose ``"messages"``
	    value is a list of objects, each having a ``"role"`` and a ``"content"``
	    key. Blank lines are skipped.

	    Args:
	        data: The text to validate, one JSON document per line.

	    Returns:
	        ``(True, None)`` when every non-blank line is valid, otherwise
	        ``(False, line_number)`` where ``line_number`` is the 1-based number
	        of the first offending line.
	    """
	    for line_number, entry in enumerate(data.split("\n"), start=1):
	        if not entry.strip():
	            continue
	        try:
	            record = json.loads(entry)
	        except json.JSONDecodeError:
	            return False, line_number
	        # A line can be valid JSON without being an object (e.g. ``[1, 2]``)
	        # or without a "messages" key; report that as a format error on
	        # this line instead of letting KeyError/TypeError escape.
	        if not isinstance(record, dict):
	            return False, line_number
	        messages = record.get("messages")
	        if not isinstance(messages, list):
	            return False, line_number
	        for message in messages:
	            # Require a real object: ``"role" in "some string"`` would be a
	            # substring test and ``"role" in 3`` would raise TypeError.
	            if not isinstance(message, dict):
	                return False, line_number
	            if "role" not in message or "content" not in message:
	                return False, line_number
	    return True, None

	def validate_conversational_format(data):
	    """Validate JSONL text in which every record has ``prompt`` and ``completion``.

	    Each non-blank line must decode to a JSON object containing both a
	    ``"prompt"`` key and a ``"completion"`` key. Blank lines are skipped.

	    Args:
	        data: The text to validate, one JSON document per line.

	    Returns:
	        ``(True, None)`` when every non-blank line is valid, otherwise
	        ``(False, line_number)`` where ``line_number`` is the 1-based number
	        of the first offending line.
	    """
	    required_keys = ("prompt", "completion")
	    for line_number, entry in enumerate(data.split("\n"), start=1):
	        if not entry.strip():
	            continue
	        try:
	            record = json.loads(entry)
	        except json.JSONDecodeError:
	            return False, line_number
	        # Only a JSON object can carry keys. Without this check a bare
	        # string or list would be tested by substring/element membership
	        # and a number would raise TypeError.
	        if not isinstance(record, dict):
	            return False, line_number
	        if any(key not in record for key in required_keys):
	            return False, line_number
	    return True, None

	def validate_multi_turn_format(data):
	    """Validate JSONL chat records, including optional assistant ``weight`` values.

	    Each non-blank line must decode to a JSON object whose ``"messages"``
	    value is a list of objects, each having a ``"role"`` and a ``"content"``
	    key. In addition, any message whose role is ``"assistant"`` and that
	    carries a ``"weight"`` key must have a weight equal to 0 or 1. Blank
	    lines are skipped.

	    Args:
	        data: The text to validate, one JSON document per line.

	    Returns:
	        ``(True, None)`` when every non-blank line is valid, otherwise
	        ``(False, line_number)`` where ``line_number`` is the 1-based number
	        of the first offending line.
	    """
	    for line_number, entry in enumerate(data.split("\n"), start=1):
	        if not entry.strip():
	            continue
	        try:
	            record = json.loads(entry)
	        except json.JSONDecodeError:
	            return False, line_number
	        # Valid JSON that is not an object, or that lacks "messages", is a
	        # format error for this line rather than an unhandled exception.
	        if not isinstance(record, dict):
	            return False, line_number
	        messages = record.get("messages")
	        if not isinstance(messages, list):
	            return False, line_number
	        for message in messages:
	            if not isinstance(message, dict):
	                return False, line_number
	            if "role" not in message or "content" not in message:
	                return False, line_number
	        # Weights are only inspected on assistant messages; other roles may
	        # carry any "weight" value without failing validation.
	        for message in messages:
	            if message["role"] == "assistant" and "weight" in message:
	                if message["weight"] not in (0, 1):
	                    return False, line_number
	    return True, None

	def _read_uploaded_text(file):
	    """Return the content of an uploaded file as text, decoding bytes as UTF-8."""
	    # NOTE(review): the form of the upload depends on how the UI component
	    # is configured; presumably a path string, a file-like object, or raw
	    # bytes. All three are accepted here; confirm against the installed UI
	    # library version.
	    if isinstance(file, str):
	        with open(file, "rb") as handle:
	            raw = handle.read()
	    elif hasattr(file, "read"):
	        raw = file.read()
	    else:
	        raw = file
	    if isinstance(raw, (bytes, bytearray)):
	        return raw.decode("utf-8")
	    return raw


	def process_data(text, file, option):
	    """Validate pasted text or an uploaded file against the selected format.

	    Args:
	        text: Text pasted by the user; used only when no file is supplied.
	            ``None`` is treated as empty text.
	        file: The uploaded file (path string, file-like object, or bytes),
	            or a falsy value when nothing was uploaded. Takes precedence
	            over ``text``.
	        option: Name of the format to validate against: ``"Base"``,
	            ``"Conversational"`` or ``"Multi-turn"``.

	    Returns:
	        A human-readable status string: a success message, an error message
	        naming the first invalid line, a notice when ``option`` has no
	        validator, or ``"An error occurred: ..."`` if anything raised.
	    """
	    # Top-level boundary for the UI callback: any failure is reported back
	    # to the user as text instead of propagating.
	    try:
	        if file:
	            data = _read_uploaded_text(file)
	        else:
	            data = text or ""

	        # Resolve the validator lazily so an unknown option never touches
	        # the validator functions at all.
	        if option == "Base":
	            validator = validate_base_format
	        elif option == "Conversational":
	            validator = validate_conversational_format
	        elif option == "Multi-turn":
	            validator = validate_multi_turn_format
	        else:
	            return "Option selected but no specific format validation implemented for this option."

	        is_valid, line_error = validator(data)
	        if not is_valid:
	            return f"Error: Input does not follow the required '{option}' format at line {line_error}."
	        return f"Input follows the '{option}' format."
	    except Exception as e:
	        return f"An error occurred: {str(e)}"

	# Build the interface: a text box and a file upload side by side, a format
	# selector, and a submit button with its output box.
	# NOTE(review): the nesting of the rows below is reconstructed from the
	# statement order; confirm the intended layout.
	with gr.Blocks(title="Fine-tuning Formatter") as demo:
	    with gr.Row():
	        text_input = gr.Textbox(label="Paste your text here", lines=10, placeholder="Enter text here or upload a file...")
	        file_input = gr.File(label="Upload CSV or JSONL file", file_types=['.csv', '.jsonl'])
	    with gr.Row():
	        option = gr.Radio(choices=["Base", "Conversational", "Multi-turn"], label="Select the processing mode", value="Base")
	    with gr.Row():
	        submit_button = gr.Button("Submit")
	        output = gr.Textbox(label="Output", lines=2)

	    # Clicking "Submit" sends the three inputs to process_data and shows
	    # the returned status string in the output box.
	    submit_button.click(
	        fn=process_data,
	        inputs=[text_input, file_input, option],
	        outputs=output
	    )

	# Start the server only when run as a script, so importing this module
	# (e.g. from a test) does not launch it as a side effect.
	if __name__ == "__main__":
	    demo.launch()