|
import gradio as gr |
|
import json |
|
|
|
def validate_base_format(data):
    """Check that every non-blank line of *data* is a 'Base' format record.

    A record is a JSON object whose ``"messages"`` value is a list in which
    every item is an object carrying both a ``"role"`` and a ``"content"``
    key. Blank lines are skipped but still count toward line numbering.

    Args:
        data: Newline-separated JSON records (JSONL) as a single string.

    Returns:
        ``(True, None)`` when every record is valid, otherwise
        ``(False, line_number)`` where ``line_number`` is the 1-based line
        of the first invalid record.
    """
    for line_number, entry in enumerate(data.split("\n"), start=1):
        if not entry.strip():
            continue

        try:
            record = json.loads(entry)
        except json.JSONDecodeError:
            return False, line_number

        # Valid JSON is not necessarily an object (it may be a list, a
        # number, a string, ...); anything else cannot hold "messages".
        if not isinstance(record, dict):
            return False, line_number

        # .get() turns a missing key into a validation failure instead of
        # an uncaught KeyError.
        messages = record.get("messages")
        if not isinstance(messages, list):
            return False, line_number

        # Each message must itself be an object: a bare string would
        # otherwise satisfy the `in` checks through substring matching.
        if not all(
            isinstance(message, dict) and "role" in message and "content" in message
            for message in messages
        ):
            return False, line_number

    return True, None
|
|
|
def validate_conversational_format(data):
    """Check that every non-blank line of *data* is a 'Conversational' record.

    A record is a JSON object that contains both a ``"prompt"`` and a
    ``"completion"`` key. Blank lines are skipped but still count toward
    line numbering.

    Args:
        data: Newline-separated JSON records (JSONL) as a single string.

    Returns:
        ``(True, None)`` when every record is valid, otherwise
        ``(False, line_number)`` where ``line_number`` is the 1-based line
        of the first invalid record.
    """
    for line_number, entry in enumerate(data.split("\n"), start=1):
        if not entry.strip():
            continue

        try:
            record = json.loads(entry)
        except json.JSONDecodeError:
            return False, line_number

        # Require a JSON object: on a string or list the `in` test would be
        # a substring/element check, and on a number it would raise.
        if not isinstance(record, dict):
            return False, line_number

        if "prompt" not in record or "completion" not in record:
            return False, line_number

    return True, None
|
|
|
def validate_multi_turn_format(data):
    """Check that every non-blank line of *data* is a 'Multi-turn' record.

    A record is a JSON object whose ``"messages"`` value is a list in which
    every item is an object carrying both a ``"role"`` and a ``"content"``
    key. In addition, when a message with role ``"assistant"`` has a
    ``"weight"`` key, its value must equal 0 or 1. Blank lines are skipped
    but still count toward line numbering.

    Args:
        data: Newline-separated JSON records (JSONL) as a single string.

    Returns:
        ``(True, None)`` when every record is valid, otherwise
        ``(False, line_number)`` where ``line_number`` is the 1-based line
        of the first invalid record.
    """
    for line_number, entry in enumerate(data.split("\n"), start=1):
        if not entry.strip():
            continue

        try:
            record = json.loads(entry)
        except json.JSONDecodeError:
            return False, line_number

        # Valid JSON is not necessarily an object; only an object can hold
        # the "messages" key.
        if not isinstance(record, dict):
            return False, line_number

        # .get() turns a missing key into a validation failure instead of
        # an uncaught KeyError.
        messages = record.get("messages")
        if not isinstance(messages, list):
            return False, line_number

        for message in messages:
            # Each message must itself be an object: a bare string would
            # otherwise satisfy the `in` checks through substring matching.
            if not isinstance(message, dict):
                return False, line_number
            if "role" not in message or "content" not in message:
                return False, line_number

        # Weights are only checked on assistant messages; a "weight" key on
        # any other role is ignored.
        for message in messages:
            if message["role"] == "assistant" and "weight" in message:
                if message["weight"] not in (0, 1):
                    return False, line_number

    return True, None
|
|
|
def _read_uploaded_text(file):
    """Return the UTF-8 text of an uploaded file.

    Accepts a filesystem path (``str``), raw ``bytes``, or any object with a
    ``read()`` method returning ``bytes`` or ``str``.

    NOTE(review): which of these shapes the upload component actually
    delivers depends on the Gradio version and the component's ``type``
    setting - confirm against the installed version.

    Raises:
        TypeError: If *file* is none of the supported shapes.
    """
    if isinstance(file, str):
        with open(file, encoding="utf-8") as handle:
            return handle.read()
    if isinstance(file, bytes):
        return file.decode("utf-8")
    if hasattr(file, "read"):
        content = file.read()
        # Binary handles yield bytes, text handles yield str.
        return content.decode("utf-8") if isinstance(content, bytes) else content
    raise TypeError(f"Unsupported upload type: {type(file).__name__}")


def process_data(text, file, option):
    """Validate pasted text or an uploaded file against the selected format.

    The uploaded file takes precedence over the pasted text whenever *file*
    is truthy.

    Args:
        text: Text pasted by the user; ``None`` is treated as empty text.
        file: The upload (see ``_read_uploaded_text`` for accepted shapes),
            or a falsy value when nothing was uploaded.
        option: Name of the format to validate against: ``"Base"``,
            ``"Conversational"`` or ``"Multi-turn"``.

    Returns:
        A human-readable status string: a success message, a format error
        naming the first offending line, a notice for an unrecognised
        option, or ``"An error occurred: ..."`` if anything raised.
    """
    try:
        data = _read_uploaded_text(file) if file else (text or "")

        if option == "Base":
            validator = validate_base_format
        elif option == "Conversational":
            validator = validate_conversational_format
        elif option == "Multi-turn":
            validator = validate_multi_turn_format
        else:
            return "Option selected but no specific format validation implemented for this option."

        is_valid, line_error = validator(data)
        if not is_valid:
            return f"Error: Input does not follow the required '{option}' format at line {line_error}."
        return f"Input follows the '{option}' format."
    except Exception as e:
        # UI boundary: report the failure in the output box rather than
        # letting the exception escape the event handler.
        return f"An error occurred: {str(e)}"
|
|
|
# Build the web UI: a text box and a file upload side by side, a radio
# selector for the format, and a submit button next to the output box.
# NOTE(review): the nesting of the rows below is reconstructed - confirm
# that the layout matches what was intended.
with gr.Blocks(title="Fine-tuning Formatter") as demo:
    with gr.Row():
        text_input = gr.Textbox(
            label="Paste your text here",
            lines=10,
            placeholder="Enter text here or upload a file...",
        )
        file_input = gr.File(
            label="Upload CSV or JSONL file",
            file_types=[".csv", ".jsonl"],
        )

    with gr.Row():
        option = gr.Radio(
            choices=["Base", "Conversational", "Multi-turn"],
            label="Select the processing mode",
            value="Base",
        )

    with gr.Row():
        submit_button = gr.Button("Submit")
        output = gr.Textbox(label="Output", lines=2)

    # Clicking the button runs process_data on the three inputs and shows
    # the returned status string in the output box.
    submit_button.click(
        fn=process_data,
        inputs=[text_input, file_input, option],
        outputs=output,
    )

# Start the Gradio app.
demo.launch()
|
|