File size: 3,841 Bytes
84e0912
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import gradio as gr
import json

def validate_base_format(data):
    """Check that each non-blank line of *data* is a 'Base' format record.

    A line is valid when it parses as a JSON object whose ``"messages"``
    value is a list in which every item is an object containing both a
    ``"role"`` and a ``"content"`` key. Blank lines are skipped but still
    count towards line numbering.

    Args:
        data: JSONL text, one JSON document per line.

    Returns:
        ``(True, None)`` when every non-blank line is valid, otherwise
        ``(False, line_number)`` where ``line_number`` is the 1-based index
        of the first offending line.
    """
    for line_number, entry in enumerate(data.split("\n"), start=1):
        if not entry.strip():
            continue
        try:
            record = json.loads(entry)
        except json.JSONDecodeError:
            return False, line_number
        # A JSON array or scalar is syntactically fine but is not a record;
        # report it as a format error instead of letting a TypeError escape.
        if not isinstance(record, dict):
            return False, line_number
        # .get() turns a missing "messages" key into a format error rather
        # than a KeyError.
        messages = record.get("messages")
        if not isinstance(messages, list):
            return False, line_number
        for message in messages:
            # Require real objects: a bare string such as "role content"
            # would otherwise satisfy the substring-style ``in`` checks.
            if (
                not isinstance(message, dict)
                or "role" not in message
                or "content" not in message
            ):
                return False, line_number
    return True, None

def validate_conversational_format(data):
    """Check that each non-blank line of *data* is a 'Conversational' record.

    A line is valid when it parses as a JSON object that has both a
    ``"prompt"`` and a ``"completion"`` key. Blank lines are skipped but
    still count towards line numbering.

    Args:
        data: JSONL text, one JSON document per line.

    Returns:
        ``(True, None)`` when every non-blank line is valid, otherwise
        ``(False, line_number)`` where ``line_number`` is the 1-based index
        of the first offending line.
    """
    for line_number, entry in enumerate(data.split("\n"), start=1):
        if not entry.strip():
            continue
        try:
            record = json.loads(entry)
        except json.JSONDecodeError:
            return False, line_number
        # Only a JSON object can carry the keys. Without this check a JSON
        # number raises TypeError on ``in`` and a JSON string containing the
        # words "prompt" and "completion" would be accepted by mistake.
        if not isinstance(record, dict):
            return False, line_number
        if "prompt" not in record or "completion" not in record:
            return False, line_number
    return True, None

def validate_multi_turn_format(data):
    """Check that each non-blank line of *data* is a 'Multi-turn' record.

    A line is valid when it parses as a JSON object whose ``"messages"``
    value is a list of objects that each contain ``"role"`` and
    ``"content"`` keys, and any ``"weight"`` present on a message whose role
    is ``"assistant"`` equals 0 or 1. Blank lines are skipped but still
    count towards line numbering.

    Args:
        data: JSONL text, one JSON document per line.

    Returns:
        ``(True, None)`` when every non-blank line is valid, otherwise
        ``(False, line_number)`` where ``line_number`` is the 1-based index
        of the first offending line.
    """
    for line_number, entry in enumerate(data.split("\n"), start=1):
        if not entry.strip():
            continue
        try:
            record = json.loads(entry)
        except json.JSONDecodeError:
            return False, line_number
        # Non-object documents and a missing "messages" key are format
        # errors; they must not surface as TypeError / KeyError.
        if not isinstance(record, dict):
            return False, line_number
        messages = record.get("messages")
        if not isinstance(messages, list):
            return False, line_number
        for message in messages:
            if (
                not isinstance(message, dict)
                or "role" not in message
                or "content" not in message
            ):
                return False, line_number
            # The weight constraint is only checked on assistant messages.
            if (
                message["role"] == "assistant"
                and "weight" in message
                and message["weight"] not in (0, 1)
            ):
                return False, line_number
    return True, None

def _read_uploaded_text(file):
    """Return the text content of an upload given as bytes, a path, or a file object."""
    if isinstance(file, bytes):
        raw = file
    elif isinstance(file, str):
        # The upload arrived as a filesystem path.
        with open(file, "rb") as handle:
            raw = handle.read()
    elif hasattr(file, "read"):
        raw = file.read()
    else:
        # Fall back to a wrapper object that only exposes the path as .name.
        with open(file.name, "rb") as handle:
            raw = handle.read()
    if isinstance(raw, bytes):
        # utf-8-sig decodes plain UTF-8 unchanged and drops a leading BOM,
        # which json.loads would otherwise reject on the first line.
        return raw.decode("utf-8-sig")
    return raw


def process_data(text, file, option):
    """Validate pasted text or an uploaded file against the selected format.

    Args:
        text: Pasted input; used only when no file is supplied. ``None`` is
            treated as empty text.
        file: Optional upload. May be raw bytes, a path string, a file-like
            object with ``read()``, or an object exposing the path as
            ``.name``. When truthy it takes precedence over ``text``.
        option: Name of the format to check: ``"Base"``,
            ``"Conversational"`` or ``"Multi-turn"``.

    Returns:
        A human-readable status string: a success message, a format error
        naming the first invalid line, a notice for an unrecognised option,
        or ``"An error occurred: ..."`` if anything unexpected is raised.
    """
    try:
        data = _read_uploaded_text(file) if file else (text or "")

        if option == "Base":
            validator = validate_base_format
        elif option == "Conversational":
            validator = validate_conversational_format
        elif option == "Multi-turn":
            validator = validate_multi_turn_format
        else:
            return "Option selected but no specific format validation implemented for this option."

        is_valid, line_error = validator(data)
        if not is_valid:
            return f"Error: Input does not follow the required '{option}' format at line {line_error}."
        return f"Input follows the '{option}' format."
    except Exception as e:
        # UI boundary: report the failure as the output text instead of
        # letting the exception propagate out of the callback.
        return f"An error occurred: {str(e)}"

# Build the Gradio interface: input widgets, a format selector, a submit
# button and an output box, all wired to process_data.
with gr.Blocks(title="Fine-tuning Formatter") as demo:
    # First row: pasted text and file upload.
    with gr.Row():
        text_input = gr.Textbox(label="Paste your text here", lines=10, placeholder="Enter text here or upload a file...")
        # NOTE(review): .csv uploads are accepted here, but every validator in
        # this file parses each line with json.loads, so CSV content will be
        # reported as invalid -- confirm whether CSV support is intended.
        file_input = gr.File(label="Upload CSV or JSONL file", file_types=['.csv', '.jsonl'])
    # Second row: format selector; the choices match the option strings
    # that process_data compares against. "Base" is preselected.
    with gr.Row():
        option = gr.Radio(choices=["Base", "Conversational", "Multi-turn"], label="Select the processing mode", value="Base")
    with gr.Row():
        submit_button = gr.Button("Submit")
    # Shows the status string returned by process_data.
    output = gr.Textbox(label="Output", lines=2)

    # On click, call process_data(text, file, option) with the widget values
    # in the order listed in `inputs`, and write its return value to `output`.
    submit_button.click(
        fn=process_data,
        inputs=[text_input, file_input, option],
        outputs=output
    )

# Runs at module level with no `if __name__ == "__main__":` guard, so the app
# is launched whenever this module is executed or imported.
demo.launch()