Spaces:
Running
on
Zero
Running
on
Zero
Martín Santillán Cooper
committed on
Commit
•
e5f0735
1
Parent(s):
2cb730a
UX improvements
Browse files
app.py
CHANGED
@@ -34,6 +34,7 @@ def on_test_case_click(state: gr.State):
|
|
34 |
user_message: gr.update(value=selected_test_case['user_message'], elem_classes=[], interactive=True),# if selected_sub_catalog != 'harmful_content_in_assistant_message' else gr.update(value=selected_test_case['user_message'], interactive=False, elem_classes=['read-only']),
|
35 |
assistant_message: gr.update(value=selected_test_case['assistant_message'], visible=True) if selected_test_case['assistant_message'] is not None else gr.update(visible=False, value=''),
|
36 |
result_text: gr.update(value='', visible=False),
|
|
|
37 |
}
|
38 |
|
39 |
def change_button_color(event: gr.EventData):
|
@@ -47,12 +48,13 @@ def on_submit(criteria, context, user_message, assistant_message, state):
|
|
47 |
'user_message': user_message,
|
48 |
'assistant_message': assistant_message,
|
49 |
}, state['selected_sub_catalog'])
|
50 |
-
evaluated_component =
|
51 |
evaluated_component_adjective = get_evaluated_component_adjective(state['selected_sub_catalog'], state['selected_criteria_name'])
|
52 |
logger.debug(f"Starting evaluation for subcatelog {state['selected_sub_catalog']} and criteria name {state['selected_criteria_name']}")
|
53 |
result_label = generate_text(prompt)['assessment'] # Yes or No
|
54 |
-
result_str = f"Is the {evaluated_component} message harmful based on the risk definition: {result_label}"
|
55 |
-
|
|
|
56 |
|
57 |
def on_show_prompt_click(criteria, context, user_message, assistant_message, state):
|
58 |
prompt = get_prompt_from_test_case({
|
@@ -114,11 +116,12 @@ with gr.Blocks(
|
|
114 |
assistant_message = gr.Textbox(label="Assistant Message", lines=3, interactive=True, visible=False, value=starting_test_case['assistant_message'])
|
115 |
|
116 |
submit_button = gr.Button("Evaluate", variant='primary')
|
117 |
-
|
118 |
-
evaluation_results_label = gr.HTML("Evaluation results", elem_classes='subtitle', visible=False)
|
119 |
-
result_text = gr.Textbox(label="Result", interactive=False, elem_classes=['read-only', "result-text"], max_lines=1, visible=False)
|
120 |
|
121 |
-
with
|
|
|
|
|
|
|
|
|
122 |
prompt = gr.Markdown("Hello world!")
|
123 |
|
124 |
show_propt_button.click(
|
@@ -127,14 +130,14 @@ with gr.Blocks(
|
|
127 |
outputs=prompt
|
128 |
).then(lambda: gr.update(visible=True), None, modal)
|
129 |
|
130 |
-
submit_button.click(lambda: gr.update(visible=True), inputs=None, outputs=result_text).then(
|
131 |
on_submit,
|
132 |
inputs=[criteria, context, user_message, assistant_message, state],
|
133 |
-
outputs=
|
134 |
|
135 |
for button in [t for sub_catalog_name, sub_catalog_buttons in catalog_buttons.items() for t in sub_catalog_buttons.values()]:
|
136 |
button.click(update_selected_test_case, inputs=[button, state], outputs=[state])\
|
137 |
-
.then(on_test_case_click, inputs=state, outputs={test_case_name, criteria, context, user_message, assistant_message, result_text}) \
|
138 |
.then(change_button_color, None, [v for c in catalog_buttons.values() for v in c.values()])
|
139 |
|
140 |
demo.launch(server_name='0.0.0.0')
|
|
|
34 |
user_message: gr.update(value=selected_test_case['user_message'], elem_classes=[], interactive=True),# if selected_sub_catalog != 'harmful_content_in_assistant_message' else gr.update(value=selected_test_case['user_message'], interactive=False, elem_classes=['read-only']),
|
35 |
assistant_message: gr.update(value=selected_test_case['assistant_message'], visible=True) if selected_test_case['assistant_message'] is not None else gr.update(visible=False, value=''),
|
36 |
result_text: gr.update(value='', visible=False),
|
37 |
+
result_container: gr.update(visible=False)
|
38 |
}
|
39 |
|
40 |
def change_button_color(event: gr.EventData):
|
|
|
48 |
'user_message': user_message,
|
49 |
'assistant_message': assistant_message,
|
50 |
}, state['selected_sub_catalog'])
|
51 |
+
evaluated_component = get_evaluated_component(state['selected_sub_catalog'], state['selected_criteria_name'])
|
52 |
evaluated_component_adjective = get_evaluated_component_adjective(state['selected_sub_catalog'], state['selected_criteria_name'])
|
53 |
logger.debug(f"Starting evaluation for subcatelog {state['selected_sub_catalog']} and criteria name {state['selected_criteria_name']}")
|
54 |
result_label = generate_text(prompt)['assessment'] # Yes or No
|
55 |
+
# result_str = f"Is the {evaluated_component} message harmful based on the risk definition: {result_label}"
|
56 |
+
html_str = f"<p>Is the {evaluated_component} {evaluated_component_adjective}: <strong>{result_label}</strong></p>"
|
57 |
+
return gr.update(value=html_str)
|
58 |
|
59 |
def on_show_prompt_click(criteria, context, user_message, assistant_message, state):
|
60 |
prompt = get_prompt_from_test_case({
|
|
|
116 |
assistant_message = gr.Textbox(label="Assistant Message", lines=3, interactive=True, visible=False, value=starting_test_case['assistant_message'])
|
117 |
|
118 |
submit_button = gr.Button("Evaluate", variant='primary')
|
|
|
|
|
|
|
119 |
|
120 |
+
with gr.Column(elem_classes="result-container", visible=False) as result_container:
|
121 |
+
evaluation_results_label = gr.HTML("<span>Results</span>", elem_classes='result-title', visible=False)
|
122 |
+
result_text = gr.HTML(label="Result", elem_classes=['read-only', "result-text"], visible=False)
|
123 |
+
|
124 |
+
with Modal(visible=False, elem_classes='modal') as modal:
|
125 |
prompt = gr.Markdown("Hello world!")
|
126 |
|
127 |
show_propt_button.click(
|
|
|
130 |
outputs=prompt
|
131 |
).then(lambda: gr.update(visible=True), None, modal)
|
132 |
|
133 |
+
submit_button.click(lambda: [gr.update(visible=True, value=''), gr.update(visible=True), gr.update(visible=True)], inputs=None, outputs=[result_text, evaluation_results_label, result_container]).then(
|
134 |
on_submit,
|
135 |
inputs=[criteria, context, user_message, assistant_message, state],
|
136 |
+
outputs=result_text)
|
137 |
|
138 |
for button in [t for sub_catalog_name, sub_catalog_buttons in catalog_buttons.items() for t in sub_catalog_buttons.values()]:
|
139 |
button.click(update_selected_test_case, inputs=[button, state], outputs=[state])\
|
140 |
+
.then(on_test_case_click, inputs=state, outputs={test_case_name, criteria, context, user_message, assistant_message, result_text, result_container}) \
|
141 |
.then(change_button_color, None, [v for c in catalog_buttons.values() for v in c.values()])
|
142 |
|
143 |
demo.launch(server_name='0.0.0.0')
|
model.py
CHANGED
@@ -58,7 +58,7 @@ def generate_text(prompt):
|
|
58 |
mock_model_call = os.getenv('MOCK_MODEL_CALL') == 'true'
|
59 |
if mock_model_call:
|
60 |
logger.debug('Returning mocked model result.')
|
61 |
-
sleep(
|
62 |
return {'assessment': 'Yes', 'certainty': 0.97}
|
63 |
else:
|
64 |
start = time()
|
|
|
58 |
mock_model_call = os.getenv('MOCK_MODEL_CALL') == 'true'
|
59 |
if mock_model_call:
|
60 |
logger.debug('Returning mocked model result.')
|
61 |
+
sleep(3)
|
62 |
return {'assessment': 'Yes', 'certainty': 0.97}
|
63 |
else:
|
64 |
start = time()
|
styles.css
CHANGED
@@ -46,6 +46,34 @@
|
|
46 |
cursor: not-allowed !important;
|
47 |
}
|
48 |
|
49 |
-
.result-text
|
50 |
box-shadow: none;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
}
|
|
|
46 |
cursor: not-allowed !important;
|
47 |
}
|
48 |
|
49 |
+
.result-text p {
|
50 |
box-shadow: none;
|
51 |
+
padding: var(--input-padding);
|
52 |
+
}
|
53 |
+
|
54 |
+
.result-container {
|
55 |
+
background-color: var(--block-background-fill);
|
56 |
+
padding: var(--block-padding);
|
57 |
+
}
|
58 |
+
|
59 |
+
.result-title span{
|
60 |
+
display: inline-block;
|
61 |
+
position: relative;
|
62 |
+
z-index: var(--layer-4);
|
63 |
+
border: solid var(--block-title-border-width) var(--block-title-border-color);
|
64 |
+
border-radius: var(--block-title-radius);
|
65 |
+
background: var(--block-title-background-fill);
|
66 |
+
padding: var(--block-title-padding);
|
67 |
+
color: var(--block-title-text-color);
|
68 |
+
font-weight: var(--block-title-text-weight);
|
69 |
+
font-size: var(--block-title-text-size);
|
70 |
+
line-height: var(--line-sm);
|
71 |
+
}
|
72 |
+
|
73 |
+
.modal .modal-container .modal-block{
|
74 |
+
padding: 1.5rem;
|
75 |
+
}
|
76 |
+
|
77 |
+
.result-text .pending {
|
78 |
+
padding-top: -4rem
|
79 |
}
|
utils.py
CHANGED
@@ -22,21 +22,22 @@ def get_prompt_from_test_case(test_case, sub_catalog_name):
|
|
22 |
|
23 |
def get_evaluated_component(sub_catalog_name, criteria_name):
|
24 |
if sub_catalog_name == 'harmful_content_in_user_message':
|
25 |
-
|
26 |
elif sub_catalog_name == 'harmful_content_in_assistant_message':
|
27 |
-
|
28 |
elif sub_catalog_name == 'rag_hallucination_risks':
|
29 |
if criteria_name == "context_relevance":
|
30 |
-
|
31 |
elif criteria_name == "groundedness":
|
32 |
-
|
33 |
elif criteria_name == "answer_relevance":
|
34 |
-
|
|
|
35 |
|
36 |
def get_evaluated_component_adjective(sub_catalog_name, criteria_name):
|
37 |
-
if criteria_name == 'context_relevance':
|
38 |
-
return '
|
39 |
-
else: return 'harmful'
|
40 |
|
41 |
def to_title_case(input_string):
|
42 |
if input_string == 'rag_hallucination_risks': return 'RAG Hallucination Risks'
|
|
|
22 |
|
23 |
def get_evaluated_component(sub_catalog_name, criteria_name):
|
24 |
if sub_catalog_name == 'harmful_content_in_user_message':
|
25 |
+
component = "user"
|
26 |
elif sub_catalog_name == 'harmful_content_in_assistant_message':
|
27 |
+
component = 'assistant'
|
28 |
elif sub_catalog_name == 'rag_hallucination_risks':
|
29 |
if criteria_name == "context_relevance":
|
30 |
+
component = "context"
|
31 |
elif criteria_name == "groundedness":
|
32 |
+
component = "assistant"
|
33 |
elif criteria_name == "answer_relevance":
|
34 |
+
component = "assistant"
|
35 |
+
return f"'{to_title_case(component)} message'"
|
36 |
|
37 |
def get_evaluated_component_adjective(sub_catalog_name, criteria_name):
|
38 |
+
if criteria_name == 'context_relevance' or criteria_name == 'answer_relevance':
|
39 |
+
return 'irrelevant based on the definition'
|
40 |
+
else: return 'harmful based on the risk definition'
|
41 |
|
42 |
def to_title_case(input_string):
|
43 |
if input_string == 'rag_hallucination_risks': return 'RAG Hallucination Risks'
|