Spaces:
Running
on
Zero
Running
on
Zero
Martín Santillán Cooper
committed on
Commit
•
cab16f9
1
Parent(s):
ee37b9e
Convert the results into a string
Browse files
app.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
from dotenv import load_dotenv
|
3 |
|
4 |
-
from utils import to_title_case, get_prompt_from_test_case, to_snake_case
|
5 |
load_dotenv()
|
6 |
import json
|
7 |
from model import generate_text
|
@@ -34,7 +34,6 @@ def on_test_case_click(state: gr.State):
|
|
34 |
user_message: gr.update(value=selected_test_case['user_message'], elem_classes=[], interactive=True),# if selected_sub_catalog != 'harmful_content_in_assistant_message' else gr.update(value=selected_test_case['user_message'], interactive=False, elem_classes=['read-only']),
|
35 |
assistant_message: gr.update(value=selected_test_case['assistant_message'], visible=True) if selected_test_case['assistant_message'] is not None else gr.update(visible=False, value=''),
|
36 |
result_text: gr.update(value=''),
|
37 |
-
result_certainty: gr.update(value='')
|
38 |
}
|
39 |
|
40 |
def change_button_color(event: gr.EventData):
|
@@ -48,8 +47,11 @@ def on_submit(criteria, context, user_message, assistant_message, state):
|
|
48 |
'user_message': user_message,
|
49 |
'assistant_message': assistant_message,
|
50 |
}, state['selected_sub_catalog'])
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
53 |
|
54 |
def on_show_prompt_click(criteria, context, user_message, assistant_message, state):
|
55 |
prompt = get_prompt_from_test_case({
|
@@ -111,9 +113,7 @@ with gr.Blocks(
|
|
111 |
submit_button = gr.Button("Evaluate", variant='primary')
|
112 |
gr.HTML("Evaluation results", elem_classes='subtitle')
|
113 |
|
114 |
-
|
115 |
-
result_text = gr.Textbox(label="Result", interactive=False, elem_classes=['read-only'])
|
116 |
-
result_certainty = gr.Number(label="Certainty", interactive=False, value='', elem_classes=['read-only'])
|
117 |
|
118 |
show_propt_button = gr.Button('Show prompt', size='sm', scale=0)
|
119 |
|
@@ -129,11 +129,11 @@ with gr.Blocks(
|
|
129 |
submit_button.click(
|
130 |
on_submit,
|
131 |
inputs=[criteria, context, user_message, assistant_message, state],
|
132 |
-
outputs=[result_text
|
133 |
|
134 |
for button in [t for sub_catalog_name, sub_catalog_buttons in catalog_buttons.items() for t in sub_catalog_buttons.values()]:
|
135 |
button.click(update_selected_test_case, inputs=[button, state], outputs=[state])\
|
136 |
-
.then(on_test_case_click, inputs=state, outputs={test_case_name, criteria, context, user_message, assistant_message, result_text
|
137 |
.then(change_button_color, None, [v for c in catalog_buttons.values() for v in c.values()])
|
138 |
|
139 |
demo.launch(server_name='0.0.0.0')
|
|
|
1 |
import gradio as gr
|
2 |
from dotenv import load_dotenv
|
3 |
|
4 |
+
from utils import get_evaluated_component, get_evaluated_component_adjective, to_title_case, get_prompt_from_test_case, to_snake_case
|
5 |
load_dotenv()
|
6 |
import json
|
7 |
from model import generate_text
|
|
|
34 |
user_message: gr.update(value=selected_test_case['user_message'], elem_classes=[], interactive=True),# if selected_sub_catalog != 'harmful_content_in_assistant_message' else gr.update(value=selected_test_case['user_message'], interactive=False, elem_classes=['read-only']),
|
35 |
assistant_message: gr.update(value=selected_test_case['assistant_message'], visible=True) if selected_test_case['assistant_message'] is not None else gr.update(visible=False, value=''),
|
36 |
result_text: gr.update(value=''),
|
|
|
37 |
}
|
38 |
|
39 |
def change_button_color(event: gr.EventData):
|
|
|
47 |
'user_message': user_message,
|
48 |
'assistant_message': assistant_message,
|
49 |
}, state['selected_sub_catalog'])
|
50 |
+
evaluated_component = to_title_case(get_evaluated_component(state['selected_sub_catalog'], state['selected_criteria_name']))
|
51 |
+
evaluated_component_adjective = get_evaluated_component_adjective(state['selected_sub_catalog'], state['selected_criteria_name'])
|
52 |
+
result_label = generate_text(prompt)['assessment'] # Yes or No
|
53 |
+
result_str = f"Is the {evaluated_component} message harmful based on the risk definition: {result_label}"
|
54 |
+
return result_str
|
55 |
|
56 |
def on_show_prompt_click(criteria, context, user_message, assistant_message, state):
|
57 |
prompt = get_prompt_from_test_case({
|
|
|
113 |
submit_button = gr.Button("Evaluate", variant='primary')
|
114 |
gr.HTML("Evaluation results", elem_classes='subtitle')
|
115 |
|
116 |
+
result_text = gr.Textbox(label="Result", interactive=False, elem_classes=['read-only', "result-text"], max_lines=1)
|
|
|
|
|
117 |
|
118 |
show_propt_button = gr.Button('Show prompt', size='sm', scale=0)
|
119 |
|
|
|
129 |
submit_button.click(
|
130 |
on_submit,
|
131 |
inputs=[criteria, context, user_message, assistant_message, state],
|
132 |
+
outputs=[result_text])
|
133 |
|
134 |
for button in [t for sub_catalog_name, sub_catalog_buttons in catalog_buttons.items() for t in sub_catalog_buttons.values()]:
|
135 |
button.click(update_selected_test_case, inputs=[button, state], outputs=[state])\
|
136 |
+
.then(on_test_case_click, inputs=state, outputs={test_case_name, criteria, context, user_message, assistant_message, result_text}) \
|
137 |
.then(change_button_color, None, [v for c in catalog_buttons.values() for v in c.values()])
|
138 |
|
139 |
demo.launch(server_name='0.0.0.0')
|
model.py
CHANGED
@@ -63,7 +63,7 @@ def generate_text(prompt):
|
|
63 |
mock_model_call = os.getenv('MOCK_MODEL_CALL') == 'true'
|
64 |
if mock_model_call:
|
65 |
logger.debug('Returning mocked model result.')
|
66 |
-
sleep(
|
67 |
return {'assessment': 'Yes', 'certainty': 0.97}
|
68 |
else:
|
69 |
start = time()
|
|
|
63 |
mock_model_call = os.getenv('MOCK_MODEL_CALL') == 'true'
|
64 |
if mock_model_call:
|
65 |
logger.debug('Returning mocked model result.')
|
66 |
+
sleep(1)
|
67 |
return {'assessment': 'Yes', 'certainty': 0.97}
|
68 |
else:
|
69 |
start = time()
|
styles.css
CHANGED
@@ -44,4 +44,8 @@
|
|
44 |
|
45 |
.read-only label textarea,input {
|
46 |
cursor: not-allowed !important;
|
|
|
|
|
|
|
|
|
47 |
}
|
|
|
44 |
|
45 |
.read-only label textarea,input {
|
46 |
cursor: not-allowed !important;
|
47 |
+
}
|
48 |
+
|
49 |
+
.result-text label input {
|
50 |
+
box-shadow: none;
|
51 |
}
|
utils.py
CHANGED
@@ -20,6 +20,24 @@ def get_prompt_template(test_case, sub_catalog_name):
|
|
20 |
def get_prompt_from_test_case(test_case, sub_catalog_name):
|
21 |
return assessment_prompt(Template(get_prompt_template(test_case, sub_catalog_name)).render(**test_case))
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
def to_title_case(input_string):
|
24 |
if input_string == 'rag_hallucination_risks': return 'RAG Hallucination Risks'
|
25 |
return ' '.join(word.capitalize() for word in input_string.split('_'))
|
|
|
20 |
def get_prompt_from_test_case(test_case, sub_catalog_name):
|
21 |
return assessment_prompt(Template(get_prompt_template(test_case, sub_catalog_name)).render(**test_case))
|
22 |
|
23 |
+
def get_evaluated_component(sub_catalog_name, criteria_name):
|
24 |
+
if sub_catalog_name == 'harmful_content_in_user_message':
|
25 |
+
return "user"
|
26 |
+
elif sub_catalog_name == 'harmful_content_in_assistant_message':
|
27 |
+
return 'assistant'
|
28 |
+
elif sub_catalog_name == 'rag_hallucination_risks':
|
29 |
+
if criteria_name == "context_relevance":
|
30 |
+
return "context"
|
31 |
+
elif criteria_name == "groundedness":
|
32 |
+
return "assistant"
|
33 |
+
elif criteria_name == "answer_relevance":
|
34 |
+
return "assistant"
|
35 |
+
|
36 |
+
def get_evaluated_component_adjective(sub_catalog_name, criteria_name):
|
37 |
+
if criteria_name == 'context_relevance':
|
38 |
+
return 'relevant'
|
39 |
+
else: return 'harmful'
|
40 |
+
|
41 |
def to_title_case(input_string):
|
42 |
if input_string == 'rag_hallucination_risks': return 'RAG Hallucination Risks'
|
43 |
return ' '.join(word.capitalize() for word in input_string.split('_'))
|