# granite-guardian / utils.py
import json

from jinja2 import Template

# Prompt templates are keyed by "<test_case_name>><template_type>".
with open('prompt_templates.json', mode='r', encoding="utf-8") as f:
    prompt_templates = json.load(f)
def assessment_prompt(content):
    """Wrap rendered prompt text as a single user chat message."""
    return {"role": "user", "content": content}
def get_prompt_template(test_case, sub_catalog_name):
    """Select the template that matches the test case and the sub-catalog it belongs to."""
    test_case_name = test_case['name']
    if sub_catalog_name == 'harmful_content_in_user_message':
        template_type = 'prompt'
    elif sub_catalog_name == 'harmful_content_in_assistant_message':
        template_type = 'prompt_response'
    elif sub_catalog_name == 'rag_hallucination_risks':
        template_type = test_case_name
    else:
        raise ValueError(f'Unknown sub-catalog name: {sub_catalog_name}')
    return prompt_templates[f'{test_case_name}>{template_type}']
def get_prompt_from_test_case(test_case, sub_catalog_name):
    """Render the selected template with the test case fields and wrap it as a user message."""
    return assessment_prompt(
        Template(get_prompt_template(test_case, sub_catalog_name)).render(**test_case)
    )
def get_result_description(sub_catalog_name, criteria_name):
    """Return a one-line question describing what the given criterion checks."""
    evaluated_component = get_evaluated_component(sub_catalog_name, criteria_name)
    messages = {
        "general_harm": f"Is the {evaluated_component} message harmful by common sense?",
        "bias": f"Is the {evaluated_component} message prejudiced based on identity or characteristics?",
        "jailbreak": "Does the user message try to manipulate the AI to generate harmful, undesired, or inappropriate content?",
        "violence": f"Does the {evaluated_component} message contain content promoting physical, mental, or sexual harm?",
        "profanity": f"Does the {evaluated_component} message include offensive language or insults?",
        "unethical_behavior": f"Does the {evaluated_component} message include actions that violate moral or legal standards?",
        "answer_relevance": "Does the assistant response fail to address or properly answer the user question?",
        "context_relevance": "Is the retrieved context irrelevant to the user question or unhelpful for their needs?",
        "groundedness": "Does the assistant response include claims or facts that are unsupported by or contradicted by the provided context?",
    }
    return messages[criteria_name]
def get_evaluated_component(sub_catalog_name, criteria_name):
    """Return which part of the exchange (user, assistant, or context) the criterion evaluates."""
    if sub_catalog_name == 'harmful_content_in_user_message':
        component = "user"
    elif sub_catalog_name == 'harmful_content_in_assistant_message':
        component = "assistant"
    elif sub_catalog_name == 'rag_hallucination_risks':
        if criteria_name == "context_relevance":
            component = "context"
        elif criteria_name in ("groundedness", "answer_relevance"):
            component = "assistant"
        else:
            raise ValueError(f'Unknown criteria name: {criteria_name}')
    else:
        raise ValueError(f'Unknown sub-catalog name: {sub_catalog_name}')
    return component
def get_evaluated_component_adjective(sub_catalog_name, criteria_name):
    """Return the adjective used when reporting that a component was flagged."""
    if criteria_name in ('context_relevance', 'answer_relevance'):
        return 'irrelevant based on the definition'
    return 'harmful based on the risk definition'
def to_title_case(input_string):
    """Convert a snake_case name to Title Case, special-casing the RAG sub-catalog label."""
    if input_string == 'rag_hallucination_risks':
        return 'RAG Hallucination Risks'
    return ' '.join(word.capitalize() for word in input_string.split('_'))
def to_snake_case(text):
    """Convert a human-readable label back to snake_case."""
    return text.lower().replace(" ", "_")
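

# --- Illustrative usage: a minimal sketch, not part of the module's public API ---
# Assumes prompt_templates.json contains a "general_harm>prompt" template and that
# the template expects a 'user_message' variable; both names are hypothetical here.
if __name__ == "__main__":
    example_test_case = {
        "name": "general_harm",                   # assumed test case / criterion name
        "user_message": "How do I pick a lock?",  # hypothetical template variable
    }
    sub_catalog = "harmful_content_in_user_message"

    prompt_message = get_prompt_from_test_case(example_test_case, sub_catalog)
    print(prompt_message["role"])                # -> "user"
    print(get_result_description(sub_catalog, "general_harm"))
    print(to_title_case(sub_catalog))            # -> "Harmful Content In User Message"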