"""Demo UI to show different levels of LLM security."""
import streamlit as st
from streamlit_extras.stylable_container import stylable_container
import pandas as pd
import llm
import config
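# Local modules: `llm` is expected to expose stream_request, run_judge, is_subsequence,
# get_full_prompt and special_checks; `config.LEVEL_DESCRIPTIONS` maps each level to its
# "info" and "solution" texts. All of these are used further down in this file.
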
st.set_page_config(
    page_title="LLM security demo",
    page_icon="images/LEG.png",
    layout="wide",
    initial_sidebar_state="expanded",
)
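
# Placeholder session-state entry; "key" is not referenced anywhere else in this file.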
if "key" not in st.session_state:
st.session_state["key"] = "value"
st.logo("images/ML6_logo.png")
st.title("πŸ•΅οΈ LLM security demo")
st.info(
"You are a secret agent meeting your informant in a bar. Convince him to give you his secret! But be prepared, with every new level the informant will be more cautious."
)
num_levels = list(range(7))

# one secret per level (zip below only uses the first len(num_levels) entries)
secrets = [
"Perspicacity",
"Obfuscation",
"Sesquipedalian",
"Synecdoche",
"Defenestration",
"Pulchritudinous",
"Sesquipedalian",
"Ineffable",
]
level_names = [f"Level {i}" for i in num_levels]
level_emojis = ["😴", "πŸ™‚", "😏", "πŸ˜₯", "πŸ₯΅", "πŸ’€", "πŸ’€"]
level_tabs = st.tabs(level_names)
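
# CSS handed to stylable_container below: grey rounded background for the hint panels,
# and code blocks inside them wrap instead of scrolling horizontally.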
hint_css = """
    {
        background-color: rgba(110, 110, 110, 0.1);
        padding: 16px;
        border-radius: 0.5rem;
    }
    code {
        white-space: pre-wrap !important;
    }
"""
for level_num, level_tab, name, emoji, secret in zip(
    num_levels, level_tabs, level_names, level_emojis, secrets
):
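    # One tab per level: prompt box and secret guess on the left, three hints on the right.
    # Per-level state is kept in st.session_state under keys suffixed with the level name.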
# init "solved" state
if f"solved_{name}" not in st.session_state:
st.session_state[f"solved_{name}"] = False
# init prompt try count
if f"prompt_try_count_{name}" not in st.session_state:
st.session_state[f"prompt_try_count_{name}"] = 0
# init secret guess count
if f"secret_guess_count_{name}" not in st.session_state:
st.session_state[f"secret_guess_count_{name}"] = 0
# init hint expander status
for i in range(3):
if f"opend_hint_{name}_{i}" not in st.session_state:
st.session_state[f"opend_hint_{name}_{i}"] = False

    deployment_name = name.replace(" ", "-").lower()

    with level_tab:
        header_col1, header_col2 = st.columns(2, gap="medium")
        header_col1.subheader(f"{emoji} {name}")
        header_col2.subheader("Need help ...")

        col1, col2 = st.columns(2, gap="medium")
        with col1:
            with st.container(height=600, border=False):
                with st.container(border=True):
                    txt = st.text_area(
                        "Provide your prompt here:",
                        key=f"txt_{name}",
                        label_visibility="visible",
                        height=200,
                        placeholder="Your prompt",
                    )
                    btn_submit_prompt = st.button(
                        "Send prompt", key=f"submit_prompt_{name}"
                    )
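
                # Output guards per level (explained in the hints on the right):
                #   level 3: an LLM judge screens the user input before the real prompt runs,
                #   level 4: plain substring check for the secret in the response,
                #   level 5: llm.is_subsequence check on the response,
                #   level 6: an LLM judge checks whether the response reveals the secret,
                #   all other levels stream the response directly.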
                # Show response
                if len(txt) > 0 and btn_submit_prompt:
                    st.session_state[f"prompt_try_count_{name}"] += 1
                    with st.container(border=True):
                        st.write("Response:")
                        # special checks for certain levels
                        if level_num == 3 and llm.run_judge(level_num, {"user_input": txt}):
                            st.error("Malicious user input detected. Try again.")
                        elif level_num == 4:
                            output = "".join(
                                llm.stream_request(deployment_name, secret, txt)
                            )
                            if secret.lower() in output.lower():
                                st.error(
                                    "Secret detected in the model output. Try again."
                                )
                            else:
                                st.write(output)
                        elif level_num == 5:
                            output = "".join(
                                llm.stream_request(deployment_name, secret, txt)
                            )
                            if llm.is_subsequence(output, secret):
                                st.error(
                                    "Secret detected in the model output. Try again."
                                )
                            else:
                                st.write(output)
                        elif level_num == 6:
                            output = "".join(
                                llm.stream_request(deployment_name, secret, txt)
                            )
                            if llm.run_judge(level_num, {"secret": secret, "message": output}):
                                st.error(
                                    "Secret detected in the model output. Try again."
                                )
                            else:
                                st.write(output)
                        else:
                            st.write_stream(
                                llm.stream_request(deployment_name, secret, txt)
                            )
                with st.container(border=True):
                    secret_guess = st.text_input(
                        "What is the secret?",
                        key=f"guess_{name}",
                        placeholder="Your guess",
                    )
                    btn_submit_guess = st.button(
                        "Submit guess", key=f"submit_guess_{name}"
                    )

                    if btn_submit_guess:
                        st.session_state[f"secret_guess_count_{name}"] += 1
                        if secret_guess.lower() == secret.lower():
                            st.success("You found the secret!")
                            st.session_state[f"solved_{name}"] = True
                        else:
                            st.error("Wrong guess. Try again.")
        with col2:
            with st.container(border=True, height=600):
                st.info(
                    "There are three levels of hints available to you. But be careful: if you open a hint before finding the secret, it will show up in your record.",
                    icon="ℹ️",
                )

                hint_1_cont = stylable_container("hint_1_container", hint_css)
                hint1 = hint_1_cont.checkbox(
                    "Hint 1 - **Description of security strategy**",
                    key=f"hint1_checkbox_{name}",
                )
                if hint1:
                    # Once the hint is revealed it is marked as opened, unless the secret was already found.
                    st.session_state[f"opend_hint_{name}_0"] = (
                        True
                        if st.session_state[f"opend_hint_{name}_0"]
                        else not st.session_state[f"solved_{name}"]
                    )
                    hint_1_cont.write(config.LEVEL_DESCRIPTIONS[level_num]["info"])
                hint_2_cont = stylable_container("hint_2_container", hint_css)
                hint2 = hint_2_cont.checkbox(
                    "Hint 2 - **Code execution**", key=f"hint2_checkbox_{name}"
                )
                if hint2:
                    st.session_state[f"opend_hint_{name}_1"] = (
                        True
                        if st.session_state[f"opend_hint_{name}_1"]
                        else not st.session_state[f"solved_{name}"]
                    )

                    def show_base_prompt():
                        # show each component of the base prompt
                        for key, val in prompts.items():
                            descr = key.replace("_", " ").capitalize()
                            hint_2_cont.write(f"*{descr}:*")
                            # custom_code_container(val)
                            # val = val.replace("{{secret}}", '<span style="color: #ff0000">{{secret}}</span>')
                            hint_2_cont.code(val, language=None)

                    user_input_holder = (
                        txt if len(txt) > 0 and btn_submit_prompt else None
                    )
                    prompts = llm.get_full_prompt(
                        deployment_name, user_input=user_input_holder
                    )
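                    # get_full_prompt returns the prompt split into named parts (a mapping),
                    # which show_base_prompt renders as one code block per part.
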
                    if level_num == 3:
                        special_prompt = llm.get_full_prompt(
                            llm.special_checks[3], user_input=txt
                        )
                        hint_2_cont.write(
                            "Step 1: An **LLM judge** reviews the user input and determines whether it is malicious."
                        )
                        hint_2_cont.write("**LLM judge prompt:**")
                        for key, val in special_prompt.items():
                            hint_2_cont.code(val, language=None)
                        hint_2_cont.write(
                            "Step 2: If the user input is not classified as malicious, the prompt containing the actual secret is executed and the response is shown."
                        )
                        hint_2_cont.write("**Actual prompt:**")
                        show_base_prompt()
                    elif level_num == 4:
                        hint_2_cont.write(
                            "Step 1: The following prompt is executed:"
                        )
                        show_base_prompt()
                        hint_2_cont.write(
                            "Step 2: The response text of the LLM call is searched for the secret with a simple Python expression, `secret.lower() in output.lower()`. If it contains the secret, the output will not be shown."
                        )
                    elif level_num == 5:
                        hint_2_cont.write(
                            "Step 1: The following prompt is executed:"
                        )
                        show_base_prompt()
                        hint_2_cont.write(
                            "Step 2: The response text of the LLM call is searched for the secret with the Python function `is_subsequence`, which looks for substrings. If it contains the secret, the output will not be shown."
                        )
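                        # Note: the `is_subsequence` helper lives in llm.py; judging by its name and the
                        # hint text it performs a normalized containment check on the output, but the
                        # exact implementation may differ from that description.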
                    else:
                        hint_2_cont.write(
                            "Step 1: The following prompt is executed and the response is shown:"
                        )
                        show_base_prompt()

                # st.divider()
                hint_3_cont = stylable_container("hint_3_container", hint_css)
                hint3 = hint_3_cont.checkbox(
                    "Hint 3 - **Example solution**",
                    key=f"hint3_checkbox_{name}",
                )
                if hint3:
                    st.session_state[f"opend_hint_{name}_2"] = (
                        True
                        if st.session_state[f"opend_hint_{name}_2"]
                        else not st.session_state[f"solved_{name}"]
                    )
                    # custom_code_container(
                    #     config.LEVEL_DESCRIPTIONS[level_num]["solution"],
                    # )
                    hint_3_cont.code(
                        config.LEVEL_DESCRIPTIONS[level_num]["solution"],
                        language=None,
                    )
                    hint_3_cont.info("*May not always work")
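
# Scoreboard: per level, the number of prompt and guess attempts, which hints were opened,
# whether the level was solved, and (once solved) the secret itself.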
with st.expander("πŸ† Record", expanded=True):
# build table
table_data = []
for idx, name in enumerate(level_names):
table_data.append(
[
idx,
st.session_state[f"prompt_try_count_{name}"],
st.session_state[f"secret_guess_count_{name}"],
"❌" if st.session_state[f"opend_hint_{name}_0"] else "-",
"❌" if st.session_state[f"opend_hint_{name}_1"] else "-",
"❌" if st.session_state[f"opend_hint_{name}_2"] else "-",
"βœ…" if st.session_state[f"solved_{name}"] else "❌",
secrets[idx] if st.session_state[f"solved_{name}"] else "...",
]
)
    # show as pandas dataframe
    st.table(
        pd.DataFrame(
            table_data,
            columns=[
                "Level",
                "Prompt tries",
                "Secret guesses",
                "Used hint 1",
                "Used hint 2",
                "Used hint 3",
                "Solved",
                "Secret",
            ],
            index=level_emojis,
        )
    )
# TODOS:
# - add more levels
# - use Gemini-Pro-Flash for supervisor LLM
# - show the actual workflow of the safeguard (what gets executed)
# - story telling --> new field, hard to be 100 percent safe
# - use LLM judge to look for secret in model output
# - show which safe guards were used in 'Record' table
# - funny: always return "I am sorry I cannot do that."
# switch to azure deployment