Spaces:

schroneko
/

meta-llama-Llama-Guard-3-8B-INT8

Running

File size: 2,556 Bytes

46358a2
e2fac8d
 
6293678
29e0785
6293678
46358a2
 
 
 
e2fac8d
 
 
 
 
 
0a17bfe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142b81d
 
e2fac8d
 
 
 
 
ca0aa0f
 
 
 
0a17bfe
ca0aa0f
 
 
0a17bfe
 
 
 
e2fac8d
 
83fe2ae
e2fac8d
 
 
 
ca0aa0f
 
 
 
 
e2fac8d
 
 
29e0785
e2fac8d

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import gradio as gr
import spaces

# The access token is read from the environment and handed to both
# from_pretrained calls below; startup is aborted when it is missing or empty.
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
if not huggingface_token:
    raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

model_id = "meta-llama/Llama-Guard-3-8B-INT8"
dtype = torch.bfloat16

# Device that prompt tensors are moved to before generation.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Ask for the weights to be loaded in 8-bit through bitsandbytes.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=huggingface_token,
    quantization_config=quantization_config,
    torch_dtype=dtype,
    device_map="auto",
    low_cpu_mem_usage=True,
)

def parse_llama_guard_output(result):
    """Turn raw Llama Guard text into a (status, categories, raw) triple.

    The text is split on newlines and blank lines are ignored. The first
    line that equals ``safe`` or ``unsafe`` (case-insensitively, after
    stripping whitespace) decides the verdict.

    Args:
        result: Decoded text generated by the model.

    Returns:
        A 3-tuple of strings ``(status, categories, result)``:

        * ``("Safe", "None", result)`` when the verdict is ``safe``.
        * ``("Unsafe", <next line>, result)`` when the verdict is
          ``unsafe``; the non-blank line following the verdict is returned
          with its original casing, or ``"Unspecified"`` if there is none.
        * ``("Error", "No valid output", result)`` when there are no
          non-blank lines.
        * ``("Error", "Invalid output: <first line>", result)`` when no
          verdict line is present.
    """
    # Keep the original casing: only the verdict comparison is
    # case-insensitive, so category codes are reported as the model wrote them.
    lines = [line.strip() for line in result.split('\n') if line.strip()]

    if not lines:
        return "Error", "No valid output", result

    for index, line in enumerate(lines):
        verdict = line.lower()
        if verdict == 'safe':
            return "Safe", "None", result
        if verdict == 'unsafe':
            # Slicing avoids an IndexError when 'unsafe' is the last line.
            following = lines[index + 1:index + 2]
            violated_categories = following[0] if following else "Unspecified"
            return "Unsafe", violated_categories, result

    # No verdict line at all: show what was actually received instead of a
    # placeholder, so the message is useful on its own.
    return "Error", f"Invalid output: {lines[0]}", result

@spaces.GPU
def moderate(user_input, assistant_response):
    """Classify one user/assistant exchange with the loaded guard model.

    The two messages are rendered through the tokenizer's chat template,
    up to 100 new tokens are generated, and only the newly generated part
    is decoded and parsed.

    Args:
        user_input: Text of the user turn.
        assistant_response: Text of the assistant turn.

    Returns:
        Whatever ``parse_llama_guard_output`` returns for the decoded
        completion.
    """
    conversation = [
        dict(role="user", content=user_input),
        dict(role="assistant", content=assistant_response),
    ]
    prompt_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    prompt_ids = prompt_ids.to(device)

    # Gradients are not needed for generation.
    with torch.no_grad():
        generated = model.generate(
            input_ids=prompt_ids,
            max_new_tokens=100,
            # The tokenizer's EOS id is used as the padding id.
            pad_token_id=tokenizer.eos_token_id,
        )

    # The output sequence starts with the prompt; drop it before decoding.
    completion_ids = generated[0][prompt_ids.shape[-1]:]
    completion_text = tokenizer.decode(completion_ids, skip_special_tokens=True)

    return parse_llama_guard_output(completion_text)

# Web form around `moderate`: two three-line text inputs and three text
# outputs, one per element of the tuple that `moderate` returns.
iface = gr.Interface(
    title="Llama Guard Moderation",
    description="Enter a user input and an assistant response to check for content moderation.",
    fn=moderate,
    inputs=[
        gr.Textbox(lines=3, label=caption)
        for caption in ("User Input", "Assistant Response")
    ],
    outputs=[
        gr.Textbox(label=caption)
        for caption in ("Safety Status", "Violated Categories", "Raw Output")
    ],
)

# Only start the server when the file is executed directly.
if __name__ == "__main__":
    iface.launch()