HackHPC / app.py
Seanyoon's picture
Create app.py
78a2900
raw
history blame
1.38 kB
import functools

import pandas as pd
import streamlit as st
import torch
import transformers

from preprocess import preprocess_data
@functools.lru_cache(maxsize=4)
def _load_masked_lm(model_name):
    """Load the tokenizer and masked-LM model for *model_name*, memoized per name."""
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModelForMaskedLM.from_pretrained(model_name)
    return tokenizer, model


def anonymize_text(text, model_name="distilbert-base-uncased", top_k=5):
    """Return the model's top candidate tokens for the first mask token in *text*.

    The text is encoded with the tokenizer of ``model_name``, run through the
    matching masked-language model, and the ``top_k`` highest-scoring
    vocabulary entries at the first mask position are decoded to strings.

    Args:
        text: Input string, expected to contain the tokenizer's mask token.
        model_name: Checkpoint name passed to ``from_pretrained``.
        top_k: Number of candidate tokens to return.

    Returns:
        A list of ``top_k`` decoded token strings, best first. An empty list
        is returned when ``text`` is empty/``None`` or contains no mask token.
    """
    # Nothing to predict for empty input; bail out before loading any model.
    if not text:
        return []

    # The helper is memoized, so repeated calls do not reload the weights.
    tokenizer, model = _load_masked_lm(model_name)

    input_ids = tokenizer.encode(text, return_tensors="pt")
    # torch.where on the 2-D id tensor yields (row_idx, col_idx); the column
    # indices are the positions of the mask tokens within the sequence.
    mask_positions = torch.where(input_ids == tokenizer.mask_token_id)[1]
    if mask_positions.numel() == 0:
        # No mask token present: indexing the first mask would raise.
        return []

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        token_logits = model(input_ids)[0]  # first output holds the logits

    mask_token_logits = token_logits[0, mask_positions, :]
    # Only the first mask position is considered, as in the original code.
    top_token_ids = torch.topk(mask_token_logits, top_k, dim=1).indices[0].tolist()
    return [tokenizer.decode([token_id]) for token_id in top_token_ids]
def run_app():
st.title("Text Anonymization App")
# File upload
st.subheader("Upload your data")
file = st.file_uploader("Upload CSV", type=["csv"])
if file is not None:
# Read the file
data = pd.read_csv(file)
# Preprocess the data
preprocessed_data = preprocess_data(data)
# Column selection
st.subheader("Select columns to anonymize")
selected_columns = []
for col in preprocessed_data.columns:
if st.checkbox(col):
selected_columns.append(col)
#