import functools

import pandas as pd
import streamlit as st
import torch
import transformers

from preprocess import preprocess_data


@functools.lru_cache(maxsize=1)
def _load_masked_lm(model_name):
    """Load and memoize the tokenizer and masked-LM model for *model_name*.

    Loading pretrained weights is expensive, so the pair is cached and
    reused by later calls instead of being rebuilt for every input text.
    """
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModelForMaskedLM.from_pretrained(model_name)
    return tokenizer, model


def anonymize_text(text):
    """Return the five most likely fill-ins for the first mask token in *text*.

    The text is encoded with the "distilbert-base-uncased" tokenizer and run
    through the matching masked-language model. The logits at the first
    mask-token position are ranked and the five highest-scoring token ids are
    decoded back to strings.

    Args:
        text: Input string; expected to contain the tokenizer's mask token.

    Returns:
        A list of five decoded candidate tokens, best first, or an empty
        list when *text* contains no mask token.
    """
    tokenizer, model = _load_masked_lm("distilbert-base-uncased")

    input_ids = tokenizer.encode(text, return_tensors="pt")
    # Column indices (sequence positions) where the mask token occurs.
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
    if mask_token_index.numel() == 0:
        # Nothing to predict; indexing the top-k result below would raise.
        return []

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        token_logits = model(input_ids)[0]
    mask_token_logits = token_logits[0, mask_token_index, :]
    # Only the first mask position is ranked.
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
    return [tokenizer.decode([token]) for token in top_5_tokens]


def run_app():
    st.title("Text Anonymization App")
    # File upload
    st.subheader("Upload your data")
    file = st.file_uploader("Upload CSV", type=["csv"])
    if file is not None:
        # Read the file
        data = pd.read_csv(file)
        # Preprocess the data
        preprocessed_data = preprocess_data(data)
        # Column selection
        st.subheader("Select columns to anonymize")
        selected_columns = []
        for col in preprocessed_data.columns:
            if st.checkbox(col):
                selected_columns.append(col)
        #