YAML Metadata
Warning:
empty or missing yaml metadata in repo card
(https://huggingface.co/docs/hub/model-cards#model-card-metadata)
import pandas as pd
def preprocess_data(data):
    """Drop likely-identifying (high-cardinality) columns from *data*.

    A column is treated as sensitive when the ratio of distinct values to
    total rows exceeds 0.5 (e.g. IDs, names, free text), since such
    columns act as quasi-identifiers for individual rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table.

    Returns
    -------
    pandas.DataFrame
        *data* with the high-uniqueness columns removed.
    """
    n_rows = len(data.index)
    # Empty table: no rows to judge uniqueness on; also avoids the
    # ZeroDivisionError the original raised when n_rows == 0.
    if n_rows == 0:
        return data
    # Positional access (iloc) is robust to duplicate column labels.
    # nunique(dropna=False) matches len(unique()), which counts NaN.
    sensitive_cols = [
        col
        for i, col in enumerate(data.columns)
        if data.iloc[:, i].nunique(dropna=False) / n_rows > 0.5
    ]
    return data.drop(columns=sensitive_cols)
import pandas as pd
import streamlit as st
import torch
import transformers

from preprocess import preprocess_data
def anonymize_text(text):
    """Suggest replacement tokens for masked spans in *text*.

    Runs DistilBERT's masked-language-model head over *text* and returns
    the top-5 candidate tokens for the first masked position, which the
    caller can use as anonymized substitutes.

    Parameters
    ----------
    text : str
        Input text. Must contain the tokenizer's mask token
        (``[MASK]`` for DistilBERT) at the span(s) to anonymize.

    Returns
    -------
    list[str]
        Up to five decoded candidate tokens; empty when *text* contains
        no mask token (the original code raised IndexError in that case).
    """
    import torch  # local import: torch was never imported at file level

    model_name = "distilbert-base-uncased"
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModelForMaskedLM.from_pretrained(model_name)

    input_ids = tokenizer.encode(text, return_tensors="pt")
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
    # Robustness: no [MASK] present -> nothing to predict.
    if mask_token_index.numel() == 0:
        return []
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        token_logits = model(input_ids)[0]
    mask_token_logits = token_logits[0, mask_token_index, :]
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
    return [tokenizer.decode([token]) for token in top_5_tokens]
def run_app(): st.title("Text Anonymization App")
# File upload
st.subheader("Upload your data")
file = st.file_uploader("Upload CSV", type=["csv"])
if file is not None:
# Read the file
data = pd.read_csv(file)
# Preprocess the data
preprocessed_data = preprocess_data(data)
# Column selection
st.subheader("Select columns to anonymize")
selected_columns = []
for col in preprocessed_data.columns:
if st.checkbox(col):
selected_columns.append(col)
#