Alexander Watson
initial checkin
eb03925
raw
history blame
10.4 kB
import json
import pandas as pd
import streamlit as st
from datasets import load_dataset
from huggingface_hub import HfApi, login
from openai import OpenAI
# Import our utility functions
from utils.analysis import analyze_dataset_with_openai, generate_dataset_card
from utils.visualization import create_distribution_plot, create_wordcloud
# Initialize session state variables
if "openai_analysis" not in st.session_state:
st.session_state.openai_analysis = None
if "df" not in st.session_state:
st.session_state.df = None
if "dataset_name" not in st.session_state:
st.session_state.dataset_name = None
if "selected_dist_columns" not in st.session_state:
st.session_state.selected_dist_columns = []
if "selected_wordcloud_columns" not in st.session_state:
st.session_state.selected_wordcloud_columns = []
st.set_page_config(
page_title="Dataset Card Generator",
page_icon="πŸ“Š",
layout="wide",
)
def initialize_openai_client(api_key):
"""Initialize OpenAI client with API key."""
return OpenAI(api_key=api_key)
def load_and_analyze_dataset(dataset_name):
"""Load dataset and perform initial analysis."""
progress_container = st.empty()
with progress_container.container():
with st.status("Loading dataset...", expanded=True) as status:
try:
# Load dataset
status.write("πŸ“₯ Loading dataset from HuggingFace...")
dataset = load_dataset(dataset_name, split="train")
df = pd.DataFrame(dataset)
st.session_state.df = df
st.session_state.dataset_name = dataset_name
# Initialize OpenAI analysis
try:
status.write("πŸ€– Analyzing dataset ...")
client = initialize_openai_client(st.session_state.openai_key)
sample_data = dataset[:5]
print("Sample data:", json.dumps(sample_data, indent=2))
analysis = analyze_dataset_with_openai(client, sample_data)
print("Analysis result:", json.dumps(analysis, indent=2))
st.session_state.openai_analysis = analysis
except Exception as e:
print(f"Analysis error: {str(e)}")
status.update(label=f"❌ Error: {str(e)}", state="error")
status.update(
label="βœ… Dataset loaded and analyzed successfully!",
state="complete",
)
except Exception as e:
status.update(label=f"❌ Error: {str(e)}", state="error")
st.error(f"Failed to load dataset: {str(e)}")
return
def display_dataset_analysis():
"""Display dataset analysis and visualization options."""
if st.session_state.df is None:
return
st.header("Dataset Analysis")
# Dataset preview
with st.expander("πŸ“Š Dataset Preview", expanded=True):
st.dataframe(st.session_state.df.head(), use_container_width=True)
# Column selection for visualizations
st.subheader("Select Visualization Fields")
col1, col2 = st.columns(2)
with col1:
# Distribution plot selection
st.session_state.selected_dist_columns = st.multiselect(
"Distribution Plots (max 2)",
options=st.session_state.df.columns.tolist(),
format_func=lambda x: get_column_type_description(st.session_state.df, x),
max_selections=2,
help="Select columns to show value distributions. List columns will show frequency of individual items.",
)
with col2:
# Word cloud selection
text_columns = [
col
for col in st.session_state.df.columns
if st.session_state.df[col].dtype == "object"
or isinstance(st.session_state.df[col].iloc[0], list)
]
st.session_state.selected_wordcloud_columns = st.multiselect(
"Word Clouds (max 2)",
options=text_columns,
format_func=lambda x: get_column_type_description(st.session_state.df, x),
max_selections=2,
help="Select text columns to generate word clouds",
)
# Add some spacing
st.markdown("---")
# Generate card button
if st.button("Generate Dataset Card", type="primary", use_container_width=True):
if not (
st.session_state.selected_dist_columns
or st.session_state.selected_wordcloud_columns
):
st.warning(
"Please select at least one visualization before generating the card."
)
return
generate_and_display_card()
def generate_and_display_card():
"""Generate and display the dataset card with visualizations."""
if not st.session_state.openai_analysis:
st.error(
"Dataset analysis not available. Please try loading the dataset again."
)
return
with st.status("Generating dataset card...", expanded=True) as status:
try:
# Create visualizations
status.write("πŸ“Š Creating distribution plots...")
distribution_plots = {}
for col in st.session_state.selected_dist_columns:
print(f"Generating distribution plot for {col}")
img_base64 = create_distribution_plot(st.session_state.df, col)
distribution_plots[col] = img_base64
print(f"Successfully created plot for {col}")
status.write("πŸ”€ Generating word clouds...")
wordcloud_plots = {}
for col in st.session_state.selected_wordcloud_columns:
print(f"Generating word cloud for {col}")
img_base64 = create_wordcloud(st.session_state.df, col)
wordcloud_plots[col] = img_base64
print(f"Successfully created word cloud for {col}")
# Generate dataset card content
status.write("πŸ“ Composing dataset card...")
dataset_info = {"dataset_name": st.session_state.dataset_name}
readme_content = generate_dataset_card(
dataset_info=dataset_info,
distribution_plots=distribution_plots,
wordcloud_plots=wordcloud_plots,
openai_analysis=st.session_state.openai_analysis,
df=st.session_state.df, # Added DataFrame parameter
)
# Display results
status.update(label="βœ… Dataset card generated!", state="complete")
# Display the markdown with images
st.markdown(readme_content, unsafe_allow_html=True)
# Add download button
st.download_button(
label="⬇️ Download Dataset Card",
data=readme_content,
file_name="README.md",
mime="text/markdown",
use_container_width=True,
)
except Exception as e:
print(f"Error in generate_and_display_card: {str(e)}")
st.error(f"Error generating dataset card: {str(e)}")
raise e
def get_column_type_description(data, column):
"""Get a user-friendly description of the column type."""
try:
if isinstance(data[column].iloc[0], list):
return f"{column} (list)"
elif data[column].dtype in ["int64", "float64"]:
return f"{column} (numeric)"
else:
return f"{column} (text/categorical)"
except:
return f"{column} (unknown)"
def get_api_keys():
"""Get API keys from secrets or user input."""
# Try to get from secrets first
try:
hf_token = st.secrets["api_keys"]["huggingface"]
openai_key = st.secrets["api_keys"]["openai"]
return hf_token, openai_key
except:
return None, None
def get_secrets():
"""Get API keys from secrets.toml if it exists."""
try:
hf_token = st.secrets.get("api_keys", {}).get("huggingface", "")
openai_key = st.secrets.get("api_keys", {}).get("openai", "")
return hf_token, openai_key
except Exception as e:
print(f"No secrets file found or error reading secrets: {e}")
return "", ""
def main():
st.title("πŸ“Š Dataset Card Generator")
st.markdown(
"""
Generate beautiful documentation for your HuggingFace datasets with automated analysis,
visualizations, and formatted dataset cards.
"""
)
# Get secrets if available
default_hf_token, default_openai_key = get_api_keys()
# Authentication section in sidebar
with st.sidebar:
st.header("πŸ”‘ Authentication")
# OpenAI API key (required)
openai_key = st.text_input(
"OpenAI API Key",
value=default_openai_key,
type="password" if not default_openai_key else "default",
help="Required: Your OpenAI API key for dataset analysis",
)
# HuggingFace token (optional)
hf_token = st.text_input(
"HuggingFace Token (optional)",
value=default_hf_token,
type="password" if not default_hf_token else "default",
help="Optional: Only required for private datasets",
)
if openai_key:
try:
# Only attempt HF login if token is provided
if hf_token:
login(hf_token)
st.success("βœ… HuggingFace authentication successful!")
st.session_state.openai_key = openai_key
st.success("βœ… OpenAI API key set!")
except Exception as e:
st.error(f"❌ Authentication error: {str(e)}")
return
else:
st.info("πŸ‘† Please enter your OpenAI API key to get started.")
return
# Main content area
if not openai_key:
return
dataset_name = st.text_input(
"Enter HuggingFace Dataset Name",
placeholder="username/dataset",
help="Enter the full path to your HuggingFace dataset (e.g., 'username/dataset')",
)
if dataset_name:
if st.button("Load Dataset", type="primary"):
load_and_analyze_dataset(dataset_name)
if st.session_state.df is not None:
display_dataset_analysis()
if __name__ == "__main__":
main()