import nltk import seaborn as sns import numpy as np import pandas as pd import streamlit as st import matplotlib.pyplot as plt import plotly.express as px import plotly.graph_objects as go import scipy.stats as stats from sklearn.decomposition import PCA from wordcloud import WordCloud from sklearn.metrics import confusion_matrix from nltk import regexp_tokenize # Single attribute visualization def distribution_histogram(df, attribute): """ Histogram of the distribution of a single attribute. """ if df[attribute].dtype == 'object' or pd.api.types.is_categorical_dtype(df[attribute]): codes, uniques = pd.factorize(df[attribute]) temp_df = pd.DataFrame({attribute: codes}) fig, ax = plt.subplots(figsize=(8, 6)) sns.histplot(temp_df[attribute], ax=ax, discrete=True, color='#e17160') ax.set_xticks(range(len(uniques))) ax.set_xticklabels(uniques, rotation=45, ha='right') else: fig, ax = plt.subplots(figsize=(6, 4)) sns.histplot(df[attribute], ax=ax, color='#e17160') ax.set_title(f"Distribution of {attribute}") return fig def distribution_boxplot(df, attribute): """ Boxplot of the distribution of a single attribute. """ if df[attribute].dtype == 'object' or pd.api.types.is_categorical_dtype(df[attribute]): return -1 fig, ax = plt.subplots(figsize=(8, 6)) sns.boxenplot(data=df[attribute], palette=["#32936f", "#26a96c", "#2bc016"]) ax.set_title(f"Boxplot of {attribute}") return fig def count_Y(df, Y_name): """ Donut chart of the distribution of a single attribute. """ if Y_name in df.columns and df[Y_name].nunique() >= 1: value_counts = df[Y_name].value_counts() fig = px.pie(names=value_counts.index, values=value_counts.values, title=f'Distribution of {Y_name}', hole=0.5, color_discrete_sequence=px.colors.sequential.Cividis_r) return fig def density_plot(df, column_name): """ Density plot of the distribution of a single attribute. """ if column_name in df.columns: fig = px.density_contour(df, x=column_name, y=column_name, title=f'Density Plot of {column_name}', color_discrete_sequence=px.colors.sequential.Inferno) return fig # Mutiple attribute visualization def box_plot(df, column_names): """ Box plot of multiple attributes. """ if len(column_names) > 1 and not all(df[column_names].dtypes.apply(lambda x: np.issubdtype(x, np.number))): return -1 valid_columns = [col for col in column_names if col in df.columns] if valid_columns: fig = px.box(df, y=valid_columns, title=f'Box Plot of {", ".join(valid_columns)}', color_discrete_sequence=px.colors.sequential.Cividis_r) return fig def violin_plot(df, column_names): """ Violin plot of multiple attributes. """ if len(column_names) > 1 and not all(df[column_names].dtypes.apply(lambda x: np.issubdtype(x, np.number))): return -1 valid_columns = [col for col in column_names if col in df.columns] if valid_columns: fig = px.violin(df, y=valid_columns, title=f'Violin Plot of {", ".join(valid_columns)}', color_discrete_sequence=px.colors.sequential.Cividis_r) return fig def strip_plot(df, column_names): """ Strip plot of multiple attributes. """ if len(column_names) > 1 and not all(df[column_names].dtypes.apply(lambda x: np.issubdtype(x, np.number))): return -1 valid_columns = [col for col in column_names if col in df.columns] if valid_columns: fig = px.strip(df, y=valid_columns, title=f'Strip Plot of {", ".join(valid_columns)}', color_discrete_sequence=px.colors.sequential.Cividis_r) return fig def multi_plot_scatter(df, selected_attributes): """ Scatter plot of multiple attributes. """ if len(selected_attributes) < 2: return -1 plt.figure(figsize=(10, 6)) if df[selected_attributes[0]].dtype not in [np.float64, np.int64]: x, x_labels = pd.factorize(df[selected_attributes[0]]) plt.xticks(ticks=np.arange(len(x_labels)), labels=x_labels, rotation=45) else: x = df[selected_attributes[0]] if df[selected_attributes[1]].dtype not in [np.float64, np.int64]: y, y_labels = pd.factorize(df[selected_attributes[1]]) plt.yticks(ticks=np.arange(len(y_labels)), labels=y_labels) else: y = df[selected_attributes[1]] plt.scatter(x, y, c=np.linspace(0, 1, len(df)), cmap='viridis') plt.colorbar() plt.xlabel(selected_attributes[0]) plt.ylabel(selected_attributes[1]) plt.title(f'Scatter Plot of {selected_attributes[0]} vs {selected_attributes[1]}') return plt.gcf() def multi_plot_line(df, selected_attributes): """ Line plot of multiple attributes. """ if not all(df[selected_attributes].dtypes.apply(lambda x: np.issubdtype(x, np.number))): return -1 if len(selected_attributes) >= 2: plt.figure(figsize=(10, 6)) colors = plt.cm.viridis(np.linspace(0, 1, len(selected_attributes))) for i, attribute in enumerate(selected_attributes): plt.plot(df.index, df[attribute], marker='', linewidth=2, color=colors[i], label=attribute) plt.legend() plt.xlabel(selected_attributes[0]) plt.ylabel(selected_attributes[1]) plt.title(f'Line Plot of {selected_attributes[0]} vs {selected_attributes[1]}') return plt.gcf() else: return -2 def multi_plot_heatmap(df, selected_attributes): """ Correlation heatmap of multiple attributes. """ if not all(df[selected_attributes].dtypes.apply(lambda x: np.issubdtype(x, np.number))): return -1 if len(selected_attributes) >= 1: sns.set_theme() plt.figure(figsize=(10, 8)) sns.heatmap(df[selected_attributes].corr(), annot=True, cmap='viridis') plt.title('Heatmap of Correlation') return plt.gcf() # Overall visualization @st.cache_data def correlation_matrix(df): """ Correlation heatmap of all attributes using Seaborn. """ plt.figure(figsize=(16, 12)) sns.set(font_scale=0.9) sns.heatmap(df.corr(), annot=True, cmap='viridis', annot_kws={"size": 12}) return plt.gcf() @st.cache_data def correlation_matrix_plotly(df): """ Correlation heatmap of all attributes using Plotly. """ corr_matrix = df.corr() labels = corr_matrix.columns text = [[f'{corr_matrix.iloc[i, j]:.2f}' for j in range(len(labels))] for i in range(len(labels))] fig = go.Figure(data=go.Heatmap( z=corr_matrix.values, x=labels, y=labels, colorscale='Viridis', colorbar=dict(title='Correlation'), text=text, hoverinfo='text', )) fig.update_layout( title='Correlation Matrix Between Attributes', xaxis=dict(tickmode='linear'), yaxis=dict(tickmode='linear'), width=800, height=700, ) fig.update_layout(font=dict(size=10)) return fig @st.cache_data def list_all(df, max_plots=16): """ Display histograms of all attributes in the DataFrame. """ # Calculate the number of plots to display (up to 16) num_plots = min(len(df.columns), max_plots) nrows = int(np.ceil(num_plots / 4)) ncols = min(num_plots, 4) fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 4 * nrows)) fig.suptitle('Attribute Distributions', fontsize=20) plt.style.use('ggplot') sns.set(style="darkgrid") # if only one plot, convert to list if num_plots == 1: axes = [axes] # Flatten the axes array axes = axes.flatten() # Display the histograms for i, column in enumerate(df.columns[:num_plots]): sns.histplot(ax=axes[i], data=df, x=column, color='#1867ac') # Hide additional subplots for ax in axes[num_plots:]: ax.axis('off') plt.tight_layout() plt.subplots_adjust(top=0.95) # Adjust the top to accommodate the title return fig # Model evaluation def confusion_metrix(model_name, model, X_test, Y_test): """ Confusion matrix plot for classification models """ Y_pred = model.predict(X_test) matrix = confusion_matrix(Y_test, Y_pred) plt.figure(figsize=(10, 7)) # temporary sns_heatmap = sns.heatmap(matrix, annot=True, cmap='Blues', fmt='g', annot_kws={"size": 20}) plt.title(f"Confusion Matrix for {model_name}", fontsize=20) plt.xlabel('Predicted labels', fontsize=16) plt.ylabel('True labels', fontsize=16) return sns_heatmap.figure def roc(model_name, fpr, tpr): """ ROC curve for classification models """ fig = plt.figure() plt.style.use('ggplot') plt.plot([0,1],[0,1],'k--') plt.plot(fpr, tpr, label=model_name) plt.xlabel('False Positive rate') plt.ylabel('True Positive rate') plt.title(f'ROC Curve - {model_name}') plt.legend(loc='best') plt.xticks(rotation=45) return fig def plot_clusters(X, labels): """ Scatter plot of clusters for clustering models """ sns.set(style="whitegrid") pca = PCA(n_components=2) X_pca = pca.fit_transform(X) unique_labels = set(labels) colors = plt.cm.viridis(np.linspace(0, 1, len(unique_labels))) fig, ax = plt.subplots() for color, label in zip(colors, unique_labels): idx = labels == label ax.scatter(X_pca[idx, 0], X_pca[idx, 1], color=color, label=f'Cluster {label}', s=50) ax.set_title('Cluster Scatter Plot') ax.legend() return fig def plot_residuals(y_pred, Y_test): """ Residual plot for regression models """ residuals = Y_test - y_pred fig, ax = plt.subplots() sns.residplot(x=y_pred, y=residuals, lowess=True, ax=ax, scatter_kws={'alpha': 0.7}, line_kws={'color': 'purple', 'lw': 2}) ax.set_xlabel('Predicted Values') ax.set_ylabel('Residuals') ax.set_title('Residual Plot') return fig def plot_predictions_vs_actual(y_pred, Y_test): """ Scatter plot of predicted vs. actual values for regression models """ fig, ax = plt.subplots() ax.scatter(Y_test, y_pred, c='#10a37f', marker='x') ax.plot([Y_test.min(), Y_test.max()], [Y_test.min(), Y_test.max()], 'k--', lw=2) ax.set_xlabel('Actual') ax.set_ylabel('Predicted') ax.set_title('Actual vs. Predicted') ax.set_facecolor('white') ax.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray') ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) return fig def plot_qq_plot(y_pred, Y_test): """ Quantile-Quantile plot for regression models """ residuals = Y_test - y_pred fig, ax = plt.subplots() (osm, osr), (slope, intercept, r) = stats.probplot(residuals, dist="norm", plot=None) line = slope * osm + intercept ax.plot(osm, line, 'grey', lw=2) ax.scatter(osm, osr, alpha=0.8, edgecolors='#e8b517', c='yellow', label='Data Points') ax.set_title('Quantile-Quantile Plot') ax.set_facecolor('white') ax.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray') ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.set_xlabel('Theoretical Quantiles') ax.set_ylabel('Ordered Values') return fig # Advanced Visualization @st.cache_data def word_cloud_plot(text): """ Generates and displays a word cloud from the given text. The word cloud visualizes the frequency of occurrence of words in the text, with the size of each word indicating its frequency. :param text: The input text from which to generate the word cloud. :return: A matplotlib figure object containing the word cloud if successful, -1 otherwise. """ try: words = regexp_tokenize(text, pattern='\w+') text_dist = nltk.FreqDist([w for w in words]) wordcloud = WordCloud(width=1200, height=600, background_color ='white').generate_from_frequencies(text_dist) fig, ax = plt.subplots(figsize=(10, 7.5)) ax.imshow(wordcloud, interpolation='bilinear') ax.axis('off') return fig except: return -1 @st.cache_data def world_map(df, country_column, key_attribute): """ Creates a choropleth world map visualization based on the specified DataFrame. The function highlights countries based on a key attribute, providing an interactive map that can be used to analyze geographical data distributions. :param df: DataFrame containing the data to be visualized. :param country_column: Name of the column in df that contains country names. :param key_attribute: Name of the column in df that contains the data to visualize on the map. :return: A Plotly figure object representing the choropleth map if successful, -1 otherwise. """ try: hover_data_columns = [col for col in df.columns if col != country_column] fig = px.choropleth(df, locations="iso_alpha", color=key_attribute, hover_name=country_column, hover_data=hover_data_columns, color_continuous_scale=px.colors.sequential.Cividis, projection="equirectangular",) return fig except: return -1 @st.cache_data def scatter_3d(df, x, y, z): """ Generates a 3D scatter plot from the given DataFrame. Each point in the plot corresponds to a row in the DataFrame, with its position determined by three specified columns. Points are colored based on the values of the z-axis. :param df: DataFrame containing the data to be visualized. :param x: Name of the column in df to use for the x-axis values. :param y: Name of the column in df to use for the y-axis values. :param z: Name of the column in df to use for the z-axis values and color coding. :return: A Plotly figure object containing the 3D scatter plot if successful, -1 otherwise. """ try: return px.scatter_3d(df, x=x, y=y, z=z, color=z, color_continuous_scale=px.colors.sequential.Viridis) except: return -1