Spaces:
Sleeping
Sleeping
import nltk | |
import seaborn as sns | |
import numpy as np | |
import pandas as pd | |
import streamlit as st | |
import matplotlib.pyplot as plt | |
import plotly.express as px | |
import plotly.graph_objects as go | |
import scipy.stats as stats | |
from sklearn.decomposition import PCA | |
from wordcloud import WordCloud | |
from sklearn.metrics import confusion_matrix | |
from nltk import regexp_tokenize | |
# Single attribute visualization | |
def distribution_histogram(df, attribute):
    """
    Histogram of the distribution of a single attribute.

    Text-like and categorical columns are factorized into integer codes so
    that every category gets its own discrete bar, with the category names
    used as tick labels. Any other column is handed to seaborn unchanged.

    :param df: DataFrame holding the data.
    :param attribute: Name of the column to plot.
    :return: The matplotlib figure containing the histogram.
    """
    series = df[attribute]
    # isinstance() replaces the deprecated pd.api.types.is_categorical_dtype;
    # is_string_dtype additionally covers pandas' dedicated string dtype.
    is_categorical = (
        pd.api.types.is_object_dtype(series)
        or pd.api.types.is_string_dtype(series)
        or isinstance(series.dtype, pd.CategoricalDtype)
    )
    if is_categorical:
        codes, uniques = pd.factorize(series)
        # factorize() encodes missing values as -1; drop them so they do not
        # appear as an unlabeled bar to the left of the first category.
        codes = codes[codes >= 0]
        temp_df = pd.DataFrame({attribute: codes})
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.histplot(temp_df[attribute], ax=ax, discrete=True, color='#e17160')
        ax.set_xticks(range(len(uniques)))
        ax.set_xticklabels(uniques, rotation=45, ha='right')
    else:
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.histplot(series, ax=ax, color='#e17160')
    ax.set_title(f"Distribution of {attribute}")
    return fig
def distribution_boxplot(df, attribute):
    """
    Boxplot of the distribution of a single attribute.

    :param df: DataFrame holding the data.
    :param attribute: Name of the column to plot.
    :return: The matplotlib figure, or -1 when the column is text-like or
        categorical and therefore cannot be drawn as a box plot.
    """
    series = df[attribute]
    # isinstance() replaces the deprecated pd.api.types.is_categorical_dtype;
    # is_string_dtype additionally covers pandas' dedicated string dtype.
    if (
        pd.api.types.is_object_dtype(series)
        or pd.api.types.is_string_dtype(series)
        or isinstance(series.dtype, pd.CategoricalDtype)
    ):
        return -1
    fig, ax = plt.subplots(figsize=(8, 6))
    # Pass the axes explicitly instead of relying on pyplot's "current axes".
    sns.boxenplot(data=series, palette=["#32936f", "#26a96c", "#2bc016"], ax=ax)
    ax.set_title(f"Boxplot of {attribute}")
    return fig
def count_Y(df, Y_name):
    """
    Donut chart showing how often each value of one column occurs.

    :param df: DataFrame holding the data.
    :param Y_name: Name of the column whose value counts are charted.
    :return: A Plotly pie figure with a hole, or None when the column is
        absent or contains no non-null value.
    """
    if Y_name not in df.columns or df[Y_name].nunique() < 1:
        return None
    counts = df[Y_name].value_counts()
    return px.pie(
        names=counts.index,
        values=counts.values,
        title=f'Distribution of {Y_name}',
        hole=0.5,
        color_discrete_sequence=px.colors.sequential.Cividis_r,
    )
def density_plot(df, column_name):
    """
    Density contour plot for one column.

    The same column is supplied for both the x and the y axis.

    :param df: DataFrame holding the data.
    :param column_name: Name of the column to plot.
    :return: A Plotly figure, or None when the column is not in df.
    """
    if column_name not in df.columns:
        return None
    return px.density_contour(
        df,
        x=column_name,
        y=column_name,
        title=f'Density Plot of {column_name}',
        color_discrete_sequence=px.colors.sequential.Inferno,
    )
# Multiple attribute visualization
def box_plot(df, column_names):
    """
    Box plot of multiple attributes.

    :param df: DataFrame holding the data.
    :param column_names: Names of the columns to plot; names that are not
        columns of df are ignored.
    :return: A Plotly figure; -1 when more than one name is requested and any
        of the existing columns is non-numeric; None when none of the
        requested columns exists.
    """
    # Filter first so that an unknown name is skipped instead of raising
    # KeyError inside the numeric check.
    valid_columns = [col for col in column_names if col in df.columns]

    def _is_number(dtype):
        # pandas' helper also copes with extension/categorical dtypes, on
        # which np.issubdtype raises TypeError. Booleans stay non-numeric.
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    if len(column_names) > 1 and not all(_is_number(dtype) for dtype in df[valid_columns].dtypes):
        return -1
    if not valid_columns:
        return None
    return px.box(
        df,
        y=valid_columns,
        title=f'Box Plot of {", ".join(valid_columns)}',
        color_discrete_sequence=px.colors.sequential.Cividis_r,
    )
def violin_plot(df, column_names):
    """
    Violin plot of multiple attributes.

    :param df: DataFrame holding the data.
    :param column_names: Names of the columns to plot; names that are not
        columns of df are ignored.
    :return: A Plotly figure; -1 when more than one name is requested and any
        of the existing columns is non-numeric; None when none of the
        requested columns exists.
    """
    # Filter first so that an unknown name is skipped instead of raising
    # KeyError inside the numeric check.
    valid_columns = [col for col in column_names if col in df.columns]

    def _is_number(dtype):
        # pandas' helper also copes with extension/categorical dtypes, on
        # which np.issubdtype raises TypeError. Booleans stay non-numeric.
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    if len(column_names) > 1 and not all(_is_number(dtype) for dtype in df[valid_columns].dtypes):
        return -1
    if not valid_columns:
        return None
    return px.violin(
        df,
        y=valid_columns,
        title=f'Violin Plot of {", ".join(valid_columns)}',
        color_discrete_sequence=px.colors.sequential.Cividis_r,
    )
def strip_plot(df, column_names):
    """
    Strip plot of multiple attributes.

    :param df: DataFrame holding the data.
    :param column_names: Names of the columns to plot; names that are not
        columns of df are ignored.
    :return: A Plotly figure; -1 when more than one name is requested and any
        of the existing columns is non-numeric; None when none of the
        requested columns exists.
    """
    # Filter first so that an unknown name is skipped instead of raising
    # KeyError inside the numeric check.
    valid_columns = [col for col in column_names if col in df.columns]

    def _is_number(dtype):
        # pandas' helper also copes with extension/categorical dtypes, on
        # which np.issubdtype raises TypeError. Booleans stay non-numeric.
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    if len(column_names) > 1 and not all(_is_number(dtype) for dtype in df[valid_columns].dtypes):
        return -1
    if not valid_columns:
        return None
    return px.strip(
        df,
        y=valid_columns,
        title=f'Strip Plot of {", ".join(valid_columns)}',
        color_discrete_sequence=px.colors.sequential.Cividis_r,
    )
def multi_plot_scatter(df, selected_attributes):
    """
    Scatter plot of the first two selected attributes.

    Non-numeric columns are factorized into integer codes and the original
    values are shown as tick labels. Points are coloured by row position.

    :param df: DataFrame holding the data.
    :param selected_attributes: Column names; the first is drawn on the x
        axis, the second on the y axis, any further names are ignored.
    :return: The matplotlib figure, or -1 when fewer than two attributes are
        given.
    """
    if len(selected_attributes) < 2:
        return -1
    x_name, y_name = selected_attributes[0], selected_attributes[1]

    def _axis_values(series):
        # Any numeric dtype (not only float64/int64) is plotted as-is;
        # everything else, booleans included, is encoded as category codes.
        if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
            return series, None
        codes, labels = pd.factorize(series)
        return codes, labels

    fig, ax = plt.subplots(figsize=(10, 6))

    x, x_labels = _axis_values(df[x_name])
    if x_labels is not None:
        ax.set_xticks(np.arange(len(x_labels)))
        ax.set_xticklabels(x_labels, rotation=45)

    y, y_labels = _axis_values(df[y_name])
    if y_labels is not None:
        ax.set_yticks(np.arange(len(y_labels)))
        ax.set_yticklabels(y_labels)

    points = ax.scatter(x, y, c=np.linspace(0, 1, len(df)), cmap='viridis')
    fig.colorbar(points, ax=ax)
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)
    ax.set_title(f'Scatter Plot of {x_name} vs {y_name}')
    return fig
def multi_plot_line(df, selected_attributes):
    """
    Line plot of multiple attributes.

    Every selected column is drawn as its own line against the DataFrame
    index.

    :param df: DataFrame holding the data.
    :param selected_attributes: Names of the columns to plot.
    :return: The matplotlib figure; -1 when any selected column is
        non-numeric; -2 when fewer than two attributes are given.
    """
    def _is_number(dtype):
        # pandas' helper also copes with extension/categorical dtypes, on
        # which np.issubdtype raises TypeError. Booleans stay non-numeric.
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    if not all(_is_number(dtype) for dtype in df[selected_attributes].dtypes):
        return -1
    if len(selected_attributes) < 2:
        return -2

    fig, ax = plt.subplots(figsize=(10, 6))
    colors = plt.cm.viridis(np.linspace(0, 1, len(selected_attributes)))
    for color, attribute in zip(colors, selected_attributes):
        ax.plot(df.index, df[attribute], marker='', linewidth=2, color=color, label=attribute)
    ax.legend()
    # The x axis is the DataFrame index and the y axis carries every selected
    # attribute, so the labels describe that instead of naming columns 0 and 1.
    ax.set_xlabel('Index')
    ax.set_ylabel('Value')
    ax.set_title(f'Line Plot of {", ".join(str(name) for name in selected_attributes)}')
    return fig
def multi_plot_heatmap(df, selected_attributes):
    """
    Correlation heatmap of multiple attributes.

    :param df: DataFrame holding the data.
    :param selected_attributes: Names of the columns to correlate.
    :return: The matplotlib figure; -1 when any selected column is
        non-numeric; None when no attribute is given.
    """
    def _is_number(dtype):
        # pandas' helper also copes with extension/categorical dtypes, on
        # which np.issubdtype raises TypeError. Booleans stay non-numeric.
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    if not all(_is_number(dtype) for dtype in df[selected_attributes].dtypes):
        return -1
    if len(selected_attributes) < 1:
        return None

    sns.set_theme()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(df[selected_attributes].corr(), annot=True, cmap='viridis', ax=ax)
    ax.set_title('Heatmap of Correlation')
    return fig
# Overall visualization | |
def correlation_matrix(df):
    """
    Correlation heatmap of all numeric attributes using Seaborn.

    :param df: DataFrame holding the data.
    :return: The matplotlib figure containing the annotated heatmap.
    """
    # Restrict to numeric columns: since pandas 2.0 DataFrame.corr() raises
    # on non-numeric columns instead of silently dropping them.
    corr = df.select_dtypes(include='number').corr()
    fig, ax = plt.subplots(figsize=(16, 12))
    sns.set(font_scale=0.9)
    sns.heatmap(corr, annot=True, cmap='viridis', annot_kws={"size": 12}, ax=ax)
    return fig
def correlation_matrix_plotly(df):
    """
    Correlation heatmap of all numeric attributes using Plotly.

    :param df: DataFrame holding the data.
    :return: A Plotly figure whose hover text shows each coefficient rounded
        to two decimals.
    """
    # Restrict to numeric columns: since pandas 2.0 DataFrame.corr() raises
    # on non-numeric columns instead of silently dropping them.
    corr_matrix = df.select_dtypes(include='number').corr()
    labels = corr_matrix.columns
    text = [[f'{value:.2f}' for value in row] for row in corr_matrix.values]
    fig = go.Figure(data=go.Heatmap(
        z=corr_matrix.values,
        x=labels,
        y=labels,
        colorscale='Viridis',
        colorbar=dict(title='Correlation'),
        text=text,
        hoverinfo='text',
    ))
    fig.update_layout(
        title='Correlation Matrix Between Attributes',
        xaxis=dict(tickmode='linear'),
        yaxis=dict(tickmode='linear'),
        width=800,
        height=700,
    )
    fig.update_layout(font=dict(size=10))
    return fig
def list_all(df, max_plots=16):
    """
    Display histograms of the attributes in the DataFrame.

    The histograms are laid out in a grid that is at most four columns wide.

    :param df: DataFrame holding the data.
    :param max_plots: Upper bound on the number of columns that are plotted;
        only the first max_plots columns of df are shown.
    :return: The matplotlib figure containing the grid of histograms. A
        DataFrame without columns yields a figure with one hidden axis.
    """
    num_plots = min(len(df.columns), max_plots)
    # Keep at least a 1x1 grid so that an empty DataFrame does not make
    # plt.subplots() fail on a zero-sized layout.
    nrows = max(1, int(np.ceil(num_plots / 4)))
    ncols = max(1, min(num_plots, 4))
    # squeeze=False always returns a 2-D array of axes, so flatten() also
    # works for a single plot (a bare Axes or a list has no flatten()).
    fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 4 * nrows), squeeze=False)
    fig.suptitle('Attribute Distributions', fontsize=20)
    plt.style.use('ggplot')
    sns.set(style="darkgrid")
    axes = axes.flatten()
    # Display the histograms
    for ax, column in zip(axes, df.columns[:num_plots]):
        sns.histplot(ax=ax, data=df, x=column, color='#1867ac')
    # Hide the grid cells that did not receive a histogram
    for ax in axes[num_plots:]:
        ax.axis('off')
    plt.tight_layout()
    plt.subplots_adjust(top=0.95)  # Adjust the top to accommodate the title
    return fig
# Model evaluation | |
def confusion_metrix(model_name, model, X_test, Y_test):
    """
    Draw the confusion matrix of a classification model as a heatmap.

    :param model_name: Name shown in the plot title.
    :param model: Object whose predict(X_test) result is compared with Y_test.
    :param X_test: Input passed to model.predict.
    :param Y_test: True labels.
    :return: The matplotlib figure that holds the heatmap.
    """
    predictions = model.predict(X_test)
    counts = confusion_matrix(Y_test, predictions)
    plt.figure(figsize=(10, 7))  # temporary
    heatmap_ax = sns.heatmap(
        counts,
        annot=True,
        cmap='Blues',
        fmt='g',
        annot_kws={"size": 20},
    )
    heatmap_ax.set_title(f"Confusion Matrix for {model_name}", fontsize=20)
    heatmap_ax.set_xlabel('Predicted labels', fontsize=16)
    heatmap_ax.set_ylabel('True labels', fontsize=16)
    return heatmap_ax.get_figure()
def roc(model_name, fpr, tpr):
    """
    Draw the ROC curve of a classification model.

    :param model_name: Name used for the legend entry and the title.
    :param fpr: False positive rates (x values of the curve).
    :param tpr: True positive rates (y values of the curve).
    :return: The matplotlib figure containing the curve and the dashed
        diagonal reference line.
    """
    fig = plt.figure()
    # The style is switched after the figure exists, before its axes are made.
    plt.style.use('ggplot')
    ax = fig.gca()
    ax.plot([0, 1], [0, 1], 'k--')
    ax.plot(fpr, tpr, label=model_name)
    ax.set_xlabel('False Positive rate')
    ax.set_ylabel('True Positive rate')
    ax.set_title(f'ROC Curve - {model_name}')
    ax.legend(loc='best')
    plt.xticks(rotation=45)
    return fig
def plot_clusters(X, labels):
    """
    Scatter plot of clusters for clustering models.

    X is projected onto its first two principal components and every cluster
    label is drawn in its own colour.

    :param X: Feature matrix passed to PCA.fit_transform.
    :param labels: Cluster label of every row of X (array, Series or list).
    :return: The matplotlib figure containing the scatter plot.
    """
    sns.set(style="whitegrid")
    X_pca = PCA(n_components=2).fit_transform(X)
    # Convert to an array so that the comparison below is element-wise even
    # when a plain list is passed (list == scalar is a single bool).
    labels = np.asarray(labels)
    # np.unique returns the labels sorted, which keeps the colour assigned to
    # each cluster stable between calls.
    unique_labels = np.unique(labels)
    colors = plt.cm.viridis(np.linspace(0, 1, len(unique_labels)))
    fig, ax = plt.subplots()
    for color, label in zip(colors, unique_labels):
        idx = labels == label
        ax.scatter(X_pca[idx, 0], X_pca[idx, 1], color=color, label=f'Cluster {label}', s=50)
    ax.set_title('Cluster Scatter Plot')
    ax.legend()
    return fig
def plot_residuals(y_pred, Y_test):
    """
    Residual plot for a regression model.

    :param y_pred: Predicted values (x axis).
    :param Y_test: True values; the residuals are Y_test - y_pred.
    :return: The matplotlib figure containing the plot.
    """
    errors = Y_test - y_pred
    fig, ax = plt.subplots()
    # NOTE(review): residplot appears to fit y on x itself and plot that
    # fit's residuals, yet it is given precomputed residuals - confirm this
    # is intended.
    sns.residplot(
        x=y_pred,
        y=errors,
        lowess=True,
        ax=ax,
        scatter_kws={'alpha': 0.7},
        line_kws={'color': 'purple', 'lw': 2},
    )
    ax.set(xlabel='Predicted Values', ylabel='Residuals', title='Residual Plot')
    return fig
def plot_predictions_vs_actual(y_pred, Y_test):
    """
    Scatter plot comparing predicted with actual values of a regression model.

    A dashed diagonal from the smallest to the largest actual value marks
    where perfect predictions would lie.

    :param y_pred: Predicted values (y axis).
    :param Y_test: Actual values (x axis); must provide min() and max().
    :return: The matplotlib figure containing the plot.
    """
    fig, ax = plt.subplots()
    ax.scatter(Y_test, y_pred, c='#10a37f', marker='x')
    lowest, highest = Y_test.min(), Y_test.max()
    ax.plot([lowest, highest], [lowest, highest], 'k--', lw=2)
    ax.set(xlabel='Actual', ylabel='Predicted', title='Actual vs. Predicted')
    ax.set_facecolor('white')
    ax.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray')
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    return fig
def plot_qq_plot(y_pred, Y_test):
    """
    Quantile-quantile plot of the residuals of a regression model.

    The residuals Y_test - y_pred are compared with a normal distribution via
    scipy.stats.probplot, and the fitted line returned by it is drawn too.

    :param y_pred: Predicted values.
    :param Y_test: Actual values.
    :return: The matplotlib figure containing the plot.
    """
    errors = Y_test - y_pred
    fig, ax = plt.subplots()
    quantiles, fit = stats.probplot(errors, dist="norm", plot=None)
    theoretical, ordered = quantiles
    slope, intercept, _ = fit
    ax.plot(theoretical, slope * theoretical + intercept, 'grey', lw=2)
    ax.scatter(theoretical, ordered, alpha=0.8, edgecolors='#e8b517', c='yellow', label='Data Points')
    ax.set_title('Quantile-Quantile Plot')
    ax.set_facecolor('white')
    ax.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray')
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Ordered Values')
    return fig
# Advanced Visualization | |
def word_cloud_plot(text):
    """
    Generates a word cloud from the given text.

    The text is split into runs of word characters, the occurrences of each
    token are counted, and the counts are rendered as a word cloud.

    :param text: The input text from which to generate the word cloud.
    :return: A matplotlib figure object containing the word cloud if successful, -1 otherwise.
    """
    try:
        # Raw string: '\w' in a normal literal is an invalid escape sequence.
        words = regexp_tokenize(text, pattern=r'\w+')
        text_dist = nltk.FreqDist(words)
        wordcloud = WordCloud(width=1200, height=600, background_color='white').generate_from_frequencies(text_dist)
        fig, ax = plt.subplots(figsize=(10, 7.5))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        return fig
    except Exception:
        # Best effort: any failure is reported as -1. "except Exception"
        # lets KeyboardInterrupt and SystemExit propagate.
        return -1
def world_map(df, country_column, key_attribute):
    """
    Creates a choropleth world map visualization based on the specified DataFrame.

    Countries are located through the DataFrame's "iso_alpha" column and
    coloured by the key attribute; every column except the country column is
    added to the hover data.

    :param df: DataFrame containing the data to be visualized; it must contain an "iso_alpha" column.
    :param country_column: Name of the column in df that contains country names (used as hover title).
    :param key_attribute: Name of the column in df that contains the data to visualize on the map.
    :return: A Plotly figure object representing the choropleth map if successful, -1 otherwise.
    """
    try:
        hover_data_columns = [col for col in df.columns if col != country_column]
        return px.choropleth(
            df,
            locations="iso_alpha",
            color=key_attribute,
            hover_name=country_column,
            hover_data=hover_data_columns,
            color_continuous_scale=px.colors.sequential.Cividis,
            projection="equirectangular",
        )
    except Exception:
        # Best effort: any failure is reported as -1. "except Exception"
        # lets KeyboardInterrupt and SystemExit propagate.
        return -1
def scatter_3d(df, x, y, z):
    """
    Generates a 3D scatter plot from the given DataFrame.

    The three named columns give the position of the points, and the z
    column is also used for colour coding.

    :param df: DataFrame containing the data to be visualized.
    :param x: Name of the column in df to use for the x-axis values.
    :param y: Name of the column in df to use for the y-axis values.
    :param z: Name of the column in df to use for the z-axis values and color coding.
    :return: A Plotly figure object containing the 3D scatter plot if successful, -1 otherwise.
    """
    try:
        return px.scatter_3d(
            df,
            x=x,
            y=y,
            z=z,
            color=z,
            color_continuous_scale=px.colors.sequential.Viridis,
        )
    except Exception:
        # Best effort: any failure is reported as -1. "except Exception"
        # lets KeyboardInterrupt and SystemExit propagate.
        return -1