Spaces:
Sleeping
Sleeping
import pandas as pd | |
from collections import Counter | |
import re | |
import plotly.express as px | |
import streamlit as st | |
# Sample articles used to demonstrate the app (replace with your actual data).
# Each column is a list of equal length; row i pairs headline i with body i.
data = {
    'headline': [
        "Europe is investing in renewable energy.",
        "China's economic growth is surpassing expectations.",
        "U.S. and Russia are in talks about trade.",
        "Arab countries are leading in oil production.",
        "Asia's tech industry is booming.",
    ],
    'body': [
        "The renewable energy sector in Europe has seen increased investments.",
        "China continues to grow economically, beating many forecasts.",
        "The United States and Russia have started trade negotiations amid tensions.",
        "Oil production in Arab countries is at a new high.",
        "Asia is becoming a leader in the global tech industry.",
    ],
}

# One row per article, with 'headline' and 'body' text columns.
df = pd.DataFrame(data)
# Words ignored when counting frequencies (customize this, or swap in a
# library list such as NLTK's).
stop_words = {'is', 'in', 'are', 'and', 'the', 'of', 'a', 'to'}

# Matches every character that is neither a word character nor whitespace.
# Compiled once so repeated tokenize() calls reuse the same pattern object.
_PUNCTUATION = re.compile(r'[^\w\s]')


def tokenize(text):
    """Return the lowercase, punctuation-free, non-stopword tokens of *text*.

    Punctuation is deleted rather than replaced by a space, so "China's"
    becomes "chinas" and "U.S." becomes "us".

    Args:
        text: The string to tokenize. Non-string values (for example the
            None/NaN that pandas uses for missing cells) are treated as
            containing no words.

    Returns:
        A list of tokens in their original order, with entries of
        ``stop_words`` removed.
    """
    # Missing DataFrame cells arrive as None or float NaN; calling .lower()
    # on them would raise AttributeError.
    if not isinstance(text, str):
        return []

    cleaned = _PUNCTUATION.sub('', text.lower())
    return [word for word in cleaned.split() if word not in stop_words]
# Gather the tokens of every headline first, then every body. The order is
# deliberate: Counter.most_common() lists equal counts in first-seen order,
# so changing the traversal order would change how ties are ranked.
all_words = [
    word
    for column in ('headline', 'body')
    for text in df[column]
    for word in tokenize(text)
]

# Frequency of each token across both columns.
word_counts = Counter(all_words)

# The (word, count) pairs for the 10 most frequent tokens; fewer if fewer
# distinct tokens exist.
top_10_words = word_counts.most_common(10)

# Tabular form of the top words, ready for plotting.
top_10_df = pd.DataFrame(top_10_words, columns=['word', 'frequency'])
# Page title shown at the top of the Streamlit app.
st.title('Interactive Word Frequency Analysis')

# Bar chart with one bar per word in top_10_df; `text` prints each word's
# frequency on its bar and `labels` supplies the axis titles.
fig = px.bar(
    top_10_df,
    x='word',
    y='frequency',
    title='Top 10 Most Common Words (Excluding Stopwords)',
    labels={'word': 'Word', 'frequency': 'Frequency'},
    text='frequency',
)

# Render the Plotly figure in the Streamlit page.
st.plotly_chart(fig)
# Word picker standing in for a "click" on the bar chart. st.selectbox renders
# in the main page area and returns None when it has no options to offer.
clicked_word = st.selectbox("Select a word to filter headlines:", options=top_10_df['word'])

if clicked_word is None:
    # No words were counted, so there is nothing to filter on; keep
    # filtered_df defined as an empty frame with the same columns.
    filtered_df = df.iloc[0:0]
    st.write("No words available to filter headlines.")
else:
    # Compare against tokenized headlines rather than raw substrings. The
    # selectable words are tokens with punctuation already stripped, so a
    # substring search would miss "chinas" in "China's ..." and would wrongly
    # match "us" inside "industry".
    is_match = df['headline'].apply(lambda headline: clicked_word in tokenize(headline))
    # astype(bool) keeps the mask a boolean indexer even when df has no rows.
    filtered_df = df[is_match.astype(bool)]

    # Show the matching headlines, or a notice when none match.
    if not filtered_df.empty:
        st.subheader(f"Headlines containing the word '{clicked_word}'")
        st.table(filtered_df[['headline']])
    else:
        st.write(f"No headlines found containing the word '{clicked_word}'")