"""Streamlit app: show the most frequent words in headlines/bodies and filter headlines by word."""

import re
from collections import Counter

import pandas as pd
import plotly.express as px
import streamlit as st

# Sample DataFrame (replace with your actual data)
data = {
    'headline': [
        "Europe is investing in renewable energy.",
        "China's economic growth is surpassing expectations.",
        "U.S. and Russia are in talks about trade.",
        "Arab countries are leading in oil production.",
        "Asia's tech industry is booming.",
    ],
    'body': [
        "The renewable energy sector in Europe has seen increased investments.",
        "China continues to grow economically, beating many forecasts.",
        "The United States and Russia have started trade negotiations amid tensions.",
        "Oil production in Arab countries is at a new high.",
        "Asia is becoming a leader in the global tech industry.",
    ],
}
df = pd.DataFrame(data)

# List of stopwords (you can customize this or use libraries like NLTK)
stop_words = {'is', 'in', 'are', 'and', 'the', 'of', 'a', 'to'}

# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION = re.compile(r'[^\w\s]')


def tokenize(text):
    """Return the lowercase, punctuation-free, non-stopword tokens of *text*.

    Punctuation is deleted (not replaced by a space), so "China's" becomes
    the single token "chinas" and "U.S." becomes "us".

    Args:
        text: The string to tokenize. Non-string values (for example the NaN
            pandas uses for missing cells) yield an empty list instead of
            raising.

    Returns:
        A list of tokens in their original order, with every entry of
        ``stop_words`` removed.
    """
    if not isinstance(text, str):
        return []
    cleaned = _PUNCTUATION.sub('', text.lower())
    return [word for word in cleaned.split() if word not in stop_words]


# Tokenize the entire DataFrame. Headlines are processed before bodies because
# Counter.most_common() orders equal counts by first occurrence, so this order
# decides which words win ties for the top 10.
all_words = []
for column in ('headline', 'body'):
    for text in df[column]:
        all_words.extend(tokenize(text))

# Count word frequencies
word_counts = Counter(all_words)

# Get the top 10 most common words
top_10_words = word_counts.most_common(10)

# Convert the top 10 words to a DataFrame for plotting
top_10_df = pd.DataFrame(top_10_words, columns=['word', 'frequency'])

# Streamlit setup
st.title('Interactive Word Frequency Analysis')

# Plot the bar chart using Plotly
fig = px.bar(
    top_10_df,
    x='word',
    y='frequency',
    title='Top 10 Most Common Words (Excluding Stopwords)',
    labels={'word': 'Word', 'frequency': 'Frequency'},
    text='frequency',
)

# Display the Plotly chart in Streamlit
st.plotly_chart(fig)

# Select box for word selection, simulating a "click" on the bar chart
clicked_word = st.selectbox(
    "Select a word to filter headlines:",
    options=top_10_df['word'],
)

# Keep the headlines whose tokens include the selected word. Matching on
# tokens (rather than a raw substring search) keeps the filter consistent with
# how the words were counted: "us" no longer matches "Russia", and "chinas"
# does match "China's".
headline_matches = df['headline'].map(
    lambda headline: clicked_word in tokenize(headline)
).astype(bool)
filtered_df = df[headline_matches]

# Display filtered headlines
if not filtered_df.empty:
    st.subheader(f"Headlines containing the word '{clicked_word}'")
    st.table(filtered_df[['headline']])
else:
    st.write(f"No headlines found containing the word '{clicked_word}'")