Spaces:

ryanrahmadifa
/

poc

Sleeping

poc / test_app.py

ryanrahmadifa

Added files

79e1719 6 months ago

2.78 kB

	import pandas as pd
	from collections import Counter
	import re
	import plotly.express as px
	import streamlit as st

	# Demo corpus: each entry pairs a headline with the body text of the same
	# article. Replace these records with real data when it is available.
	_SAMPLE_ARTICLES = [
	    (
	        "Europe is investing in renewable energy.",
	        "The renewable energy sector in Europe has seen increased investments.",
	    ),
	    (
	        "China's economic growth is surpassing expectations.",
	        "China continues to grow economically, beating many forecasts.",
	    ),
	    (
	        "U.S. and Russia are in talks about trade.",
	        "The United States and Russia have started trade negotiations amid tensions.",
	    ),
	    (
	        "Arab countries are leading in oil production.",
	        "Oil production in Arab countries is at a new high.",
	    ),
	    (
	        "Asia's tech industry is booming.",
	        "Asia is becoming a leader in the global tech industry.",
	    ),
	]

	# Column-oriented view of the records above, in the same row order.
	data = {
	    'headline': [headline for headline, _ in _SAMPLE_ARTICLES],
	    'body': [body for _, body in _SAMPLE_ARTICLES],
	}
	df = pd.DataFrame(data)

	# Words dropped during tokenization; extend this set as needed or swap in
	# a list from a library such as NLTK.
	stop_words = {'is', 'in', 'are', 'and', 'the', 'of', 'a', 'to'}

	def tokenize(text, stopwords=None):
	    """Lowercase *text*, strip punctuation, and return its non-stopword tokens.

	    Args:
	        text: The string to tokenize. Anything that is not a ``str``
	            (for example a NaN/None cell from a DataFrame) yields ``[]``.
	        stopwords: Optional collection of lowercase words to drop. When
	            omitted, the module-level ``stop_words`` set is used.

	    Returns:
	        A list of lowercase tokens in their original order, duplicates kept.
	    """
	    if not isinstance(text, str):
	        return []
	    if stopwords is None:
	        stopwords = stop_words

	    lowered = text.lower()
	    # Drop possessive endings first so "China's" counts as "china" instead
	    # of collapsing into the non-word "chinas" once apostrophes are removed.
	    lowered = re.sub(r"['\u2019]s\b", '', lowered)
	    # Remove remaining punctuation ("U.S." becomes "us").
	    cleaned = re.sub(r'[^\w\s]', '', lowered)
	    return [word for word in cleaned.split() if word not in stopwords]

	# Gather every token from the headline column first, then the body column.
	# An explicit loop is used because the goal is the side effect of filling
	# all_words; Series.apply would build and discard a Series of None values.
	all_words = []
	for column in ('headline', 'body'):
	    # Skip missing cells so a blank headline/body does not break tokenizing.
	    for text in df[column].dropna():
	        all_words.extend(tokenize(text))

	# Count word frequencies
	word_counts = Counter(all_words)

	# The ten most frequent words as (word, count) pairs, most common first.
	top_10_words = word_counts.most_common(10)

	# Tabular form of the pairs above, ready for plotting.
	top_10_df = pd.DataFrame(top_10_words, columns=['word', 'frequency'])

	# Streamlit setup
	st.title('Interactive Word Frequency Analysis')

	# Bar chart of the most frequent words, with each count printed on its bar.
	fig = px.bar(
	    top_10_df,
	    x='word',
	    y='frequency',
	    title='Top 10 Most Common Words (Excluding Stopwords)',
	    labels={'word': 'Word', 'frequency': 'Frequency'},
	    text='frequency',
	)

	# Display the Plotly chart in Streamlit
	st.plotly_chart(fig)

	# Dropdown (rendered in the main page area) that stands in for a "click"
	# on one of the bars.
	clicked_word = st.selectbox("Select a word to filter headlines:", options=top_10_df['word'])

	# Keep headlines that contain the selected word as a whole token. Matching
	# on tokens rather than with str.contains avoids substring hits (e.g. "us"
	# inside "Russia") and still finds words whose punctuation was stripped
	# during tokenization. Non-string cells and an empty selection match nothing.
	headline_matches = df['headline'].map(
	    lambda headline: isinstance(headline, str) and clicked_word in tokenize(headline)
	).astype(bool)
	filtered_df = df[headline_matches]

	# Display filtered headlines
	if not filtered_df.empty:
	    st.subheader(f"Headlines containing the word '{clicked_word}'")
	    st.table(filtered_df[['headline']])
	else:
	    st.write(f"No headlines found containing the word '{clicked_word}'")