import pandas as pd
from collections import Counter
import re
import plotly.express as px
import streamlit as st

# Placeholder articles as (headline, body) pairs; swap in your real data here.
_SAMPLE_ARTICLES = [
    (
        "Europe is investing in renewable energy.",
        "The renewable energy sector in Europe has seen increased investments.",
    ),
    (
        "China's economic growth is surpassing expectations.",
        "China continues to grow economically, beating many forecasts.",
    ),
    (
        "U.S. and Russia are in talks about trade.",
        "The United States and Russia have started trade negotiations amid tensions.",
    ),
    (
        "Arab countries are leading in oil production.",
        "Oil production in Arab countries is at a new high.",
    ),
    (
        "Asia's tech industry is booming.",
        "Asia is becoming a leader in the global tech industry.",
    ),
]

# Column-oriented view of the sample articles: one list per DataFrame column.
data = {
    'headline': [headline for headline, _ in _SAMPLE_ARTICLES],
    'body': [body for _, body in _SAMPLE_ARTICLES],
}
df = pd.DataFrame(data)

# Words too common to be informative; customize this set or swap in a
# library-provided list such as NLTK's.
stop_words = {'is', 'in', 'are', 'and', 'the', 'of', 'a', 'to'}

# Matches every character that is neither a word character nor whitespace.
# Compiled once so the pattern is not looked up again on each tokenize() call.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def tokenize(text):
    """Clean *text*, split it into words, and drop stopwords.

    The text is lowercased, punctuation is deleted (so "China's" becomes
    "chinas" and "U.S." becomes "us"), and the result is split on whitespace.

    Args:
        text: The string to tokenize. Non-string values, such as ``None`` or
            the NaN that pandas uses for a missing cell, yield an empty list
            instead of raising.

    Returns:
        The remaining lowercase words in their original order, with every
        entry of ``stop_words`` removed.
    """
    # Missing cells arrive as None/NaN, which have no .lower(); treat them as
    # contributing no words rather than crashing the whole analysis.
    if not isinstance(text, str):
        return []
    cleaned = _PUNCTUATION_RE.sub('', text.lower())
    return [word for word in cleaned.split() if word not in stop_words]

# Tokenize and count word frequencies across the entire DataFrame.
# A plain loop is used rather than Series.apply because the goal is the side
# effect of growing all_words, not a transformed Series. Missing cells are
# skipped because tokenize() calls string methods on its argument.
all_words = []
for column in ('headline', 'body'):
    for text in df[column].dropna():
        all_words.extend(tokenize(text))

# Count word frequencies
word_counts = Counter(all_words)

# Get the top 10 most common words as (word, count) pairs; fewer are
# returned when there are fewer than 10 distinct words.
top_10_words = word_counts.most_common(10)

# Convert the top 10 words to a DataFrame for plotting
top_10_df = pd.DataFrame(top_10_words, columns=['word', 'frequency'])

# Streamlit setup
st.title('Interactive Word Frequency Analysis')

# Plot the bar chart using Plotly
fig = px.bar(
    top_10_df,
    x='word',
    y='frequency',
    title='Top 10 Most Common Words (Excluding Stopwords)',
    labels={'word': 'Word', 'frequency': 'Frequency'},
    text='frequency',
)

# Display the Plotly chart in Streamlit
st.plotly_chart(fig)

# Dropdown for word selection, simulating a "click" on the bar chart.
# NOTE: this widget is rendered in the main page area, not the sidebar.
clicked_word = st.selectbox("Select a word to filter headlines:", options=top_10_df['word'])

if clicked_word is None:
    # selectbox yields None when there is nothing to select (no words counted).
    st.write("No words available to filter headlines.")
else:
    # Match against the tokenized headline rather than doing a raw substring
    # search. The selectable words are tokens produced by tokenize(), so a
    # substring search would both over-match ("us" inside "Russia") and
    # under-match ("chinas" never appears literally in "China's ...").
    # Non-string (missing) headlines never match.
    headline_matches = df['headline'].apply(
        lambda headline: isinstance(headline, str) and clicked_word in tokenize(headline)
    ).astype(bool)
    filtered_df = df[headline_matches]

    # Display filtered headlines
    if not filtered_df.empty:
        st.subheader(f"Headlines containing the word '{clicked_word}'")
        st.table(filtered_df[['headline']])
    else:
        st.write(f"No headlines found containing the word '{clicked_word}'")