import streamlit as st
import pandas as pd
import joblib
from preprocessText import preprocess
from apiSearch import get_metadata,get_trending_videos
import base64
import requests
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Load the model
# Serialized model file used for each supported region.
_MODEL_FILES = {
    "United States": '85pct.pkl',
}


def read_model(region):
    """Load the trained trend-prediction model for *region*.

    Args:
        region: Display name of the region, e.g. ``"United States"``.

    Returns:
        The object deserialized by ``joblib.load`` from the region's model file.

    Raises:
        ValueError: If no model file is registered for *region*. (Previously
            this surfaced as an ``UnboundLocalError`` on the ``return``.)
    """
    try:
        model_file = _MODEL_FILES[region]
    except KeyError:
        raise ValueError(f"No model available for region: {region!r}") from None
    # NOTE: joblib.load unpickles the file; only ship model files you trust.
    return joblib.load(model_file)
# Define the categories
# Category display name -> numeric category id.
# Insertion order matters: it is the order shown in the category select box.
categories = {
    "Film & Animation": 1,
    "Autos & Vehicles": 2,
    "Music": 10,
    "Pets & Animals": 15,
    "Sports": 17,
    "Travel & Events": 19,
    "Gaming": 20,
    "People & Blogs": 22,
    "Comedy": 23,
    "Entertainment": 24,
    "News & Politics": 25,
    "Howto & Style": 26,
    "Education": 27,
    "Science & Technology": 28,
    "Nonprofits & Activism": 29,
}
# Create the Streamlit web application
def main():
    """Render the Streamlit app and dispatch to the page chosen in the sidebar.

    The sidebar menu offers three pages: "Predict" (classify a single video),
    "Trending" (browse the current trending list) and "Visualize" (charts
    built from the pre-computed CSV summaries).
    """
    st.set_page_config(layout="wide")
    # NOTE(review): these three markdown calls pass unsafe_allow_html=True but
    # carry no markup; the original style/heading HTML appears to have been
    # lost. The remaining text is kept verbatim -- restore the markup if known.
    st.markdown("\n", unsafe_allow_html=True)
    st.markdown("\n", unsafe_allow_html=True)
    st.markdown("YouTube Trend Prediction\n", unsafe_allow_html=True)

    @st.cache_data
    def convert_df(df):
        # Cached so the CSV bytes are not rebuilt on every rerun.
        return df.to_csv(index=False).encode('utf-8')

    # Sidebar menu options
    menu_options = ["Predict", "Trending", "Visualize"]
    selected_option = st.sidebar.selectbox("Menu", menu_options)

    if selected_option == "Predict":
        _render_predict_page()
    elif selected_option == "Trending":
        _render_trending_page(convert_df)
    elif selected_option == "Visualize":
        _render_visualize_page()


def _render_predict_page():
    """Draw the "Predict" page: collect video details and show the verdict."""
    region = st.sidebar.selectbox("Select Region", ['United States'])
    model = read_model(region)
    with st.container():
        col1, col2, col3 = st.columns(3)
        # Form defaults, overwritten when a URL yields metadata.
        default_title, default_duration, default_category = "", 0.00, 1

        with col1:
            url = st.text_input("URL", placeholder="Enter a video URL")
            if url:
                metadata = get_metadata(url)
                if not metadata.empty:
                    default_title = metadata['title'].iloc[0]
                    # number_input needs value and min_value (0.0) to share a
                    # numeric type, so coerce the fetched duration to float.
                    default_duration = float(metadata['duration'].iloc[0])
                    default_category = int(metadata['category_id'].iloc[0])
                    thumbnail_url = metadata['thumbnail_link'].iloc[0]
                    if thumbnail_url is not None:
                        thumbnail = get_picture_from_url(thumbnail_url)
                        if thumbnail:
                            st.image(thumbnail, caption='Thumbnail captured', width=320, channels="BGR")

        with col2:
            title = st.text_input("Title", placeholder="Enter a video title", value=default_title)
            duration = st.number_input("Duration (in seconds)", min_value=0.0, value=default_duration)
            category_ids = list(categories.values())
            # A fetched id that is not in `categories` falls back to the first
            # entry instead of raising ValueError from list.index().
            if default_category in category_ids:
                default_index = category_ids.index(default_category)
            else:
                default_index = 0
            category = st.selectbox("Category", list(categories.keys()), index=default_index)

        with col3:
            uploaded_picture = st.file_uploader("Upload Picture", type=["jpg", "jpeg", "png"])
            if uploaded_picture is not None:
                # Was st.picture(...), which is not a Streamlit API.
                st.image(uploaded_picture, caption='Thumbnail Uploaded', width=400, channels="BGR")

        # Convert category to category ID
        categoryId = categories[category]

        if st.button("Predict"):
            missing_title = title is None or title.strip() == ""
            missing_duration = duration == 0
            # Predict only when both fields are present; previously an empty
            # title with a non-zero duration still reached the model.
            if missing_title and missing_duration:
                st.warning("Please enter a title and duration.")
            elif missing_title:
                st.warning("Please enter a title")
            elif missing_duration:
                st.warning("Please enter a duration.")
            else:
                prediction = predict_trend(model, title, duration, categoryId)
                if prediction[0] == 1:
                    st.success("This video is predicted to be a trend!")
                    st.markdown("![Alt Text](https://media.tenor.com/Cyi2zT7wcmcAAAAj/pentol-gif-eak.gif)")
                else:
                    st.info("This video is predicted not to be a trend.")
                    st.markdown("![Alt Text](https://media.tenor.com/VYKtkKnHaUcAAAAj/quby-cute.gif)")


def _render_trending_page(convert_df):
    """Draw the "Trending" page: the trending table plus a per-video detail tab.

    Args:
        convert_df: Callable turning a DataFrame into CSV bytes for download.
    """
    tab1, tab2 = st.tabs(["Trending Board", "Video Info"])
    country_code = st.sidebar.selectbox(
        "Select Country Code",
        ['US', 'CA', 'GB', 'DE', 'FR', 'RU', 'BR', 'IN', 'MY', 'SG', 'JP', 'KR'],
    )
    with st.container():
        with tab1:
            st.write("Top 10 Trending Videos")
            df = get_trending_videos(country_code)
            if df is None:
                st.error('Failed to retrieve trending videos.')
            else:
                st.dataframe(df)
                st.download_button(
                    "Download",
                    convert_df(df),
                    "top10Trending.csv",
                    "text/csv",
                    key='download-csv'
                )

        with tab2:
            # Without rows there is nothing to select; stop before indexing.
            if df is None or df.empty:
                st.error('Failed to retrieve trending videos.')
                return
            selected_video_title = st.selectbox("Select a Video", df['title'])
            selected_video = df[df['title'] == selected_video_title].iloc[0]

            col4, col5 = st.columns(2)
            with col4:
                image = get_picture_from_url(selected_video['thumbnail_link'])
                if image:
                    st.image(image, caption='Thumbnail captured', width=400, channels="BGR")
            with col5:
                st.write("Title:", selected_video['title'])
                # Coerce to int as the Predict page does, so an id delivered
                # as text still matches the integer ids in `categories`.
                try:
                    video_category_id = int(selected_video['category_id'])
                except (TypeError, ValueError):
                    video_category_id = selected_video['category_id']
                category_name = next(
                    (key for key, value in categories.items() if value == video_category_id),
                    'Unknown Category'
                )
                st.write("Category:", category_name)
                st.write("Duration:", selected_video['duration'])


def _render_visualize_page():
    """Draw the "Visualize" page: one tab per summary, chart left, top 3 right."""
    with st.container():
        tab3, tab4, tab5, tab6 = st.tabs(["Best Category", "Best Duration","Best Title","Best Title Length"])
        pages = (
            (tab3, show_top_category, show_best_category),
            (tab4, show_top_duration, show_best_duration),
            (tab5, show_top_title, show_best_title),
            (tab6, show_top_titleLength, show_best_titleLength),
        )
        for tab, draw_chart, draw_top_three in pages:
            with tab:
                chart_col, summary_col = st.columns(2)
                with chart_col:
                    draw_chart()
                with summary_col:
                    draw_top_three()
def get_picture_from_url(url):
    """Download the image at *url* and return its raw bytes.

    Args:
        url: Address of the image. Empty or ``None`` yields ``None`` without
            any network access.

    Returns:
        The response body as ``bytes``, or ``None`` when the URL is missing,
        the request fails, or the server answers with an HTTP error status.
    """
    if not url:
        return None
    try:
        # A timeout keeps a stalled server from hanging the page forever.
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx as failure so an error page is never shown as an image.
        response.raise_for_status()
    except requests.RequestException:
        return None
    return response.content
# Default location of the per-category summary. This is a machine-specific
# absolute path; pass `csv_path` to read the file from anywhere else.
_TOP_CATEGORY_CSV = r'C:\Users\LEGION\Desktop\MMU\Data Science Fundamental\Project\Prediction of Video\topCategory.csv'


def get_top_category(csv_path=_TOP_CATEGORY_CSV):
    """Load the per-category summary and add ``rank`` and ``category_name``.

    Args:
        csv_path: CSV file with ``category_id`` and ``predicted_prob`` columns.
            Defaults to the original hard-coded location.

    Returns:
        A DataFrame sorted by ``predicted_prob`` with two extra columns:
        ``rank`` (1..n in sorted order) and ``category_name`` (looked up in
        ``categories``, ``'Unknown Category'`` when the id is not listed).
    """
    topCategory = pd.read_csv(csv_path)
    # NOTE(review): the sort is ascending, so rank 1 is the LOWEST probability,
    # unlike the other get_top_* loaders which sort descending. Kept as-is;
    # confirm whether that is intended.
    ranked = topCategory.sort_values('predicted_prob')
    ranked['rank'] = range(1, len(ranked) + 1)

    # Build the id -> name table once instead of scanning `categories` per row.
    # setdefault keeps the first name if two names ever shared an id.
    id_to_name = {}
    for name, category_id in categories.items():
        id_to_name.setdefault(category_id, name)
    ranked['category_name'] = ranked['category_id'].map(
        lambda category_id: id_to_name.get(category_id, 'Unknown Category')
    )
    return ranked
def show_top_category():
    """Render a bar chart of predicted probability per ranked category."""
    topCategory_sorted = get_top_category()
    color_palette = sns.color_palette('Set2', len(topCategory_sorted['category_id'].unique()))
    fig, ax = plt.subplots(figsize=(8, 5))
    # Draw on this figure's own axes rather than pyplot's implicit current axes.
    sns.barplot(
        data=topCategory_sorted,
        x='rank',
        y='predicted_prob',
        hue='category_name',
        palette=color_palette,
        ax=ax,
    )
    ax.set_xlabel('Rank')
    ax.set_ylabel('Predicted Probability')
    ax.set_title('Top Categories')
    st.pyplot(fig)
    # pyplot keeps every figure alive until closed; without this each rerun
    # of the script would leak one more figure.
    plt.close(fig)
def show_best_category():
    """List the three categories with the highest predicted probability.

    The entries are coloured by position: green, yellow, orange.
    """
    topCategory_sorted = get_top_category()
    best_names = (
        topCategory_sorted
        .sort_values('predicted_prob', ascending=False)['category_name']
        .head(3)
    )
    st.header("Top 3 Categories")
    # Colour by position. The old code computed a colour but never used it,
    # and picked it by comparing values rather than by position.
    # NOTE(review): the original markup was lost; a coloured <span> is assumed.
    position_colors = ('#339933', '#ffcc33', '#ff9900')
    for name, color in zip(best_names, position_colors):
        st.write(f"<span style='color: {color};'>{name}</span>", unsafe_allow_html=True)
# Default location of the per-duration summary. This is a machine-specific
# absolute path; pass `csv_path` to read the file from anywhere else.
_TOP_DURATION_CSV = r'C:\Users\LEGION\Desktop\MMU\Data Science Fundamental\Project\Prediction of Video\topDuration.csv'


def get_top_duration(csv_path=_TOP_DURATION_CSV):
    """Load the per-duration summary, highest predicted probability first.

    Args:
        csv_path: CSV file containing a ``predicted_prob`` column. Defaults to
            the original hard-coded location.

    Returns:
        The file's rows as a DataFrame sorted by ``predicted_prob`` descending.
    """
    return pd.read_csv(csv_path).sort_values('predicted_prob', ascending=False)
def show_top_duration():
    """Render a bar chart of predicted probability per duration range."""
    topDuration_sorted = get_top_duration()
    sns.set(style='whitegrid')
    # Use an explicit figure: the old code passed the pyplot module itself to
    # st.pyplot and called plt.show(), which has no place in a Streamlit script.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='duration_range', y='predicted_prob', data=topDuration_sorted, ax=ax)
    ax.set_xlabel('Duration')
    ax.set_ylabel('Predicted Probability')
    ax.set_title('Top Durations')
    ax.tick_params(axis='x', labelrotation=45)
    st.pyplot(fig)
    # Close the figure so reruns do not accumulate open figures.
    plt.close(fig)
def show_best_duration():
    """List the three duration ranges with the highest predicted probability.

    The entries are coloured by position: green, yellow, orange.
    """
    topDurationRange = get_top_duration()
    best_ranges = (
        topDurationRange
        .sort_values('predicted_prob', ascending=False)['duration_range']
        .head(3)
    )
    st.header("Top 3 Duration Range")
    # Colour by position. The old loop computed a colour it never used and
    # named its variable `range`, shadowing the builtin.
    # NOTE(review): the original markup was lost; a coloured <span> is assumed.
    position_colors = ('#339933', '#ffcc33', '#ff9900')
    for duration_range, color in zip(best_ranges, position_colors):
        st.write(f"<span style='color: {color};'>{duration_range}</span>", unsafe_allow_html=True)
# Default location of the title-keyword summary. This is a machine-specific
# absolute path; pass `csv_path` to read the file from anywhere else.
_TOP_TITLE_CSV = r'C:\Users\LEGION\Desktop\MMU\Data Science Fundamental\Project\Prediction of Video\topTitle.csv'


def get_top_title(csv_path=_TOP_TITLE_CSV):
    """Load the title-keyword summary, highest importance score first.

    Args:
        csv_path: CSV file containing an ``Importance Score`` column. Defaults
            to the original hard-coded location.

    Returns:
        The file's rows as a DataFrame sorted by ``Importance Score``
        descending.
    """
    return pd.read_csv(csv_path).sort_values('Importance Score', ascending=False)
def show_top_title():
    """Render a horizontal bar chart of importance score per title keyword."""
    topTitle_sorted = get_top_title()
    sns.set(style="whitegrid")
    # Explicit figure and axes instead of pyplot's global state, so the figure
    # can be handed to Streamlit and then closed.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(x='Importance Score', y='Feature', data=topTitle_sorted, palette="rocket", ax=ax)
    ax.set_xlabel('Importance Score', fontsize=12)
    ax.set_ylabel('Feature', fontsize=12)
    ax.set_title('Top Title Features', fontsize=14)
    fig.tight_layout()
    st.pyplot(fig)
    # Close the figure so reruns do not accumulate open figures.
    plt.close(fig)
def show_best_title():
    """List the three title keywords with the highest importance score.

    The entries are coloured by position: green, yellow, orange.
    """
    topTitle_sorted = get_top_title()
    # Take the keywords from the re-sorted frame; the old code sorted and then
    # discarded that result by reading from `topTitle_sorted` again.
    best_keywords = (
        topTitle_sorted
        .sort_values('Importance Score', ascending=False)['Feature']
        .head(3)
    )
    st.header("Top 3 Keyword")
    # Colour by position. The old loop computed a colour it never used.
    # NOTE(review): the original markup was lost; a coloured <span> is assumed.
    position_colors = ('#339933', '#ffcc33', '#ff9900')
    for feature, color in zip(best_keywords, position_colors):
        st.write(f"<span style='color: {color};'>{feature}</span>", unsafe_allow_html=True)
def round_interval(interval_str):
    """Rewrite an interval label such as ``"(4.9, 23.8]"`` with integer bounds.

    Surrounding brackets of either kind are dropped and the result always uses
    parentheses. Despite the name, each bound is truncated toward zero by
    ``int()``, not rounded to nearest.
    """
    bounds = interval_str.strip('()[]').split(',')
    low, high = (float(bound) for bound in bounds)
    return "({}, {})".format(int(low), int(high))
# Default location of the title-length summary. This is a machine-specific
# absolute path; pass `csv_path` to read the file from anywhere else.
_TOP_TITLE_LENGTH_CSV = r'C:\Users\LEGION\Desktop\MMU\Data Science Fundamental\Project\Prediction of Video\topTitleLength.csv'


def get_top_titleLength(csv_path=_TOP_TITLE_LENGTH_CSV):
    """Load the title-length summary with integer-bounded range labels.

    Args:
        csv_path: CSV file with ``titleLength`` (interval labels) and
            ``predicted_prob`` columns. Defaults to the original hard-coded
            location.

    Returns:
        A DataFrame with ``rounded_ranges`` (each label passed through
        ``round_interval``) and ``predicted_probs``, sorted by
        ``predicted_probs`` descending.
    """
    raw = pd.read_csv(csv_path)
    summary = pd.DataFrame({
        'rounded_ranges': [round_interval(label) for label in raw['titleLength']],
        'predicted_probs': raw['predicted_prob'],
    })
    return summary.sort_values(by='predicted_probs', ascending=False)
def show_top_titleLength():
    """Render a bar chart of predicted probability per title-length range."""
    topTitleLength = get_top_titleLength()
    sns.set(style='whitegrid')
    # Use an explicit figure: the old code passed the pyplot module itself to
    # st.pyplot and called plt.show(), which has no place in a Streamlit script.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='rounded_ranges', y='predicted_probs', data=topTitleLength, ax=ax)
    ax.set_xlabel('Title Length Range')
    ax.set_ylabel('Predicted Probability')
    ax.set_title('Top 5 Ranges for Title Length vs. Predicted Probability')
    ax.tick_params(axis='x', labelrotation=45)
    st.pyplot(fig)
    # Close the figure so reruns do not accumulate open figures.
    plt.close(fig)
def show_best_titleLength():
    """List the three title-length ranges with the highest predicted probability.

    The entries are coloured by position: green, yellow, orange.
    """
    topTitleLength = get_top_titleLength()
    best_ranges = (
        topTitleLength
        .sort_values('predicted_probs', ascending=False)['rounded_ranges']
        .head(3)
    )
    st.header("Top 3 Title Length Range")
    # Colour by position. The old loop computed a colour it never used and
    # named its variable `range`, shadowing the builtin.
    # NOTE(review): the original markup was lost; a coloured <span> is assumed.
    position_colors = ('#339933', '#ffcc33', '#ff9900')
    for length_range, color in zip(best_ranges, position_colors):
        st.write(f"<span style='color: {color};'>{length_range}</span>", unsafe_allow_html=True)
# Function to make predictions
def predict_trend(model,title, duration, category_id):
    """Classify one video with *model* and return the result of ``model.predict``.

    Builds a one-row DataFrame with the columns ``cleanTitle`` (the output of
    ``preprocess(title)`` joined with spaces), ``titleLength`` (``len(title)``),
    ``categoryId`` (category dtype) and ``duration`` (float64). The model's
    ``predict_proba`` output for that row is printed before predicting.
    """
    duration_text = str(duration)
    category_value = int(category_id)
    title_tokens = preprocess(title)

    features = pd.DataFrame({
        'cleanTitle': [' '.join(title_tokens)],
        'titleLength' : [len(title)],
        'categoryId': [category_value],
        'duration': [duration_text],
    })
    features['categoryId'] = features['categoryId'].astype('category')
    # The duration goes in as text and is cast back to a float column here.
    features['duration'] = features['duration'].astype('float64')

    # Print the class probabilities, then return the predicted label(s).
    print(model.predict_proba(features))
    return model.predict(features)
# Launch the app only when this module is executed directly, not when imported.
if __name__ == "__main__":
    main()