Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
import joblib | |
from preprocessText import preprocess | |
from apiSearch import get_metadata,get_trending_videos | |
import base64 | |
import requests | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import seaborn as sns | |
# Load the model
def read_model(region):
    """Load the trained trend-prediction model for a region.

    Args:
        region: Region name as shown in the sidebar. Only "United States"
            currently has a model file registered.

    Returns:
        The object deserialized by ``joblib.load`` from the region's file.

    Raises:
        ValueError: If no model file is registered for ``region``. The
            previous code left ``model`` unbound in that case.
    """
    model_files = {"United States": "85pct.pkl"}
    model_path = model_files.get(region)
    if model_path is None:
        raise ValueError(f"No model available for region: {region!r}")
    # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
    return joblib.load(model_path)
# Category display names mapped to their numeric category IDs.
# Insertion order matters: main() builds the "Category" select box from
# list(categories.keys()) and indexes into list(categories.values()).
categories = {
    "Film & Animation": 1,
    "Autos & Vehicles": 2,
    "Music": 10,
    "Pets & Animals": 15,
    "Sports": 17,
    "Travel & Events": 19,
    "Gaming": 20,
    "People & Blogs": 22,
    "Comedy": 23,
    "Entertainment": 24,
    "News & Politics": 25,
    "Howto & Style": 26,
    "Education": 27,
    "Science & Technology": 28,
    "Nonprofits & Activism": 29,
}
# Create the Streamlit web application
def main():
    """Render the YouTube trend-prediction Streamlit app.

    Configures the page, injects the custom CSS, draws the header and then
    renders whichever page ("Predict", "Trending" or "Visualize") is picked
    in the sidebar menu.
    """
    st.set_page_config(layout="wide")
    st.markdown(_PAGE_STYLE, unsafe_allow_html=True)
    st.markdown("<body><img style = 'max-width: 20%;max-height: 20%;text-align: center;' src=\"https://media.tenor.com/U7OFq772kIEAAAAj/sweet-dreams.gif\"></body>",unsafe_allow_html=True)
    st.markdown("<h1>YouTube Trend Prediction</h1>", unsafe_allow_html=True)

    # Sidebar menu options
    menu_options = ["Predict", "Trending", "Visualize"]
    selected_option = st.sidebar.selectbox("Menu", menu_options)
    if selected_option == "Predict":
        _render_predict_page()
    elif selected_option == "Trending":
        _render_trending_page()
    elif selected_option == "Visualize":
        _render_visualize_page()


# CSS injected once per run by main().
# NOTE(review): `text-weight` is not a CSS property (probably meant
# `font-weight`); it is left untouched so the page keeps its current look.
_PAGE_STYLE = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap');
@import url('https://fonts.googleapis.com/css2?family=YouTube+Sans&display=swap');
html, body, [class*="css"] {
font-family: 'Roboto', sans-serif;
}
[data-testid="stAppViewContainer"] > .main {
background-color : white;
}
p{
font-family: 'Roboto', sans-serif;
text-weight: bold;
font-size: 25px;
}
body{
display: flex;
justify-content: center;
align-items: center;
text-align: center;
}
h1{
text-align: center;
color: #d72324;
}
img{
max-width: 100%;
max-height: 100%;
}
.stButton > button {
background-color: #d72324;
color:white;
font-weight: bold;
width: 500px;
height: 50px;
}
.stDownloadButton > button{
background-color: #e7e7e7;
color:black;
font-weight: bold;
width: 150px;
height: 35px;
float: right;
}
.stButton > button:hover {
background-color: white;
color:#d72324;
}
.my-container {
border: 2px solid #d72324;
padding: 10px;
}
</style>
"""


def convert_df(df):
    """Serialize a DataFrame to UTF-8 encoded CSV bytes without the index."""
    return df.to_csv(index=False).encode('utf-8')


def _render_predict_page():
    """Render the "Predict" page: collect video details and run the model."""
    region = st.sidebar.selectbox("Select Region", ['United States'])
    model = read_model(region)
    with st.container():
        col1, col2, col3 = st.columns(3)
        # Defaults shown until metadata has been fetched from a URL.
        fetched_title, fetched_duration, fetched_category = "", 0.0, 1
        with col1:
            url = st.text_input("URL", placeholder="Enter a video URL")
            if url:
                metadata = get_metadata(url)
                if metadata is not None and not metadata.empty:
                    fetched_title = metadata['title'].iloc[0]
                    # number_input requires value and min_value to share a
                    # numeric type, so force the fetched duration to float.
                    fetched_duration = float(metadata['duration'].iloc[0])
                    fetched_category = int(metadata['category_id'].iloc[0])
                    thumbnail_url = metadata['thumbnail_link'].iloc[0]
                    thumbnail = get_picture_from_url(thumbnail_url) if thumbnail_url else None
                    if thumbnail:
                        st.image(thumbnail, caption='Thumbnail captured', width=320, channels="BGR")
        with col2:
            title = st.text_input("Title", placeholder="Enter a video title", value=fetched_title)
            duration = st.number_input("Duration (in seconds)", min_value=0.0, value=fetched_duration)
            category_ids = list(categories.values())
            # Fall back to the first entry when the fetched category id is
            # not in the table instead of raising ValueError from .index().
            if fetched_category in category_ids:
                default_index = category_ids.index(fetched_category)
            else:
                default_index = 0
            category = st.selectbox("Category", list(categories.keys()), index=default_index)
        with col3:
            picture = st.file_uploader("Upload Picture", type=["jpg", "jpeg", "png"])
            if picture is not None:
                # st.picture does not exist; st.image renders the upload.
                st.image(picture, caption='Thumbnail Uploaded', width=400, channels="BGR")

        # Convert category to category ID
        categoryId = categories[category]
        if st.button("Predict"):
            missing_title = title is None or title.strip() == ""
            missing_duration = duration == 0
            if missing_title and missing_duration:
                st.warning("Please enter a title and duration.")
            elif missing_title:
                st.warning("Please enter a title")
            elif missing_duration:
                st.warning("Please enter a duration.")
            else:
                # Only predict once both inputs are present.
                prediction = predict_trend(model, title, duration, categoryId)
                if prediction[0] == 1:
                    st.success("This video is predicted to be a trend!")
                    st.markdown("![Alt Text](https://media.tenor.com/Cyi2zT7wcmcAAAAj/pentol-gif-eak.gif)")
                else:
                    st.info("This video is predicted not to be a trend.")
                    st.markdown("![Alt Text](https://media.tenor.com/VYKtkKnHaUcAAAAj/quby-cute.gif)")


def _render_trending_page():
    """Render the "Trending" page: the trending table and per-video details."""
    tab1, tab2 = st.tabs(["Trending Board", "Video Info"])
    country_code = st.sidebar.selectbox("Select Country Code", ['US', 'CA', 'GB', 'DE', 'FR', 'RU', 'BR', 'IN', 'MY', 'SG', 'JP', 'KR'])
    with st.container():
        with tab1:
            st.write("Top 10 Trending Videos")
            df = get_trending_videos(country_code)
            has_videos = df is not None and not df.empty
            if has_videos:
                st.dataframe(df)
                st.download_button(
                    "Download",
                    convert_df(df),
                    "top10Trending.csv",
                    "text/csv",
                    key='download-csv'
                )
            else:
                st.error('Failed to retrieve trending videos.')
        with tab2:
            if not has_videos:
                # Nothing to select; stop before touching selected_video.
                st.error('Failed to retrieve trending videos.')
                return
            selected_video_title = st.selectbox("Select a Video", df['title'])
            selected_video = df[df['title'] == selected_video_title].iloc[0]
            col4, col5 = st.columns(2)
            with col4:
                image = get_picture_from_url(selected_video['thumbnail_link'])
                if image:
                    st.image(image, caption='Thumbnail captured', width=400, channels="BGR")
            with col5:
                st.write("Title:", selected_video['title'])
                category_name = next(
                    (key for key, value in categories.items() if value == selected_video['category_id']),
                    'Unknown Category'
                )
                st.write("Category:", category_name)
                st.write("Duration:", selected_video['duration'])


def _render_visualize_page():
    """Render the "Visualize" page: one tab per chart/summary pair."""
    tab_specs = [
        ("Best Category", show_top_category, show_best_category),
        ("Best Duration", show_top_duration, show_best_duration),
        ("Best Title", show_top_title, show_best_title),
        ("Best Title Length", show_top_titleLength, show_best_titleLength),
    ]
    with st.container():
        tabs = st.tabs([label for label, _, _ in tab_specs])
        for tab, (_, show_chart, show_summary) in zip(tabs, tab_specs):
            with tab:
                chart_col, summary_col = st.columns(2)
                with chart_col:
                    show_chart()
                with summary_col:
                    show_summary()
def get_picture_from_url(url):
    """Download the raw bytes served at an image URL.

    Args:
        url: Address of the image to fetch.

    Returns:
        The response body as bytes, or None when ``url`` is empty, the
        request fails, times out, or the server answers with an HTTP error
        status.
    """
    if not url:
        # Nothing to fetch; avoid issuing a request that can only fail.
        return None
    try:
        response = requests.get(url, timeout=10)
        # Do not hand an error page back to the caller as image data.
        response.raise_for_status()
    except requests.RequestException:
        return None
    return response.content
def get_top_category(csv_path=None):
    """Load per-category predicted probabilities with rank and name columns.

    Args:
        csv_path: Optional location of the category CSV. When omitted, the
            original hard-coded path is used so existing callers behave as
            before. The file must provide ``predicted_prob`` and
            ``category_id`` columns.

    Returns:
        A DataFrame sorted by ``predicted_prob`` in ascending order with two
        added columns: ``rank`` (1..n in that order) and ``category_name``
        (looked up in ``categories``, 'Unknown Category' when absent).
    """
    if csv_path is None:
        csv_path = r'C:\Users\LEGION\Desktop\MMU\Data Science Fundamental\Project\Prediction of Video\topCategory.csv'
    ranked = pd.read_csv(csv_path).sort_values('predicted_prob')
    # NOTE(review): the sort is ascending, so rank 1 is the LOWEST
    # probability; kept as-is, confirm this is the intended ranking.
    ranked['rank'] = range(1, len(ranked) + 1)

    # Build the id -> name map once instead of scanning the dict per row;
    # setdefault keeps the first name for an id, as the old next() did.
    names_by_id = {}
    for name, category_id in categories.items():
        names_by_id.setdefault(category_id, name)
    ranked['category_name'] = ranked['category_id'].map(
        lambda category_id: names_by_id.get(category_id, 'Unknown Category')
    )
    return ranked
def show_top_category():
    """Draw the category bar chart (rank vs. predicted probability).

    Bars are coloured per category name. The figure is drawn on an explicit
    Axes and closed after rendering so figures do not pile up across
    Streamlit reruns.
    """
    ranked = get_top_category()
    palette = sns.color_palette('Set2', len(ranked['category_id'].unique()))
    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        sns.barplot(data=ranked, x='rank', y='predicted_prob', hue='category_name', palette=palette, ax=ax)
        ax.set_xlabel('Rank')
        ax.set_ylabel('Predicted Probability')
        ax.set_title('Top Categories')
        st.pyplot(fig)
    finally:
        # pyplot keeps a reference to every figure until it is closed.
        plt.close(fig)
def show_best_category():
    """List the three categories with the highest predicted probability.

    Names are written as coloured, bold HTML spans: green for first, yellow
    for second, orange for third. Colours are assigned by position, so the
    list also renders when fewer than three rows are available.
    """
    ranked = get_top_category()
    top_names = ranked.sort_values('predicted_prob', ascending=False)['category_name'].head(3)
    st.header("Top 3 Categories")
    place_colors = ('#339933', '#ffcc33', '#ff9900')
    for color, category_name in zip(place_colors, top_names):
        st.write(f"<span style='color:{color};font-weight:bold;'>{category_name}</span>", unsafe_allow_html=True)
def get_top_duration(csv_path=None):
    """Load duration-range probabilities, highest probability first.

    Args:
        csv_path: Optional location of the duration CSV. When omitted, the
            original hard-coded path is used so existing callers behave as
            before. The file must provide a ``predicted_prob`` column.

    Returns:
        The CSV contents as a DataFrame sorted by ``predicted_prob`` in
        descending order.
    """
    if csv_path is None:
        csv_path = r'C:\Users\LEGION\Desktop\MMU\Data Science Fundamental\Project\Prediction of Video\topDuration.csv'
    return pd.read_csv(csv_path).sort_values('predicted_prob', ascending=False)
def show_top_duration():
    """Draw the duration-range bar chart (range vs. predicted probability).

    Renders an explicit figure through Streamlit and closes it afterwards.
    The previous ``plt.show()`` call and passing the ``plt`` module to
    ``st.pyplot`` are gone: Streamlit needs the Figure object itself.
    """
    durations = get_top_duration()
    sns.set(style='whitegrid')
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.barplot(x='duration_range', y='predicted_prob', data=durations, ax=ax)
        ax.set_xlabel('Duration')
        ax.set_ylabel('Predicted Probability')
        ax.set_title('Top Durations')
        ax.tick_params(axis='x', labelrotation=45)
        st.pyplot(fig)
    finally:
        # pyplot keeps a reference to every figure until it is closed.
        plt.close(fig)
def show_best_duration():
    """List the three duration ranges with the highest predicted probability.

    Ranges are written as coloured, bold HTML spans: green for first, yellow
    for second, orange for third. Colours are assigned by position, so the
    list also renders when fewer than three rows are available.
    """
    durations = get_top_duration()
    top_ranges = durations.sort_values('predicted_prob', ascending=False)['duration_range'].head(3)
    st.header("Top 3 Duration Range")
    place_colors = ('#339933', '#ffcc33', '#ff9900')
    # The loop variable is no longer called `range`, which hid the builtin.
    for color, duration_range in zip(place_colors, top_ranges):
        st.write(f"<span style='color:{color};font-weight:bold;'>{duration_range}</span>", unsafe_allow_html=True)
def get_top_title(csv_path=None):
    """Load title-feature importance scores, most important first.

    Args:
        csv_path: Optional location of the title-feature CSV. When omitted,
            the original hard-coded path is used so existing callers behave
            as before. The file must provide an ``Importance Score`` column.

    Returns:
        The CSV contents as a DataFrame sorted by ``Importance Score`` in
        descending order.
    """
    if csv_path is None:
        csv_path = r'C:\Users\LEGION\Desktop\MMU\Data Science Fundamental\Project\Prediction of Video\topTitle.csv'
    return pd.read_csv(csv_path).sort_values('Importance Score', ascending=False)
def show_top_title():
    """Draw a horizontal bar chart of title features by importance score.

    Renders an explicit figure through Streamlit (instead of passing the
    ``plt`` module) and closes it afterwards so figures do not accumulate
    across reruns.
    """
    title_features = get_top_title()
    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.barplot(x='Importance Score', y='Feature', data=title_features, palette="rocket", ax=ax)
        ax.set_xlabel('Importance Score', fontsize=12)
        ax.set_ylabel('Feature', fontsize=12)
        ax.set_title('Top Title Features', fontsize=14)
        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # pyplot keeps a reference to every figure until it is closed.
        plt.close(fig)
def show_best_title():
    """List the three title features with the highest importance score.

    Features are written as coloured, bold HTML spans: green for first,
    yellow for second, orange for third. Colours are assigned by position,
    so the list also renders when fewer than three rows are available.
    """
    # get_top_title() already returns rows sorted by 'Importance Score'
    # (descending); the extra sort whose result was discarded is removed.
    top_keywords = get_top_title()['Feature'].head(3)
    st.header("Top 3 Keyword")
    place_colors = ('#339933', '#ffcc33', '#ff9900')
    for color, keyword in zip(place_colors, top_keywords):
        st.write(f"<span style='color:{color};font-weight:bold;'>{keyword}</span>", unsafe_allow_html=True)
def round_interval(interval_str):
    """Rewrite an interval string such as "(0.5, 10.9]" with integer bounds.

    Surrounding round or square brackets are stripped, the two
    comma-separated bounds are parsed as floats and then truncated toward
    zero with ``int`` (not rounded to nearest). The result always uses round
    brackets: "(0, 10)".
    """
    lower_text, upper_text = interval_str.strip('()[]').split(',')
    lower_bound = int(float(lower_text))
    upper_bound = int(float(upper_text))
    return "({}, {})".format(lower_bound, upper_bound)
def get_top_titleLength(csv_path=None):
    """Load title-length ranges with probabilities, highest first.

    Args:
        csv_path: Optional location of the title-length CSV. When omitted,
            the original hard-coded path is used so existing callers behave
            as before. The file must provide ``titleLength`` (interval
            strings) and ``predicted_prob`` columns.

    Returns:
        A DataFrame with ``rounded_ranges`` (each interval passed through
        ``round_interval``) and ``predicted_probs``, sorted by
        ``predicted_probs`` in descending order.
    """
    if csv_path is None:
        csv_path = r'C:\Users\LEGION\Desktop\MMU\Data Science Fundamental\Project\Prediction of Video\topTitleLength.csv'
    raw = pd.read_csv(csv_path)
    length_table = pd.DataFrame({
        'rounded_ranges': [round_interval(interval) for interval in raw['titleLength']],
        'predicted_probs': raw['predicted_prob'],
    })
    return length_table.sort_values(by='predicted_probs', ascending=False)
def show_top_titleLength():
    """Draw the title-length bar chart (range vs. predicted probability).

    Renders an explicit figure through Streamlit and closes it afterwards.
    The previous ``plt.show()`` call and passing the ``plt`` module to
    ``st.pyplot`` are gone: Streamlit needs the Figure object itself.
    """
    length_table = get_top_titleLength()
    sns.set(style='whitegrid')
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.barplot(x='rounded_ranges', y='predicted_probs', data=length_table, ax=ax)
        ax.set_xlabel('Title Length Range')
        ax.set_ylabel('Predicted Probability')
        ax.set_title('Top 5 Ranges for Title Length vs. Predicted Probability')
        ax.tick_params(axis='x', labelrotation=45)
        st.pyplot(fig)
    finally:
        # pyplot keeps a reference to every figure until it is closed.
        plt.close(fig)
def show_best_titleLength():
    """List the three title-length ranges with the highest probability.

    Ranges are written as coloured, bold HTML spans: green for first, yellow
    for second, orange for third. Colours are assigned by position, so the
    list also renders when fewer than three rows are available.
    """
    length_table = get_top_titleLength()
    top_ranges = length_table.sort_values('predicted_probs', ascending=False)['rounded_ranges'].head(3)
    st.header("Top 3 Title Length Range")
    place_colors = ('#339933', '#ffcc33', '#ff9900')
    # The loop variable is no longer called `range`, which hid the builtin.
    for color, length_range in zip(place_colors, top_ranges):
        st.write(f"<span style='color:{color};font-weight:bold;'>{length_range}</span>", unsafe_allow_html=True)
# Function to make predictions
def predict_trend(model,title, duration, category_id):
    """Build a one-row feature frame for a video and run the model on it.

    Args:
        model: Object exposing ``predict_proba`` and ``predict``; both are
            called with the feature frame.
        title: Raw video title. It is passed to ``preprocess`` and its raw
            character count becomes the ``titleLength`` feature.
        duration: Video duration; stringified, then cast to float64.
        category_id: Category id; converted with ``int`` and stored as a
            pandas categorical column.

    Returns:
        Whatever ``model.predict`` returns for the single-row frame.
    """
    duration_text = str(duration)
    category_value = int(category_id)
    # presumably preprocess() yields an iterable of tokens -- TODO confirm;
    # they are joined with single spaces to form the cleaned title.
    cleaned_title = ' '.join(preprocess(title))

    features = pd.DataFrame({
        'cleanTitle': [cleaned_title],
        'titleLength' : [len(title)],
        'categoryId': [category_value],
        'duration': [duration_text],
    })
    features = features.astype({'categoryId': 'category', 'duration': 'float64'})

    # Make the prediction (class probabilities are echoed to stdout first)
    print(model.predict_proba(features))
    return model.predict(features)
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()