Upload 3 files
- apiSearch.py +92 -0
- app3.py +90 -0
- preprocessText.py +33 -0
apiSearch.py
ADDED
@@ -0,0 +1,92 @@
import re
import pandas as pd
from urllib.parse import urlparse, parse_qs
from preprocessText import preprocess
from googleapiclient.discovery import build
import isodate

# Pool of YouTube Data API keys, rotated round-robin to spread quota usage
api_keys = ['AIzaSyC4hp-RHBw5uY4NcthYw-A2fqYyrG22kaE',
            'AIzaSyC7KzwigUsNJ4KNvqGfPqXVK9QcDBsKU78',
            'AIzaSyDEPBCb1PhEaYHuBgzW6D5-ldTHUCowuq4',
            'AIzaSyD-LN8Z7xG8OHtMQ89GRDvIaRQwkVHzfEo',
            'AIzaSyCW5J_uI37UPmq3mJVAhVdWNdGSMAMg5tI',
            'AIzaSyC8VVO0DhDY91lfyqqaUW85VKriqBiahBA',
            'AIzaSyDC744JL3Xa3eORSxORoxKpunKFPPMGb3Y',
            'AIzaSyA-DwJmtgWFO-I-Dwv1hcISJKXGDjbpZok',
            'AIzaSyDC744JL3Xa3eORSxORoxKpunKFPPMGb3Y',
            'AIzaSyD74KqDih_2AyOIJV-HaIvU9DdUOIyRONs',
            'AIzaSyALgq5vR27iGsuFuLiz-Ry4NGy6E-L1PUY',
            'AIzaSyC4hp-RHBw5uY4NcthYw-A2fqYyrG22kaE']

current_key_index = 0  # Module-level index used by get_next_api_key() to rotate keys

def get_video_id(url):
    video_id = None
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)

    if parsed_url.netloc == 'youtu.be':
        video_id = parsed_url.path[1:]
    elif parsed_url.netloc in ('www.youtube.com', 'youtube.com'):
        if 'v' in query_params:
            video_id = query_params['v'][0]
    return video_id

def get_next_api_key():
    global current_key_index
    current_key_index = (current_key_index + 1) % len(api_keys)
    return api_keys[current_key_index]

def get_video_metadata(video_id):
    try:
        # Get the next API key
        api_key = get_next_api_key()

        # Set up the YouTube Data API client
        youtube = build('youtube', 'v3', developerKey=api_key)

        # Call the API to retrieve video metadata
        response = youtube.videos().list(
            part='snippet,contentDetails,statistics',
            id=video_id
        ).execute()

        # Extract the relevant metadata
        if 'items' in response and len(response['items']) > 0:
            video = response['items'][0]
            metadata = {
                'title': video['snippet']['title'],
                'description': video['snippet']['description'],
                'channel_title': video['snippet']['channelTitle'],
                'publish_date': video['snippet']['publishedAt'],
                'duration': video['contentDetails']['duration'],
                'views': video['statistics']['viewCount'],
                'likes': video['statistics']['likeCount'],
                'comments': video['statistics']['commentCount'],
                'category_id': video['snippet']['categoryId'],
                'thumbnail_link': video['snippet']['thumbnails']['default']['url']
            }
            return metadata

    except Exception as e:
        print("An error occurred:", str(e))

    return None

def get_metadata(url):
    # Resolve the video id from the URL and fetch its metadata
    video_id = get_video_id(url)
    metadata = get_video_metadata(video_id)
    if metadata is not None:
        # Create a DataFrame from the metadata
        df = pd.DataFrame([metadata])
        # Convert the ISO 8601 duration string to seconds
        df['duration'] = df['duration'].apply(lambda x: isodate.parse_duration(x).total_seconds())
        df['cleanTitle'] = df['title'].apply(preprocess)
        df['cleanTitle'] = df['cleanTitle'].apply(lambda x: ' '.join(x))
        df['titleLength'] = df['title'].apply(lambda x: len(x))
        df['descriptionLength'] = df['description'].apply(lambda x: len(x))

        return df
    else:
        return 0
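A minimal sketch of how this module's get_metadata can be exercised on its own; the watch URL below is a hypothetical example, and the call only succeeds if one of the keys in api_keys still has quota:

import pandas as pd
from apiSearch import get_metadata

# Hypothetical watch URL, used purely for illustration
df = get_metadata('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
if isinstance(df, pd.DataFrame):
    # Single-row DataFrame with raw and derived fields; duration is in seconds
    print(df[['title', 'duration', 'category_id', 'titleLength']])
else:
    print('Metadata could not be retrieved')  # get_metadata returns 0 on failure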
app3.py
ADDED
@@ -0,0 +1,90 @@
import streamlit as st
import pickle
import pandas as pd
import joblib
from preprocessText import preprocess
from apiSearch import get_metadata


# Load the model
model = joblib.load('85pct.pkl')

# Define the categories
categories = {
    'Film & Animation': 1,
    'Autos & Vehicles': 2,
    'Music': 10,
    'Pets & Animals': 15,
    'Sports': 17,
    'Short Movies': 18,
    'Travel & Events': 19,
    'Gaming': 20,
    'Videoblogging': 21,
    'People & Blogs': 22,
    'Comedy': 23,
    'Entertainment': 24,
    'News & Politics': 25,
    'Howto & Style': 26,
    'Education': 27,
    'Science & Technology': 28,
    'Nonprofits & Activism': 29
}

# Create the Streamlit web application
def main():
    st.title("YouTube Trend Prediction")
    st.write("Enter the video details below:")
    getTitle = ""
    getDuration = 0.00
    getCategory = 1

    # Input fields
    url = st.text_input("URL")
    if url:
        metadata = get_metadata(url)
        # get_metadata returns 0 when the video could not be fetched
        if isinstance(metadata, pd.DataFrame):
            getTitle = metadata['title'].iloc[0]
            getDuration = float(metadata['duration'].iloc[0])
            category_id = metadata['category_id'].iloc[0]
            getCategory = int(category_id)

    title = st.text_input("Title", value=getTitle)
    duration = st.number_input("Duration (in seconds)", min_value=0.0, value=getDuration)
    category = st.selectbox("Category", list(categories.keys()), index=list(categories.values()).index(getCategory))

    # Convert category to category ID
    categoryId = categories[category]

    # Predict button
    if st.button("Predict"):
        # Perform prediction
        prediction = predict_trend(title, duration, categoryId)

        if prediction[0] == 1:
            st.success("This video is predicted to be a trend!")
        else:
            st.info("This video is predicted not to be a trend.")

# Function to make predictions
def predict_trend(title, duration, category_id):
    duration = str(duration)
    category_id = str(category_id)
    clean_new_title = preprocess(title)
    # Join the preprocessed words back into a string
    clean_new_title_str = ' '.join(clean_new_title)
    # Prepare the input data
    data = {
        'cleanTitle': [clean_new_title_str],
        'titleLength': [len(title)],
        'categoryId': [category_id],
        'duration': [duration]
    }
    data = pd.DataFrame(data)
    data['categoryId'] = data['categoryId'].astype('category')
    data['duration'] = data['duration'].astype('float64')
    # Make the prediction
    print(model.predict_proba(data))
    prediction = model.predict(data)
    return prediction

if __name__ == "__main__":
    main()
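A short sketch of calling predict_trend directly, as a sanity check on the feature layout the pickled model expects; it assumes 85pct.pkl sits next to app3.py (the model is loaded at import time), and the title, duration, and category id below are made-up values:

from app3 import predict_trend

# Hypothetical inputs: title text, duration in seconds, YouTube category id
result = predict_trend('My unbelievable travel vlog', 754.0, 19)
print('Trending' if result[0] == 1 else 'Not trending')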
preprocessText.py
ADDED
@@ -0,0 +1,33 @@
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))  # set of English stop words
lemmatizer = WordNetLemmatizer()

def preprocess(text, target_language='en'):

    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception:
            raise TypeError('Input must be a string or a float')
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse characters repeated three or more times
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(w) for w in words]
    words = [w for w in words if w not in stop_words]
    return words
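Finally, a small usage sketch of preprocess, showing the token list that apiSearch.py and app3.py join back into the cleanTitle feature; the output in the comments is indicative and may vary with the installed NLTK data:

from preprocessText import preprocess

tokens = preprocess('CRAZY!!! new AI tool changes EVERYTHING http://example.com')
print(tokens)             # e.g. ['crazy', 'new', 'ai', 'tool', 'change', 'everything']
print(' '.join(tokens))   # joined form used as the cleanTitle feature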