xinah3131 committed on
Commit
ef22d5e
1 Parent(s): e60e50d

Upload 3 files

Files changed (3)
  1. apiSearch.py +92 -0
  2. app3.py +90 -0
  3. preprocessText.py +33 -0
apiSearch.py ADDED
@@ -0,0 +1,92 @@
+ import re
+ import pandas as pd
+ from urllib.parse import urlparse, parse_qs
+ from preprocessText import preprocess
+ from googleapiclient.discovery import build
+ import isodate
+
+ # Pool of API keys, rotated between requests to spread quota usage
+ api_keys = ['AIzaSyC4hp-RHBw5uY4NcthYw-A2fqYyrG22kaE',
+             'AIzaSyC7KzwigUsNJ4KNvqGfPqXVK9QcDBsKU78',
+             'AIzaSyDEPBCb1PhEaYHuBgzW6D5-ldTHUCowuq4',
+             'AIzaSyD-LN8Z7xG8OHtMQ89GRDvIaRQwkVHzfEo',
+             'AIzaSyCW5J_uI37UPmq3mJVAhVdWNdGSMAMg5tI',
+             'AIzaSyC8VVO0DhDY91lfyqqaUW85VKriqBiahBA',
+             'AIzaSyDC744JL3Xa3eORSxORoxKpunKFPPMGb3Y',
+             'AIzaSyA-DwJmtgWFO-I-Dwv1hcISJKXGDjbpZok',
+             'AIzaSyDC744JL3Xa3eORSxORoxKpunKFPPMGb3Y',
+             'AIzaSyD74KqDih_2AyOIJV-HaIvU9DdUOIyRONs',
+             'AIzaSyALgq5vR27iGsuFuLiz-Ry4NGy6E-L1PUY',
+             'AIzaSyC4hp-RHBw5uY4NcthYw-A2fqYyrG22kaE']
+
+ current_key_index = 0  # Index into api_keys; advanced by get_next_api_key()
+
+ def get_video_id(url):
+     # Extract the video ID from a youtu.be or youtube.com watch URL
+     video_id = None
+     parsed_url = urlparse(url)
+     query_params = parse_qs(parsed_url.query)
+
+     if parsed_url.netloc == 'youtu.be':
+         video_id = parsed_url.path[1:]
+     elif parsed_url.netloc in ('www.youtube.com', 'youtube.com'):
+         if 'v' in query_params:
+             video_id = query_params['v'][0]
+     return video_id
+
+ def get_next_api_key():
+     global current_key_index
+     current_key_index = (current_key_index + 1) % len(api_keys)
+     return api_keys[current_key_index]
+
+ def get_video_metadata(video_id):
+     try:
+         # Get the next API key
+         api_key = get_next_api_key()
+
+         # Set up the YouTube Data API client
+         youtube = build('youtube', 'v3', developerKey=api_key)
+
+         # Call the API to retrieve video metadata
+         response = youtube.videos().list(
+             part='snippet,contentDetails,statistics',
+             id=video_id
+         ).execute()
+
+         # Extract the relevant metadata
+         if 'items' in response and len(response['items']) > 0:
+             video = response['items'][0]
+             metadata = {
+                 'title': video['snippet']['title'],
+                 'description': video['snippet']['description'],
+                 'channel_title': video['snippet']['channelTitle'],
+                 'publish_date': video['snippet']['publishedAt'],
+                 'duration': video['contentDetails']['duration'],
+                 'views': video['statistics']['viewCount'],
+                 # likeCount/commentCount are absent when likes or comments
+                 # are disabled, so fall back to 0
+                 'likes': video['statistics'].get('likeCount', 0),
+                 'comments': video['statistics'].get('commentCount', 0),
+                 'category_id': video['snippet']['categoryId'],
+                 'thumbnail_link': video['snippet']['thumbnails']['default']['url']
+             }
+             return metadata
+
+     except Exception as e:
+         print("An error occurred:", str(e))
+
+     return None
+
+ def get_metadata(url):
+     # Look up the video and build a one-row DataFrame of model features
+     video_id = get_video_id(url)
+     metadata = get_video_metadata(video_id)
+     if metadata is not None:
+         # Create a DataFrame from the metadata
+         df = pd.DataFrame([metadata])
+         # ISO 8601 duration (e.g. 'PT4M13S') -> seconds
+         df['duration'] = df['duration'].apply(lambda x: isodate.parse_duration(x).total_seconds())
+         df['cleanTitle'] = df['title'].apply(preprocess)
+         df['cleanTitle'] = df['cleanTitle'].apply(lambda x: ' '.join(x))
+         df['titleLength'] = df['title'].apply(len)
+         df['descriptionLength'] = df['description'].apply(len)
+
+         return df
+     else:
+         # Return None (not 0) so callers can test `is not None`
+         return None
+
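For reference, a minimal sketch of exercising get_metadata outside the app. The URL is only an example; a working key from the pool, network access, and remaining quota are assumed:

from apiSearch import get_metadata

# Example URL for illustration; any watch?v= or youtu.be link works
df = get_metadata('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
if df is not None:
    # One-row DataFrame: 'duration' is already converted to seconds
    print(df[['title', 'duration', 'cleanTitle', 'titleLength']].to_string(index=False))
else:
    print('Metadata lookup failed (bad URL, private video, or exhausted quota)')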
app3.py ADDED
@@ -0,0 +1,90 @@
+ import streamlit as st
+ import pickle
+ import pandas as pd
+ import joblib
+ from preprocessText import preprocess
+ from apiSearch import get_metadata
+
+
+ # Load the trained model
+ model = joblib.load('85pct.pkl')
+
+ # Map category names to YouTube category IDs
+ categories = {
+     'Film & Animation': 1,
+     'Autos & Vehicles': 2,
+     'Music': 10,
+     'Pets & Animals': 15,
+     'Sports': 17,
+     'Short Movies': 18,
+     'Travel & Events': 19,
+     'Gaming': 20,
+     'Videoblogging': 21,
+     'People & Blogs': 22,
+     'Comedy': 23,
+     'Entertainment': 24,
+     'News & Politics': 25,
+     'Howto & Style': 26,
+     'Education': 27,
+     'Science & Technology': 28,
+     'Nonprofits & Activism': 29
+ }
+
+ # Create the Streamlit web application
+ def main():
+     st.title("YouTube Trend Prediction")
+     st.write("Enter the video details below:")
+     getTitle = ""
+     getDuration = 0.00
+     getCategory = 1
+
+     # Input fields; a URL pre-fills the fields from the video's metadata
+     url = st.text_input("URL")
+     if url:
+         metadata = get_metadata(url)
+         if metadata is not None:
+             getTitle = metadata['title'].iloc[0]
+             getDuration = float(metadata['duration'].iloc[0])
+             getCategory = int(metadata['category_id'].iloc[0])
+         else:
+             st.warning("Could not fetch metadata for that URL.")
+
+     title = st.text_input("Title", value=getTitle)
+     duration = st.number_input("Duration (in seconds)", min_value=0.0, value=getDuration)
+     # Fall back to the first category if the video's ID is not in the map
+     category_values = list(categories.values())
+     category_index = category_values.index(getCategory) if getCategory in category_values else 0
+     category = st.selectbox("Category", list(categories.keys()), index=category_index)
+
+     # Convert category name to category ID
+     categoryId = categories[category]
+
+     # Predict button
+     if st.button("Predict"):
+         # Perform prediction
+         prediction = predict_trend(title, duration, categoryId)
+
+         if prediction[0] == 1:
+             st.success("This video is predicted to be a trend!")
+         else:
+             st.info("This video is predicted not to be a trend.")
+
+ # Build a one-row feature frame matching the training columns and run the model
+ def predict_trend(title, duration, category_id):
+     clean_new_title = preprocess(title)
+     # Join the preprocessed words back into a string
+     clean_new_title_str = ' '.join(clean_new_title)
+     # Prepare the input data
+     data = pd.DataFrame({
+         'cleanTitle': [clean_new_title_str],
+         'titleLength': [len(title)],
+         'categoryId': [str(category_id)],
+         'duration': [float(duration)]
+     })
+     data['categoryId'] = data['categoryId'].astype('category')
+     data['duration'] = data['duration'].astype('float64')
+     # Debug: log class probabilities to the console
+     print(model.predict_proba(data))
+     # Make the prediction
+     prediction = model.predict(data)
+     return prediction
+
+ if __name__ == "__main__":
+     main()
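The app itself is launched with streamlit run app3.py. For a quick check without the UI, predict_trend can be called directly; this sketch uses invented inputs and assumes 85pct.pkl is present so the module-level joblib.load succeeds:

from app3 import predict_trend

# Invented title/duration/category for illustration
pred = predict_trend('How to train your model in 10 minutes', 600.0, 27)
print('trend' if pred[0] == 1 else 'not a trend')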
preprocessText.py ADDED
@@ -0,0 +1,33 @@
+ import re
+ import nltk
+ from nltk.tokenize import word_tokenize
+ from nltk.stem import WordNetLemmatizer
+ from nltk.corpus import stopwords
+ nltk.download('punkt')
+ nltk.download('stopwords')
+ nltk.download('wordnet')
+ nltk.download('omw-1.4')
+ stop_words = set(stopwords.words('english'))  # set of English stop words
+ lemmatizer = WordNetLemmatizer()
+
+ def preprocess(text, target_language='en'):
+     # Coerce non-string input (e.g. floats from pandas) to str
+     if not isinstance(text, str):
+         try:
+             text = str(text)
+         except Exception:
+             raise TypeError('Input must be convertible to a string')
+     # Convert to lowercase
+     text = text.lower()
+     # Remove URLs
+     text = re.sub(r'http\S+', '', text)
+     # Remove special characters and punctuation
+     text = re.sub(r'[^a-zA-Z\s]', '', text)
+     # Collapse characters repeated three or more times to a single one
+     text = re.sub(r'(.)\1{2,}', r'\1', text)
+
+     words = word_tokenize(text)
+     words = [lemmatizer.lemmatize(w) for w in words]
+     words = [w for w in words if w not in stop_words]
+     return words
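A small sketch of what preprocess returns; the input string is invented, and the output shown is what the rules above yield:

from preprocessText import preprocess

# URLs stripped, punctuation removed, 3+ repeated letters collapsed to one,
# tokens lemmatized, stop words dropped
print(preprocess('Soooo coool!!! Check this out: http://example.com cats running'))
# -> ['col', 'check', 'cat', 'running']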