Spaces:
Sleeping
Sleeping
import re | |
import pandas as pd | |
from urllib.parse import urlparse, parse_qs | |
from preprocessText import preprocess | |
from googleapiclient.discovery import build | |
import isodate | |
api_keys = ['AIzaSyC4hp-RHBw5uY4NcthYw-A2fqYyrG22kaE', | |
'AIzaSyC7KzwigUsNJ4KNvqGfPqXVK9QcDBsKU78', | |
'AIzaSyDEPBCb1PhEaYHuBgzW6D5-ldTHUCowuq4', | |
'AIzaSyD-LN8Z7xG8OHtMQ89GRDvIaRQwkVHzfEo', | |
'AIzaSyCW5J_uI37UPmq3mJVAhVdWNdGSMAMg5tI', | |
'AIzaSyC8VVO0DhDY91lfyqqaUW85VKriqBiahBA', | |
'AIzaSyDC744JL3Xa3eORSxORoxKpunKFPPMGb3Y', | |
'AIzaSyA-DwJmtgWFO-I-Dwv1hcISJKXGDjbpZok', | |
'AIzaSyDC744JL3Xa3eORSxORoxKpunKFPPMGb3Y', | |
'AIzaSyD74KqDih_2AyOIJV-HaIvU9DdUOIyRONs', | |
'AIzaSyALgq5vR27iGsuFuLiz-Ry4NGy6E-L1PUY', | |
'AIzaSyC4hp-RHBw5uY4NcthYw-A2fqYyrG22kaE'] | |
current_key_index = 0 # Declare current_key_index as a global variable | |
def get_video_id(url): | |
video_id = None | |
parsed_url = urlparse(url) | |
query_params = parse_qs(parsed_url.query) | |
if parsed_url.netloc == 'youtu.be': | |
video_id = parsed_url.path[1:] | |
elif parsed_url.netloc in ('www.youtube.com', 'youtube.com'): | |
if 'v' in query_params: | |
video_id = query_params['v'][0] | |
return video_id | |
def get_next_api_key(): | |
global current_key_index | |
current_key_index = (current_key_index + 1) % len(api_keys) | |
return api_keys[current_key_index] | |
def get_video_metadata(video_id): | |
try: | |
# Get the next API key | |
api_key = get_next_api_key() | |
# Set up the YouTube Data API client | |
youtube = build('youtube', 'v3', developerKey=api_key) | |
# Call the API to retrieve video metadata | |
response = youtube.videos().list( | |
part='snippet,contentDetails,statistics', | |
id=video_id | |
).execute() | |
# Extract the relevant metadata | |
if 'items' in response and len(response['items']) > 0: | |
video = response['items'][0] | |
metadata = { | |
'title': video['snippet']['title'], | |
'description': video['snippet']['description'], | |
'channel_title': video['snippet']['channelTitle'], | |
'publish_date': video['snippet']['publishedAt'], | |
'duration': video['contentDetails']['duration'], | |
'views': video['statistics']['viewCount'], | |
'likes': video['statistics']['likeCount'], | |
'comments': video['statistics']['commentCount'], | |
'category_id': video['snippet']['categoryId'], | |
'thumbnail_link': video['snippet']['thumbnails']['default']['url'] | |
} | |
return metadata | |
except Exception as e: | |
print("An error occurred:", str(e)) | |
return None | |
def get_metadata(url): | |
# Set up the YouTube Data API client | |
video_id = get_video_id(url) | |
metadata = get_video_metadata(video_id) | |
if metadata is not None: | |
# Create a DataFrame from the metadata | |
df = pd.DataFrame([metadata]) | |
df['duration'] = df['duration'].apply(lambda x: isodate.parse_duration(x).total_seconds()) | |
df['cleanTitle'] = df['title'].apply(preprocess) | |
df['cleanTitle'] = df['cleanTitle'].apply(lambda x: ' '.join(x)) | |
df['titleLength'] = df['title'].apply(lambda x: len(x)) | |
df['descriptionLength'] = df['description'].apply(lambda x: len(x)) | |
return df | |
else: | |
return 0 | |