Spaces:
Sleeping
Sleeping
import os | |
import requests | |
from bs4 import BeautifulSoup | |
import sqlite3 | |
from dotenv import load_dotenv | |
# Load environment variables | |
load_dotenv() | |
# Database setup | |
def create_database():
    """Create the ``courses`` table in ``courses.db`` if it is missing.

    The table has an integer primary key ``id`` and three text columns:
    ``title``, ``description`` and ``price``. Because the statement uses
    ``IF NOT EXISTS``, calling this function repeatedly is harmless.

    Raises:
        sqlite3.Error: If the database cannot be opened or the statement
            fails. The connection is closed before the error propagates.
    """
    conn = sqlite3.connect('courses.db')
    try:
        # 'price' is stored as text because it holds the scraped price label
        # rather than a numeric amount.
        conn.execute(
            '''CREATE TABLE IF NOT EXISTS courses
               (id INTEGER PRIMARY KEY, title TEXT, description TEXT, price TEXT)'''
        )
        conn.commit()
    finally:
        # Always release the handle, even when table creation fails.
        conn.close()
# Web scraping function to get course data from a specific page | |
def _text_or_default(tag, default):
    """Return the stripped text of *tag*, or *default* when *tag* is missing."""
    return tag.text.strip() if tag else default


def scrape_courses_from_page(page_number, timeout=30):
    """Scrape one page of the course listing and return its course records.

    Args:
        page_number: Value substituted into the ``page`` query parameter of
            the listing URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        A list of ``(title, description, price)`` tuples, one per
        ``<li class="products__list-item">`` element found on the page.
        Missing fields are replaced by a placeholder string.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status (via ``raise_for_status``), so an error
            page is not silently treated as an empty listing.
    """
    url = f"https://courses.analyticsvidhya.com/collections/courses?page={page_number}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    courses = []
    # Each course card is an <li> inside the products list.
    for course_item in soup.find_all('li', class_='products__list-item'):
        # The card's <h3> carries the title.
        title = _text_or_default(course_item.find('h3'), 'No title available')
        price = _text_or_default(
            course_item.find('span', class_='course-card__price'),
            "Price not available",
        )
        # There is no dedicated description element; the card's <h4> text is
        # used as the description.
        description = _text_or_default(
            course_item.find('h4'), 'No description available'
        )
        courses.append((title, description, price))
    return courses
# Scrape all pages (total 8 pages) and insert data into the database | |
def scrape_all_pages(total_pages=8):
    """Scrape listing pages 1 through *total_pages* and combine the results.

    Args:
        total_pages: Number of listing pages to fetch, starting at page 1.
            Defaults to 8, the page count this script was written for.

    Returns:
        A single list containing the course tuples from every page, in page
        order.
    """
    all_courses = []
    for page in range(1, total_pages + 1):
        # Progress output so a long run shows which page is being fetched.
        print(f"Scraping page {page}...")
        all_courses.extend(scrape_courses_from_page(page))
    return all_courses
# Insert scraped data into the database | |
def insert_data_to_db(courses):
    """Insert course rows into the ``courses`` table of ``courses.db``.

    Args:
        courses: Iterable of ``(title, description, price)`` tuples. An empty
            iterable inserts nothing.

    Raises:
        sqlite3.Error: If the insert fails, for example because the
            ``courses`` table does not exist. Nothing is committed in that
            case, and the connection is closed before the error propagates.
    """
    conn = sqlite3.connect('courses.db')
    try:
        # Parameterized placeholders keep scraped text from being interpreted
        # as SQL.
        conn.executemany(
            'INSERT INTO courses (title, description, price) VALUES (?, ?, ?)',
            courses,
        )
        conn.commit()
    finally:
        # Always release the handle, even when the insert fails.
        conn.close()
if __name__ == "__main__":
    # Script entry point. The table must exist before any rows are inserted,
    # so the database is created first.
    create_database()
    all_courses = scrape_all_pages()
    insert_data_to_db(all_courses)
    # Plain string: the message has no placeholders, so no f-prefix is needed.
    print("Data from all pages has been successfully scraped and inserted into the database.")