import threading
import time
import hashlib
import sqlite3
import csv
import os
import logging
import traceback

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

DB_PATH = '/home/user/app/scraped_data/culver/culvers_changes.db'


def create_database():
    """Create the SQLite change-log table if it does not already exist."""
    try:
        conn = sqlite3.connect(DB_PATH)
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS changes
                     (id INTEGER PRIMARY KEY AUTOINCREMENT,
                      date TEXT,
                      time TEXT,
                      url TEXT,
                      change TEXT)''')
        conn.commit()
        conn.close()
        logging.info("Database created or already exists")
    except Exception as e:
        logging.error(f"Error creating database: {e}")
        traceback.print_exc()


def insert_change(date, time_str, url, change):
    """Record a single detected change in the SQLite database."""
    try:
        conn = sqlite3.connect(DB_PATH)
        c = conn.cursor()
        c.execute("INSERT INTO changes (date, time, url, change) VALUES (?, ?, ?, ?)",
                  (date, time_str, url, change))
        conn.commit()
        conn.close()
        logging.info(f"Change inserted: {date} {time_str} {url}")
    except Exception as e:
        logging.error(f"Error inserting change: {e}")
        traceback.print_exc()


def continuous_monitoring(storage_location, urls, scrape_interval, content_type):
    """Poll each URL every `scrape_interval` minutes and log content changes
    to both a CSV file at `storage_location` and the SQLite database."""
    create_database()
    # Ensure the directory for the CSV file exists (skip if only a bare filename was given).
    csv_dir = os.path.dirname(storage_location)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)
    previous_hashes = {url: "" for url in urls}

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    service = Service('/usr/bin/chromedriver')

    logging.info(f"Starting continuous monitoring for URLs: {urls}")

    try:
        with webdriver.Chrome(service=service, options=options) as driver:
            while True:
                for url in urls:
                    try:
                        logging.info(f"Accessing URL: {url}")
                        driver.get(url)
                        time.sleep(2)  # Wait for the page to load

                        if content_type == "media":
                            # Hash the image sources rather than the WebElement objects,
                            # whose reprs change on every page load.
                            images = driver.find_elements(By.TAG_NAME, "img")
                            current_content = "".join(
                                sorted(img.get_attribute("src") or "" for img in images))
                        else:
                            # "text" and any other value fall back to the full page source.
                            current_content = driver.page_source

                        current_hash = hashlib.md5(current_content.encode('utf-8')).hexdigest()

                        if current_hash != previous_hashes[url]:
                            previous_hashes[url] = current_hash
                            date_time_str = time.strftime("%Y-%m-%d %H:%M:%S")
                            date, time_str = date_time_str.split()
                            change = "Content changed"

                            # Append to the CSV log, writing the header once for a new/empty file.
                            write_header = (not os.path.exists(storage_location)
                                            or os.path.getsize(storage_location) == 0)
                            with open(storage_location, "a", newline='') as csvfile:
                                writer = csv.DictWriter(
                                    csvfile, fieldnames=["date", "time", "url", "change"])
                                if write_header:
                                    writer.writeheader()
                                writer.writerow({"date": date, "time": time_str,
                                                 "url": url, "change": change})

                            insert_change(date, time_str, url, change)
                            logging.info(f"Change detected at {url} on {date_time_str}")
                        else:
                            logging.info(f"No change detected at {url}")
                    except Exception as e:
                        logging.error(f"Error accessing {url}: {e}")
                        traceback.print_exc()

                logging.info(f"Sleeping for {scrape_interval} minutes")
                time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
    except Exception as e:
        logging.error(f"Error in continuous monitoring: {e}")
        traceback.print_exc()


def start_background_monitoring(storage_location, urls, scrape_interval, content_type):
    """Run continuous_monitoring in a daemon thread so the caller is not blocked."""
    thread = threading.Thread(target=continuous_monitoring,
                              args=(storage_location, urls, scrape_interval, content_type))
    thread.daemon = True
    thread.start()
    logging.info("Background monitoring started")
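

# Minimal usage sketch. The CSV path, URL, and 5-minute interval below are
# illustrative assumptions, not values defined by this module; the module does
# not call logging.basicConfig itself, so the example does it here.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    start_background_monitoring(
        storage_location="/home/user/app/scraped_data/culver/culvers_changes.csv",  # assumed path
        urls=["https://www.culvers.com/"],  # assumed URL
        scrape_interval=5,                  # minutes between checks
        content_type="text",                # "media" hashes <img> sources instead
    )
    # The monitor runs in a daemon thread, so keep the main thread alive.
    while True:
        time.sleep(60)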