# Culver's site change monitor: polls pages with headless Chrome and logs
# detected content changes to a CSV file and a SQLite database.
import csv
import hashlib
import logging
import os
import sqlite3
import threading
import time
import traceback

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
def create_database():
try:
conn = sqlite3.connect('/home/user/app/scraped_data/culver/culvers_changes.db')
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS changes
(id INTEGER PRIMARY KEY AUTOINCREMENT,
date TEXT,
time TEXT,
url TEXT,
change TEXT)''')
conn.commit()
conn.close()
logging.info("Database created or already exists")
except Exception as e:
logging.error(f"Error creating database: {e}")
traceback.print_exc()
def insert_change(date, time, url, change):
try:
conn = sqlite3.connect('/home/user/app/scraped_data/culver/culvers_changes.db')
c = conn.cursor()
c.execute("INSERT INTO changes (date, time, url, change) VALUES (?, ?, ?, ?)",
(date, time, url, change))
conn.commit()
conn.close()
logging.info(f"Change inserted: {date} {time} {url}")
except Exception as e:
logging.error(f"Error inserting change: {e}")
traceback.print_exc()
def continuous_monitoring(storage_location, urls, scrape_interval, content_type):
create_database()
os.makedirs(os.path.dirname(storage_location), exist_ok=True)
previous_hashes = {url: "" for url in urls}
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
service = Service('/usr/bin/chromedriver')
logging.info(f"Starting continuous monitoring for URLs: {urls}")
try:
with webdriver.Chrome(service=service, options=options) as driver:
while True:
for url in urls:
try:
logging.info(f"Accessing URL: {url}")
driver.get(url)
time.sleep(2) # Wait for the page to load
if content_type == "text":
current_content = driver.page_source
elif content_type == "media":
current_content = driver.find_elements_by_tag_name("img")
else:
current_content = driver.page_source
current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
if current_hash != previous_hashes[url]:
previous_hashes[url] = current_hash
date_time_str = time.strftime("%Y-%m-%d %H:%M:%S")
date, time_str = date_time_str.split()
change = "Content changed"
with open(storage_location, "a", newline='') as csvfile:
csv_toolkit = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
csv_toolkit.writerow({"date": date, "time": time_str, "url": url, "change": change})
insert_change(date, time_str, url, change)
logging.info(f"Change detected at {url} on {date_time_str}")
else:
logging.info(f"No change detected at {url}")
except Exception as e:
logging.error(f"Error accessing {url}: {e}")
traceback.print_exc()
logging.info(f"Sleeping for {scrape_interval} minutes")
time.sleep(scrape_interval * 60) # Check every scrape_interval minutes
except Exception as e:
logging.error(f"Error in continuous monitoring: {e}")
traceback.print_exc()
def start_background_monitoring(storage_location, urls, scrape_interval, content_type):
thread = threading.Thread(target=continuous_monitoring, args=(storage_location, urls, scrape_interval, content_type))
thread.daemon = True
thread.start()
logging.info("Background monitoring started")