acecalisto3 committed on
Commit
b5dac12
1 Parent(s): 5b54c93

Create background_tasks.py

Browse files
Files changed (1) hide show
  1. background_tasks.py +105 -0
background_tasks.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import threading
2
+ import time
3
+ from selenium import webdriver
4
+ from selenium.webdriver.chrome.service import Service
5
+ from selenium.webdriver.chrome.options import Options
6
+ import hashlib
7
+ import sqlite3
8
+ import csv
9
+ import os
10
+ import logging
11
+ import traceback
12
+
13
def create_database(db_path='/home/user/app/scraped_data/culver/culvers_changes.db'):
    """Create the SQLite ``changes`` table if it does not already exist.

    Args:
        db_path: Path of the SQLite database file. Defaults to the location
            this module has always used, so callers that pass no argument
            keep their existing behaviour.

    Returns:
        None. Failures are logged and the traceback is printed; nothing is
        raised to the caller (best-effort, as before).
    """
    conn = None
    try:
        # sqlite3.connect() cannot create missing directories, so make sure
        # the parent exists first. dirname is "" for bare filenames and for
        # ":memory:", in which case there is nothing to create.
        parent_dir = os.path.dirname(db_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        conn = sqlite3.connect(db_path)
        conn.execute('''CREATE TABLE IF NOT EXISTS changes
                        (id INTEGER PRIMARY KEY AUTOINCREMENT,
                         date TEXT,
                         time TEXT,
                         url TEXT,
                         change TEXT)''')
        conn.commit()
        logging.info("Database created or already exists")
    except Exception as e:
        logging.error(f"Error creating database: {e}")
        traceback.print_exc()
    finally:
        # Close on every path; previously an error in execute()/commit()
        # skipped close() and leaked the connection.
        if conn is not None:
            conn.close()
29
+
30
def insert_change(date, time, url, change,
                  db_path='/home/user/app/scraped_data/culver/culvers_changes.db'):
    """Append one row to the ``changes`` table.

    Args:
        date: Date string stored in the ``date`` column.
        time: Time string stored in the ``time`` column. (The name shadows
            the ``time`` module inside this function only; it is kept
            because it is part of the public signature.)
        url: URL the change was observed on.
        change: Description of the change.
        db_path: Path of the SQLite database file. Defaults to the location
            this module has always used.

    Returns:
        None. Failures (for example a missing table) are logged and the
        traceback is printed; nothing is raised to the caller.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        # Parameterized query: values are bound, never interpolated.
        conn.execute(
            "INSERT INTO changes (date, time, url, change) VALUES (?, ?, ?, ?)",
            (date, time, url, change),
        )
        conn.commit()
        logging.info(f"Change inserted: {date} {time} {url}")
    except Exception as e:
        logging.error(f"Error inserting change: {e}")
        traceback.print_exc()
    finally:
        # Close on every path; previously a failed INSERT skipped close()
        # and leaked the connection.
        if conn is not None:
            conn.close()
42
+
43
def _content_fingerprint(driver, content_type):
    """Return an MD5 hex digest of the watched part of the current page."""
    if content_type == "media":
        # Local import keeps this block self-contained; selenium is already
        # a dependency of this module.
        from selenium.webdriver.common.by import By

        # find_elements_by_tag_name() no longer exists in current Selenium 4
        # releases. Hash the image sources rather than the WebElement
        # objects, whose repr describes the element handle, not the image.
        images = driver.find_elements(By.TAG_NAME, "img")
        content = [image.get_attribute("src") for image in images]
    else:
        # "text" and any unrecognised content_type both watch the full
        # page source.
        content = driver.page_source
    return hashlib.md5(str(content).encode('utf-8')).hexdigest()


def continuous_monitoring(storage_location, urls, scrape_interval, content_type):
    """Poll ``urls`` forever and record every detected content change.

    Each pass loads every URL in a headless Chrome session, hashes the
    watched content and compares it with the hash from the previous pass.
    A difference is appended to the CSV file at ``storage_location`` and
    inserted into the SQLite ``changes`` table via ``insert_change``.

    The stored hash for each URL starts out empty, so the first successful
    load of every URL is always recorded as a change.

    Args:
        storage_location: Path of the CSV file that change rows are
            appended to. Rows are written without a header line.
        urls: Iterable of URLs to watch.
        scrape_interval: Minutes to sleep between passes.
        content_type: ``"media"`` watches the ``src`` of ``<img>`` elements;
            ``"text"`` and any other value watch the full page source.

    Returns:
        None. The loop only ends if the browser session itself fails; that
        error is logged and the function returns. Per-URL errors are logged
        and the loop moves on to the next URL.
    """
    create_database()

    # dirname is "" when storage_location is a bare filename, and
    # os.makedirs("") raises, so only create a directory when one is named.
    storage_dir = os.path.dirname(storage_location)
    if storage_dir:
        os.makedirs(storage_dir, exist_ok=True)

    previous_hashes = {url: "" for url in urls}

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    service = Service('/usr/bin/chromedriver')

    logging.info(f"Starting continuous monitoring for URLs: {urls}")

    try:
        with webdriver.Chrome(service=service, options=options) as driver:
            while True:
                for url in urls:
                    try:
                        logging.info(f"Accessing URL: {url}")
                        driver.get(url)
                        time.sleep(2)  # Wait for the page to load

                        current_hash = _content_fingerprint(driver, content_type)
                        if current_hash == previous_hashes[url]:
                            logging.info(f"No change detected at {url}")
                            continue

                        previous_hashes[url] = current_hash
                        date_time_str = time.strftime("%Y-%m-%d %H:%M:%S")
                        date, time_str = date_time_str.split()
                        change = "Content changed"

                        with open(storage_location, "a", newline='') as csvfile:
                            csv_toolkit = csv.DictWriter(
                                csvfile,
                                fieldnames=["date", "time", "url", "change"],
                            )
                            csv_toolkit.writerow(
                                {"date": date, "time": time_str, "url": url, "change": change}
                            )

                        insert_change(date, time_str, url, change)
                        logging.info(f"Change detected at {url} on {date_time_str}")
                    except Exception as e:
                        # One bad URL must not stop monitoring of the others.
                        logging.error(f"Error accessing {url}: {e}")
                        traceback.print_exc()

                logging.info(f"Sleeping for {scrape_interval} minutes")
                time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
    except Exception as e:
        logging.error(f"Error in continuous monitoring: {e}")
        traceback.print_exc()
99
+
100
def start_background_monitoring(storage_location, urls, scrape_interval, content_type):
    """Launch ``continuous_monitoring`` on a daemon thread and return at once.

    The four arguments are forwarded unchanged to ``continuous_monitoring``.
    Because the thread is a daemon, it does not keep the interpreter alive
    once the main thread exits.
    """
    monitor_args = (storage_location, urls, scrape_interval, content_type)
    worker = threading.Thread(
        target=continuous_monitoring,
        args=monitor_args,
        daemon=True,
    )
    worker.start()
    logging.info("Background monitoring started")
105
+