acecalisto3 committed on
Commit
bf70dc8
1 Parent(s): a4c9236

Update app.py

Files changed (1)
  1. app.py +156 -88
app.py CHANGED
@@ -1,6 +1,9 @@
 import time
 import hashlib
- import logging
 import datetime
 import gradio as gr
 import csv
@@ -18,7 +21,6 @@ from transformers import pipeline
 import feedparser
 from bs4 import BeautifulSoup
 import threading
- import os

 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -30,51 +32,65 @@ HISTORY = []
 CURRENT_TASK = None
 STOP_THREADS = False

- # Define a function to monitor URLs for changes
- def monitor_urls(storage_location, urls, scrape_interval, content_type, stop_scraping_flag):
-     global HISTORY
-     previous_hashes = {url: "" for url in urls} # Use a dictionary for better organization

     try:
-         with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
-             while not stop_scraping_flag[0]:
-                 for url in urls:
-                     try:
-                         driver.get(url)
-                         time.sleep(2) # Wait for the page to load
-                         if content_type == "text":
-                             current_content = driver.page_source
-                         elif content_type == "media":
-                             current_content = driver.find_elements(By.TAG_NAME, "img")
-                         else:
-                             current_content = driver.page_source
-                         current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
-                         if current_hash != previous_hashes[url]:
-                             previous_hashes[url] = current_hash
-                             date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                             HISTORY.append(f"Change detected at {url} on {date_time_str}")
-                             with open(os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv"), "a", newline="") as csvfile:
-                                 csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
-                                 csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
-                             logging.info(f"Change detected at {url} on {date_time_str}")
-                     except (NoSuchElementException, Exception) as e:
-                         logging.error(f"Error accessing {url}: {e}")
-                 time.sleep(scrape_interval * 60) # Check every scrape_interval minutes
-     except Exception as e:
-         logging.error(f"Error starting ChromeDriver: {e}")

- def start_scraping(storage_location, urls, scrape_interval, content_type):
     global CURRENT_TASK, HISTORY, STOP_THREADS
-
     CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
     HISTORY.append(f"Task started: {CURRENT_TASK}")
-
     for url in urls:
         # Create a folder for the URL
         hostname = urlparse(url).hostname
         folder_path = os.path.join(storage_location, hostname)
         os.makedirs(folder_path, exist_ok=True)
-
         # Log the initial observation
         try:
             with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
@@ -92,63 +108,110 @@ def start_scraping(storage_location, urls, scrape_interval, content_type):
                     file.write(f"Initial observation at {url}: {initial_hash}")
         except (NoSuchElementException, Exception) as e:
             HISTORY.append(f"Error accessing {url}: {e}")
-
         # Start a new thread for monitoring URLs
-         threading.Thread(target=monitor_urls, args=(storage_location, [url], scrape_interval, content_type, [STOP_THREADS])).start()
-
     return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."

 # Define a function to stop scraping
 def stop_scraping():
     global STOP_THREADS
     STOP_THREADS = True
     return "Scraping stopped."

- # Define a function to update model config
- def update_model_config(model_name_input, gpu_layers_input):
-     # Example implementation of update_model_config
-     # You can replace this with the actual logic you need
-     return f"Model config updated with model: {model_name_input}, GPU layers: {gpu_layers_input}"
-
- def display_csv(selected_url):
-     # Get the CSV file path for the selected URL
-     hostname = urlparse(selected_url).hostname
-     csv_file_path = os.path.join(DEFAULT_FILE_PATH, f"{hostname}_changes.csv")
-
-     # Read the CSV file and return its content
-     if os.path.exists(csv_file_path):
-         with open(csv_file_path, "r") as csvfile:
-             csv_content = csvfile.read()
-         return csv_content
-     else:
-         return "No CSV content available for the selected URL."

 def handle_message(message, chat_history, system_message, max_tokens, temperature, top_p):
-     # Process the message and update the chat history
     chat_history.append((message, system_message))
     response = f"Received message: {message}"
     return chat_history, response

- def generate_rss_feed(selected_url):
-     # Generate the RSS feed for the selected URL
-     # For this example, we'll just return a placeholder RSS feed
-     rss_feed = """
-     <?xml version="1.0" encoding="UTF-8"?>
-     <rss version="2.0">
-     <channel>
-         <title>Example RSS Feed</title>
-         <link>https://example.com</link>
-         <description>This is an example RSS feed.</description>
-         <item>
-             <title>Example Item</title>
-             <link>https://example.com/item</link>
-             <description>This is an example item.</description>
-         </item>
-     </channel>
-     </rss>
-     """
-     return rss_feed
-
 # Define the Gradio interface
 def create_interface():
     with gr.Blocks() as demo:
@@ -166,17 +229,17 @@ def create_interface():
                 start_button = gr.Button("Start Scraping")
                 stop_button = gr.Button("Stop Scraping")
                 csv_output = gr.Textbox(label="CSV Output", interactive=False)
-
-                 model_name_input = gr.Textbox(value="default_model", label="Model Name")
                 gpu_layers_input = gr.Slider(minimum=0, maximum=8, value=2, step=1, label="GPU Layers")
-
             with gr.Column():
                 chat_history = gr.Chatbot(label="Chat History")
                 response_box = gr.Textbox(label="Response")

         # Connect buttons to their respective functions
         start_button.click(
-             fn=start_scraping,
             inputs=[storage_location, urls, scrape_interval, content_type],
             outputs=csv_output
         )
@@ -190,7 +253,6 @@ def create_interface():
             selected_url = gr.Textbox(label="Select URL for CSV Content")
             csv_button = gr.Button("Display CSV Content")
             csv_output = gr.Textbox(label="CSV Content Output", interactive=False)
-
         csv_button.click(display_csv, inputs=[selected_url], outputs=csv_output)

         # Add a button to display the RSS feed for a selected URL
@@ -198,11 +260,17 @@ def create_interface():
             selected_url = gr.Textbox(label="Select URL for RSS Feed")
             rss_button = gr.Button("Generate RSS Feed")
             rss_output = gr.Textbox(label="RSS Feed Output", interactive=False)
-
-         rss_button.click(generate_rss_feed, inputs=[selected_url], outputs=rss_output)

     return demo

- if __name__ == "__main__":
-     demo = create_interface()
-     demo.launch()
+ import mysql.connector
+ from mysql.connector import errorcode
+ import os
+ import logging
 import time
 import hashlib
 import datetime
 import gradio as gr
 import csv

 import feedparser
 from bs4 import BeautifulSoup
 import threading

 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 CURRENT_TASK = None
 STOP_THREADS = False

+ # Define database configuration
+ db_config = {
+     'user': os.getenv('DB_USER'),
+     'password': os.getenv('DB_PASSWORD'),
+     'host': os.getenv('DB_HOST'),
+     'raise_on_warnings': True
+ }

+ # Define a function to initialize the database
+ def initialize_database(config):
     try:
+         cnx = mysql.connector.connect(**config)
+         cursor = cnx.cursor()
+
+         # Create database if it doesn't exist
+         cursor.execute("CREATE DATABASE IF NOT EXISTS scraper_db")
+         cnx.database = 'scraper_db'
+
+         # Create tables
+         TABLES = {}
+         TABLES['scraped_data'] = (
+             "CREATE TABLE IF NOT EXISTS scraped_data ("
+             " id INT AUTO_INCREMENT PRIMARY KEY,"
+             " url VARCHAR(255) NOT NULL,"
+             " content_hash VARCHAR(64) NOT NULL,"
+             " change_detected DATETIME NOT NULL"
+             ") ENGINE=InnoDB"
+         )
+
+         for table_name in TABLES:
+             table_description = TABLES[table_name]
+             try:
+                 cursor.execute(table_description)
+                 logging.info(f"Table `{table_name}` created successfully.")
+             except mysql.connector.Error as err:
+                 if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
+                     logging.warning(f"Table `{table_name}` already exists.")
+                 else:
+                     logging.error(err.msg)
+
+         cursor.close()
+         cnx.close()
+         logging.info("Database initialization complete.")
+     except mysql.connector.Error as err:
+         logging.error(f"Database initialization failed: {err}")

+ # Define a function to start scraping
+ def start_scraping(storage_location, urls, scrape_interval, content_type, db_config):
     global CURRENT_TASK, HISTORY, STOP_THREADS
+
     CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
     HISTORY.append(f"Task started: {CURRENT_TASK}")
+
     for url in urls:
         # Create a folder for the URL
         hostname = urlparse(url).hostname
         folder_path = os.path.join(storage_location, hostname)
         os.makedirs(folder_path, exist_ok=True)
+
         # Log the initial observation
         try:
             with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
 
                     file.write(f"Initial observation at {url}: {initial_hash}")
         except (NoSuchElementException, Exception) as e:
             HISTORY.append(f"Error accessing {url}: {e}")
+
         # Start a new thread for monitoring URLs
+         threading.Thread(target=monitor_urls, args=(storage_location, [url], scrape_interval, content_type, [STOP_THREADS], db_config)).start()
+
     return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."

+ # Define a function to monitor URLs for changes
+ def monitor_urls(storage_location, urls, scrape_interval, content_type, stop_scraping_flag, db_config):
+     global HISTORY
+     previous_hashes = {url: "" for url in urls}
+
+     try:
+         cnx = mysql.connector.connect(**db_config)
+         cursor = cnx.cursor()
+
+         with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
+             while not stop_scraping_flag[0]:
+                 for url in urls:
+                     try:
+                         driver.get(url)
+                         time.sleep(2) # Wait for the page to load
+                         if content_type == "text":
+                             current_content = driver.page_source
+                         elif content_type == "media":
+                             current_content = driver.find_elements(By.TAG_NAME, "img")
+                         else:
+                             current_content = driver.page_source
+                         current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
+
+                         if current_hash != previous_hashes[url]:
+                             previous_hashes[url] = current_hash
+                             date_time = datetime.datetime.now()
+                             HISTORY.append(f"Change detected at {url} on {date_time}")
+
+                             # Insert into MySQL
+                             add_change = ("INSERT INTO scraped_data "
+                                           "(url, content_hash, change_detected) "
+                                           "VALUES (%s, %s, %s)")
+                             data_change = (url, current_hash, date_time)
+                             cursor.execute(add_change, data_change)
+                             cnx.commit()
+
+                             logging.info(f"Change detected and logged for {url} at {date_time}")
+                     except (NoSuchElementException, Exception) as e:
+                         logging.error(f"Error accessing {url}: {e}")
+                 time.sleep(scrape_interval * 60) # Check every scrape_interval minutes
+     except Exception as e:
+         logging.error(f"Error in monitor_urls: {e}")
+     finally:
+         cursor.close()
+         cnx.close()
+
 # Define a function to stop scraping
 def stop_scraping():
     global STOP_THREADS
     STOP_THREADS = True
     return "Scraping stopped."

+ # Define a function to generate RSS feed
+ def generate_rss_feed(selected_url, db_config):
+     try:
+         cnx = mysql.connector.connect(**db_config)
+         cursor = cnx.cursor(dictionary=True)
+
+         query = ("SELECT content_hash, change_detected FROM scraped_data "
+                  "WHERE url = %s ORDER BY change_detected DESC LIMIT 10")
+         cursor.execute(query, (selected_url,))
+
+         items = cursor.fetchall()
+
+         rss_items = ""
+         for item in items:
+             rss_items += f"""
+             <item>
+                 <title>Change Detected</title>
+                 <link>{selected_url}</link>
+                 <description>Change detected on {item['change_detected'].strftime('%Y-%m-%d %H:%M:%S')}</description>
+                 <pubDate>{item['change_detected'].strftime('%a, %d %b %Y %H:%M:%S +0000')}</pubDate>
+             </item>
+             """
+
+         rss_feed = f"""<?xml version="1.0" encoding="UTF-8"?>
+         <rss version="2.0">
+         <channel>
+             <title>RSS Feed for {selected_url}</title>
+             <link>{selected_url}</link>
+             <description>Latest changes detected on {selected_url}.</description>
+             {rss_items}
+         </channel>
+         </rss>"""
+
+         cursor.close()
+         cnx.close()
+         return rss_feed
+     except mysql.connector.Error as err:
+         logging.error(f"Error generating RSS feed: {err}")
+         return "Failed to generate RSS feed."

+ # Define a function to handle messages
 def handle_message(message, chat_history, system_message, max_tokens, temperature, top_p):
     chat_history.append((message, system_message))
     response = f"Received message: {message}"
     return chat_history, response

 # Define the Gradio interface
 def create_interface():
     with gr.Blocks() as demo:
 
                 start_button = gr.Button("Start Scraping")
                 stop_button = gr.Button("Stop Scraping")
                 csv_output = gr.Textbox(label="CSV Output", interactive=False)
+                 model_name_input = gr.Textbox(value="default_model", label="Model Name")
                 gpu_layers_input = gr.Slider(minimum=0, maximum=8, value=2, step=1, label="GPU Layers")
             with gr.Column():
                 chat_history = gr.Chatbot(label="Chat History")
                 response_box = gr.Textbox(label="Response")

         # Connect buttons to their respective functions
         start_button.click(
+             fn=lambda storage, urls, interval, ctype: start_scraping(
+                 storage, urls.split(", "), interval, ctype, db_config
+             ),
             inputs=[storage_location, urls, scrape_interval, content_type],
             outputs=csv_output
         )
 
             selected_url = gr.Textbox(label="Select URL for CSV Content")
             csv_button = gr.Button("Display CSV Content")
             csv_output = gr.Textbox(label="CSV Content Output", interactive=False)
         csv_button.click(display_csv, inputs=[selected_url], outputs=csv_output)

         # Add a button to display the RSS feed for a selected URL
             selected_url = gr.Textbox(label="Select URL for RSS Feed")
             rss_button = gr.Button("Generate RSS Feed")
             rss_output = gr.Textbox(label="RSS Feed Output", interactive=False)
+         rss_button.click(
+             generate_rss_feed,
+             inputs=[selected_url, gr.State(db_config)],
+             outputs=rss_output
+         )

     return demo

+ # Initialize the database
+ initialize_database(db_config)
+
+ # Launch the Gradio interface
+ demo = create_interface()
+ demo.launch()
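
The new code reads its MySQL credentials from the DB_USER, DB_PASSWORD and DB_HOST environment variables and records detected changes in the scraped_data table. Below is a minimal standalone sketch of that data path, assuming a reachable MySQL server, the same environment variables, and that initialize_database() has already created scraper_db; the example URL and hash are placeholders, not values from the commit.

# Standalone sketch, not part of the commit: exercise the scraped_data table
# that initialize_database() creates. Credentials come from the same env vars
# app.py reads; the URL and hash below are illustrative placeholders.
import datetime
import os

import mysql.connector

cnx = mysql.connector.connect(
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    host=os.getenv("DB_HOST"),
    database="scraper_db",
)
cursor = cnx.cursor(dictionary=True)

# Record one detected change, mirroring what monitor_urls() inserts.
cursor.execute(
    "INSERT INTO scraped_data (url, content_hash, change_detected) VALUES (%s, %s, %s)",
    ("https://example.com", "0" * 32, datetime.datetime.now()),
)
cnx.commit()

# Read back the latest changes, mirroring the query used by generate_rss_feed().
cursor.execute(
    "SELECT url, content_hash, change_detected FROM scraped_data "
    "ORDER BY change_detected DESC LIMIT 10"
)
for row in cursor.fetchall():
    print(row["url"], row["content_hash"], row["change_detected"])

cursor.close()
cnx.close()

The SELECT mirrors the query generate_rss_feed() issues, so this is a quick way to confirm rows are landing in the table before going through the Gradio UI.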