acecalisto3 committed
Commit 0f0e2ce
Parent(s): 04ebae0

Update app.py

Files changed (1):
  app.py +160 -304
app.py CHANGED
@@ -1,356 +1,212 @@
- import datetime
  import os
  import csv
- import time
- import hashlib
- import threading
- from pathlib import Path
  import logging
  from typing import List, Tuple
-
  import gradio as gr
- from selenium import webdriver
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.common.by import By
- from selenium.common.exceptions import (
-     WebDriverException,
-     NoSuchElementException,
-     TimeoutException,
-     StaleElementReferenceException,
- )
- from webdriver_manager.chrome import ChromeDriverManager
  from huggingface_hub import InferenceClient
- import mysql.connector
- import feedparser  # For parsing RSS feeds
- import sqlite3  # For simple local storage if needed
-
- # Configure logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(filename)s - %(lineno)d - %(message)s')
-
- # Configuration (replace with your actual values or environment variables)
- DB_HOST = os.environ.get("DB_HOST", "your_host")
- DB_USER = os.environ.get("DB_USER", "your_user")
- DB_PASSWORD = os.environ.get("DB_PASSWORD", "your_password")
- DB_NAME = os.environ.get("DB_NAME", "your_database")
- HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY")  # Add API key
- DEFAULT_MONITORING_INTERVAL = 300  # 5 minutes in seconds
- MAX_MONITORING_INTERVAL = 600  # 10 minutes in seconds
- CHANGE_FREQUENCY_THRESHOLD = 3  # Number of changes to trigger faster monitoring

  # Global variables
- monitoring_thread = None
- stop_event = threading.Event()
- db_connection = None
- current_task = None
  history = []
- url_monitoring_intervals = {}  # Store monitoring intervals for each URL
- change_counts = {}  # Track change frequency for each URL
-
- def get_db_connection():
-     global db_connection
-     if db_connection is None or not db_connection.is_connected():
-         try:
-             db_connection = mysql.connector.connect(
-                 host=DB_HOST,
-                 user=DB_USER,
-                 password=DB_PASSWORD,
-             )  # Connect to the server first
-
-             # Select the database
-             if DB_NAME:
-                 cursor = db_connection.cursor()
-                 cursor.execute(f"USE {DB_NAME}")
-                 cursor.close()
-             else:
-                 logging.error("Database name is not set.")
-                 return None
-
-             return db_connection
-         except Exception as e:
-             logging.error(f"Error connecting to database: {e}")
-             return None
-     else:
-         return db_connection
-
- # Function to create the articles table if it doesn't exist
- def create_articles_table():
-     conn = get_db_connection()
-     if conn:
-         cursor = conn.cursor()
-         cursor.execute("""
-             CREATE TABLE IF NOT EXISTS articles (
-                 id INT AUTO_INCREMENT PRIMARY KEY,
-                 url VARCHAR(255) NOT NULL,
-                 title VARCHAR(255),
-                 content TEXT,
-                 hash VARCHAR(32),
-                 timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-             )
-         """)
-         conn.commit()
-         cursor.close()
-
- # Initialize the articles table
- create_articles_table()
-
- # Function to monitor URLs for changes
- def monitor_urls(target_urls: List[str], storage_location: str, feed_rss: bool, stop_event: threading.Event):
-     global history, url_monitoring_intervals, change_counts
-     previous_hashes = {url: "" for url in target_urls}
-     options = Options()
-     options.headless = True
-     options.add_argument("--disable-gpu")
-     options.add_argument("--no-sandbox")
-     options.add_argument("--disable-dev-shm-usage")
-     options.add_experimental_option("excludeSwitches", ["enable-logging"])  # Suppress unnecessary logs
-
-     driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
-     driver.implicitly_wait(10)  # Implicit wait for elements

      try:
-         while not stop_event.is_set():
-             for url in target_urls:
-                 try:
-                     # Dynamic monitoring interval
-                     interval = url_monitoring_intervals.get(url, DEFAULT_MONITORING_INTERVAL)
-
-                     driver.get(url)
-                     time.sleep(2)  # Allow page to load
-
-                     # Check for changes
-                     try:
-                         current_content = driver.find_element(By.TAG_NAME, "body").get_attribute("innerHTML")
-                         current_hash = hashlib.md5(current_content.encode('utf-8')).hexdigest()
-                     except (NoSuchElementException, TimeoutException, StaleElementReferenceException) as e:
-                         logging.warning(f"Error getting content for {url}: {e}")
-                         continue
-
-                     if current_hash != previous_hashes[url]:
-                         previous_hashes[url] = current_hash
-                         timestamp = datetime.datetime.now()
-
-                         try:
-                             title_element = driver.find_element(By.TAG_NAME, "title")
-                             title = title_element.text
-                         except NoSuchElementException:
-                             title = "No Title"
-
-                         history.append(f"Change detected at {url} on {timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
-
-                         if storage_location:
-                             save_to_storage(storage_location, url, title, current_content, timestamp)
-
-                         if feed_rss:
-                             save_to_database(url, title, current_content, current_hash)
-
-                         # Adjust monitoring interval based on change frequency
-                         change_counts[url] = change_counts.get(url, 0) + 1
-                         if change_counts[url] >= CHANGE_FREQUENCY_THRESHOLD:
-                             url_monitoring_intervals[url] = 60  # Check more frequently after multiple changes
-                         else:
-                             url_monitoring_intervals[url] = min(interval + 60, MAX_MONITORING_INTERVAL)  # Gradually increase interval
-
-                     else:
-                         # Increase interval if no changes detected
-                         change_counts[url] = 0  # Reset change count if no change
-                         url_monitoring_intervals[url] = min(interval + 60, MAX_MONITORING_INTERVAL)
-
-                 except WebDriverException as e:
-                     logging.error(f"Error accessing {url}: {e}")
-
-                 if stop_event.is_set():
-                     break  # Exit inner loop if stop event is set
-
-             if not stop_event.is_set():
-                 time.sleep(interval)
-
-     except Exception as e:
-         logging.error(f"Unexpected error in monitoring thread: {e}")
      finally:
-         driver.quit()
-         logging.info("Monitoring thread has been stopped.")

- # Function to save data to local storage (CSV)
- def save_to_storage(storage_location: str, url: str, title: str, content: str, timestamp: datetime.datetime):
      try:
          with open(storage_location, "a", newline='', encoding='utf-8') as csvfile:
              csv_writer = csv.writer(csvfile)
              csv_writer.writerow([timestamp.strftime("%Y-%m-%d %H:%M:%S"), url, title, content])
      except Exception as e:
-         logging.error(f"Error saving to storage: {e}")

- # Function to save data to the database
- def save_to_database(url: str, title: str, content: str, hash: str):
-     conn = get_db_connection()
-     if conn:
-         cursor = conn.cursor()
-         try:
-             sql = "INSERT INTO articles (url, title, content, hash) VALUES (%s, %s, %s, %s)"
-             val = (url, title, content, hash)
-             cursor.execute(sql, val)
-             conn.commit()
-         except Exception as e:
-             logging.error(f"Error saving to database: {e}")
-         finally:
-             cursor.close()

- # Function to generate RSS feed from the database
  def generate_rss_feed():
-     conn = get_db_connection()
-     if conn:
-         cursor = conn.cursor()
-         try:
-             cursor.execute("SELECT * FROM articles ORDER BY timestamp DESC")
-             articles = cursor.fetchall()
-
-             feed = feedparser.FeedParserDict()
-             feed['title'] = 'Website Changes Feed'
-             feed['link'] = 'http://yourwebsite.com/feed'  # Replace with your actual feed URL
-             feed['description'] = 'Feed of changes detected on monitored websites.'
-             feed['entries'] = []
-
-             for article in articles:
-                 entry = feedparser.FeedParserDict()
-                 entry['title'] = article[2]  # Title
-                 entry['link'] = article[1]  # URL
-                 entry['description'] = article[3]  # Content
-                 entry['published'] = article[5]  # Timestamp
-                 feed['entries'].append(entry)
-
-             return feedparser.FeedGenerator().feed_from_dictionary(feed).writeString('utf-8')
-         except Exception as e:
-             logging.error(f"Error generating RSS feed: {e}")
-         finally:
-             cursor.close()
-     return None
-
- # Function to start monitoring
- def start_monitoring(target_urls: List[str], storage_location: str, feed_rss: bool):
-     global monitoring_thread, stop_event, current_task, history, change_counts
-     if monitoring_thread and monitoring_thread.is_alive():
-         return "Monitoring is already running.", history
-
-     stop_event.clear()
-     current_task = f"Monitoring URLs: {', '.join(target_urls)}"
-     history.append(f"Task started: {current_task}")
-     change_counts = {url: 0 for url in target_urls}  # Reset change counts
-     monitoring_thread = threading.Thread(
-         target=monitor_urls,
-         args=(target_urls, storage_location, feed_rss, stop_event),
-         daemon=True
-     )
-     monitoring_thread.start()
-     return "Monitoring started.", history
-
- # Function to stop monitoring
- def stop_monitoring():
-     global monitoring_thread, stop_event, current_task, history
-     if monitoring_thread and monitoring_thread.is_alive():
-         stop_event.set()
-         monitoring_thread.join()
-         current_task = None
-         history.append("Monitoring stopped by user.")
-         return "Monitoring stopped.", history
-     else:
-         return "No monitoring task is currently running.", history

- # Function to handle chatbot responses
- def chatbot_response(message: str, history: List[Tuple[str, str]]):
      try:
          client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1", token=HUGGINGFACE_API_KEY)
-         response = client.inference(message)
          history.append((message, response))
          return history, history
      except Exception as e:
-         logging.error(f"Error getting chatbot response: {e}")
          history.append((message, "Error: Could not get a response from the chatbot."))
          return history, history

- # --- Gradio Interface ---
  with gr.Blocks() as demo:
      gr.Markdown("# Website Monitor and Chatbot")

-     # Configuration Tab
      with gr.Tab("Configuration"):
-         with gr.Row():
-             target_urls = gr.Textbox(
-                 label="Target URLs (comma-separated)",
-                 placeholder="https://example.com, https://another-site.com"
-             )
-         with gr.Row():
-             storage_location = gr.Textbox(
-                 label="Storage Location (CSV file path)",
-                 placeholder="/path/to/your/file.csv",
-                 visible=False  # You can enable this if you want CSV storage
-             )
-         with gr.Row():
-             feed_rss_checkbox = gr.Checkbox(label="Enable RSS Feed")
-         with gr.Row():
-             start_button = gr.Button("Start Monitoring")
-             stop_button = gr.Button("Stop Monitoring")
-         with gr.Row():
-             status_text = gr.Textbox(label="Status", interactive=False)
-         with gr.Row():
-             history_text = gr.Textbox(
-                 label="History", lines=10, interactive=False
-             )

-     # User-End View Tab
      with gr.Tab("User-End View"):
-         with gr.Row():
-             feed_content = gr.JSON(label="RSS Feed Content")

-     # Chatbot Tab
      with gr.Tab("Chatbot"):
-         chatbot_interface = gr.Chatbot(type='messages')
-         with gr.Row():
-             message_input = gr.Textbox(placeholder="Type your message here...")
-             send_button = gr.Button("Send")
-
-     # --- Event Handlers ---
-
-     # Start monitoring button click
-     def on_start_click(target_urls_str: str, storage_loc: str, feed_enabled: bool):
-         global history, url_monitoring_intervals
-         try:
-             target_urls = [url.strip() for url in target_urls_str.split(",")]
-             if not all(target_urls):
-                 return "Please enter valid URLs.", history
-
-             # Reset monitoring intervals when starting
-             url_monitoring_intervals = {url: DEFAULT_MONITORING_INTERVAL for url in target_urls}
-
-             status, history = start_monitoring(target_urls, storage_loc if storage_loc else None, feed_enabled)
-             return status, history
-         except Exception as e:
-             return f"Error starting monitoring: {e}", history

-     start_button.click(
-         on_start_click,
-         inputs=[target_urls, storage_location, feed_rss_checkbox],
-         outputs=[status_text, history_text]
-     )

-     # Stop monitoring button click
-     stop_button.click(
-         stop_monitoring,
-         outputs=[status_text, history_text]
-     )

-     # Send message to chatbot button click
-     send_button.click(
-         chatbot_response,
-         inputs=[message_input, chatbot_interface],
-         outputs=[chatbot_interface, chatbot_interface]
-     )

-     # Update RSS feed content periodically
-     def update_feed_content():
          return generate_rss_feed()

-     # Use gr.Timer to periodically update the RSS feed content
-     gr.Timer(update_feed_content, every=5).start()

  if __name__ == "__main__":
      demo.launch()
 
 
  import os
+ import asyncio
  import csv
  import logging
  from typing import List, Tuple
+ from dotenv import load_dotenv
+ import aiohttp
  import gradio as gr
+ from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
+ from sqlalchemy.ext.declarative import declarative_base
+ from sqlalchemy.orm import sessionmaker
+ from sqlalchemy.exc import SQLAlchemyError
+ from bs4 import BeautifulSoup
+ import hashlib
+ import datetime
+ import feedparser
  from huggingface_hub import InferenceClient
+ import validators

+ # Load environment variables
+ load_dotenv()

+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Configuration
+ DB_URL = os.getenv('DB_URL', 'sqlite:///monitoring.db')
+ HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')
+ DEFAULT_MONITORING_INTERVAL = 300
+ MAX_MONITORING_INTERVAL = 600
+ CHANGE_FREQUENCY_THRESHOLD = 3
+
+ # Database setup
+ Base = declarative_base()
+
+ class Article(Base):
+     __tablename__ = 'articles'
+     id = Column(Integer, primary_key=True)
+     url = Column(String(255), nullable=False)
+     title = Column(String(255))
+     content = Column(Text)
+     hash = Column(String(32))
+     timestamp = Column(DateTime, default=datetime.datetime.utcnow)
+
+ engine = create_engine(DB_URL)
+ Base.metadata.create_all(engine)
+ Session = sessionmaker(bind=engine)

  # Global variables
+ monitoring_tasks = {}
+ url_monitoring_intervals = {}
+ change_counts = {}
  history = []

+ # Utility functions
+ def sanitize_url(url: str) -> str:
+     return validators.url(url)

+ async def fetch_url_content(url: str, session: aiohttp.ClientSession) -> Tuple[str, str]:
+     async with session.get(url) as response:
+         content = await response.text()
+         soup = BeautifulSoup(content, 'html.parser')
+         title = soup.title.string if soup.title else "No Title"
+         return title, content

+ def calculate_hash(content: str) -> str:
+     return hashlib.md5(content.encode('utf-8')).hexdigest()

+ async def save_to_database(url: str, title: str, content: str, hash: str):
+     session = Session()
      try:
+         article = Article(url=url, title=title, content=content, hash=hash)
+         session.add(article)
+         session.commit()
+     except SQLAlchemyError as e:
+         logger.error(f"Database error: {e}")
+         session.rollback()
      finally:
+         session.close()

+ def save_to_csv(storage_location: str, url: str, title: str, content: str, timestamp: datetime.datetime):
      try:
          with open(storage_location, "a", newline='', encoding='utf-8') as csvfile:
              csv_writer = csv.writer(csvfile)
              csv_writer.writerow([timestamp.strftime("%Y-%m-%d %H:%M:%S"), url, title, content])
      except Exception as e:
+         logger.error(f"Error saving to CSV: {e}")
+
+ async def monitor_url(url: str, interval: int, storage_location: str, feed_rss: bool):
+     previous_hash = ""
+     async with aiohttp.ClientSession() as session:
+         while True:
+             try:
+                 title, content = await fetch_url_content(url, session)
+                 current_hash = calculate_hash(content)
+
+                 if current_hash != previous_hash:
+                     previous_hash = current_hash
+                     timestamp = datetime.datetime.now()
+
+                     if feed_rss:
+                         await save_to_database(url, title, content, current_hash)
+
+                     if storage_location:
+                         save_to_csv(storage_location, url, title, content, timestamp)
+
+                     history.append(f"Change detected at {url} on {timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
+                     logger.info(f"Change detected at {url}")
+
+                     change_counts[url] = change_counts.get(url, 0) + 1
+                     if change_counts[url] >= CHANGE_FREQUENCY_THRESHOLD:
+                         interval = max(60, interval // 2)
+                 else:
+                     change_counts[url] = 0
+                     interval = min(interval * 2, MAX_MONITORING_INTERVAL)
+
+                 url_monitoring_intervals[url] = interval
+             except Exception as e:
+                 logger.error(f"Error monitoring {url}: {e}")
+                 history.append(f"Error monitoring {url}: {e}")
+
+             await asyncio.sleep(interval)
+
+ async def start_monitoring(urls: List[str], storage_location: str, feed_rss: bool):
+     for url in urls:
+         if url not in monitoring_tasks:
+             sanitized_url = sanitize_url(url)
+             if sanitized_url:
+                 task = asyncio.create_task(monitor_url(sanitized_url, DEFAULT_MONITORING_INTERVAL, storage_location, feed_rss))
+                 monitoring_tasks[sanitized_url] = task
+             else:
+                 logger.warning(f"Invalid URL: {url}")
+                 history.append(f"Invalid URL: {url}")

+ def stop_monitoring(url: str):
+     if url in monitoring_tasks:
+         monitoring_tasks[url].cancel()
+         del monitoring_tasks[url]

  def generate_rss_feed():
+     session = Session()
+     try:
+         articles = session.query(Article).order_by(Article.timestamp.desc()).limit(20).all()
+         feed = feedparser.FeedParserDict()
+         feed['title'] = 'Website Changes Feed'
+         feed['link'] = 'http://yourwebsite.com/feed'
+         feed['description'] = 'Feed of changes detected on monitored websites.'
+         feed['entries'] = [
+             {'title': article.title, 'link': article.url, 'description': article.content, 'published': article.timestamp}
+             for article in articles
+         ]
+         return feedparser.FeedGenerator().feed_from_dictionary(feed).writeString('utf-8')
+     except SQLAlchemyError as e:
+         logger.error(f"Database error: {e}")
+         return None
+     finally:
+         session.close()

+ async def chatbot_response(message: str, history: List[Tuple[str, str]]):
      try:
          client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1", token=HUGGINGFACE_API_KEY)
+         response = await client.inference(message)
          history.append((message, response))
          return history, history
      except Exception as e:
+         logger.error(f"Chatbot error: {e}")
          history.append((message, "Error: Could not get a response from the chatbot."))
          return history, history

+ # Gradio interface
  with gr.Blocks() as demo:
      gr.Markdown("# Website Monitor and Chatbot")

      with gr.Tab("Configuration"):
+         target_urls = gr.Textbox(label="Target URLs (comma-separated)", placeholder="https://example.com, https://another-site.com")
+         storage_location = gr.Textbox(label="Storage Location (CSV file path)", placeholder="/path/to/your/file.csv")
+         feed_rss_checkbox = gr.Checkbox(label="Enable RSS Feed")
+         start_button = gr.Button("Start Monitoring")
+         stop_button = gr.Button("Stop Monitoring")
+         status_text = gr.Textbox(label="Status", interactive=False)
+         history_text = gr.Textbox(label="History", lines=10, interactive=False)

      with gr.Tab("User-End View"):
+         feed_content = gr.JSON(label="RSS Feed Content")

      with gr.Tab("Chatbot"):
+         chatbot_interface = gr.Chatbot()
+         message_input = gr.Textbox(placeholder="Type your message here...")
+         send_button = gr.Button("Send")

+     async def on_start_click(target_urls_str: str, storage_loc: str, feed_enabled: bool):
+         urls = [url.strip() for url in target_urls_str.split(",")]
+         await start_monitoring(urls, storage_loc if storage_loc else None, feed_enabled)
+         return "Monitoring started for valid URLs."

+     async def on_stop_click():
+         for url in list(monitoring_tasks.keys()):
+             stop_monitoring(url)
+         return "Monitoring stopped for all URLs."

+     start_button.click(on_start_click, inputs=[target_urls, storage_location, feed_rss_checkbox], outputs=[status_text])
+     stop_button.click(on_stop_click, outputs=[status_text])
+     send_button.click(chatbot_response, inputs=[message_input, chatbot_interface], outputs=[chatbot_interface, chatbot_interface])

+     async def update_feed_content():
          return generate_rss_feed()

+     feed_updater = gr.Timer(update_feed_content, every=5)

  if __name__ == "__main__":
      demo.launch()
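
For reference, a minimal sketch (not part of this commit) of how the new async helpers could be exercised outside the Gradio UI. It assumes the updated app.py is importable as a module named app, which is an assumption for illustration rather than anything the commit sets up:

import asyncio
import aiohttp
import app  # hypothetical module name for the updated app.py shown above

async def main():
    # Fetch one page with the commit's aiohttp-based helper and hash it,
    # the same two steps monitor_url performs on each polling cycle.
    async with aiohttp.ClientSession() as session:
        title, content = await app.fetch_url_content("https://example.com", session)
        print(title, app.calculate_hash(content))

asyncio.run(main())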