acecalisto3 committed on
Commit d45cc49
1 Parent(s): deaafee

Update app.py

Files changed (1)
  1. app.py +365 -199
app.py CHANGED
@@ -26,8 +26,10 @@ import gradio as gr
26
  import xml.etree.ElementTree as ET
27
  import torch
28
  import mysql.connector
29
- from mysql.connector import errorcode
30
  from dotenv import load_dotenv
 
 
31
 
32
  # Load environment variables from .env file
33
  load_dotenv()
@@ -37,6 +39,9 @@ logging.basicConfig(
37
  level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
38
  )
39
40
  # Define constants
41
  DEFAULT_FILE_PATH = "scraped_data"
42
  PURPOSE = (
@@ -49,35 +54,46 @@ HISTORY = []
49
  CURRENT_TASK = None
50
  STOP_THREADS = False # Flag to stop scraping threads
51
 
52
- # MySQL Database Connection
53
  def get_db_connection():
54
  """
55
- Establishes and returns a MySQL database connection using environment variables.
56
- Returns None if connection fails.
57
  """
58
- try:
59
- connection = mysql.connector.connect(
60
- host=os.getenv("DB_HOST"),
61
- user=os.getenv("DB_USER"),
62
- password=os.getenv("DB_PASSWORD"),
63
- database=os.getenv("DB_NAME")
64
- )
65
- if connection.is_connected():
66
- logging.info("Connected to MySQL database.")
67
- return connection
68
- except mysql.connector.Error as err:
69
- if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
70
- logging.warning("Invalid database credentials. Falling back to CSV storage.")
71
- elif err.errno == errorcode.ER_BAD_DB_ERROR:
72
- logging.warning("Database does not exist. Falling back to CSV storage.")
73
- else:
74
- logging.warning(f"MySQL connection error: {err}. Falling back to CSV storage.")
75
  return None
76
 
77
- # Initialize Database
78
  def initialize_database():
79
  """
80
- Initializes the database by creating necessary tables if they do not exist.
81
  """
82
  connection = get_db_connection()
83
  if connection is None:
@@ -98,6 +114,13 @@ def initialize_database():
98
  cursor.execute(create_scraped_data_table)
99
  logging.info("Table 'scraped_data' is ready.")
100
101
  # Create table for action logs
102
  create_action_logs_table = """
103
  CREATE TABLE IF NOT EXISTS action_logs (
@@ -110,12 +133,92 @@ def initialize_database():
110
  logging.info("Table 'action_logs' is ready.")
111
 
112
  except mysql.connector.Error as err:
113
- logging.error(f"Error creating tables: {err}")
114
  finally:
115
  cursor.close()
116
  connection.close()
117
  logging.info("Database initialization complete.")
118
 
119
  # Function to monitor URLs for changes
120
  def monitor_urls(
121
  storage_location: str,
@@ -123,6 +226,7 @@ def monitor_urls(
123
  scrape_interval: int,
124
  content_type: str,
125
  selector: str = None,
 
126
  ):
127
  """
128
  Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
@@ -143,6 +247,8 @@ def monitor_urls(
143
  try:
144
  while not STOP_THREADS:
145
  for url in urls:
 
 
146
  try:
147
  driver.get(url)
148
  WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
@@ -195,6 +301,9 @@ def monitor_urls(
195
  # Fallback to CSV
196
  log_to_csv(storage_location, url, current_hash, date_time_str)
197
198
  except (
199
  NoSuchElementException,
200
  StaleElementReferenceException,
@@ -202,90 +311,13 @@ def monitor_urls(
202
  Exception,
203
  ) as e:
204
  logging.error(f"Error accessing {url}: {e}")
 
 
205
  time.sleep(scrape_interval * 60) # Wait for the next scrape interval
206
  finally:
207
  driver.quit()
208
  logging.info("ChromeDriver session ended.")
209
 
210
- def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
211
- """
212
- Logs the change to a CSV file in the storage_location.
213
- """
214
- try:
215
- os.makedirs(storage_location, exist_ok=True)
216
- csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
217
- file_exists = os.path.isfile(csv_file_path)
218
-
219
- with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
220
- fieldnames = ["date", "time", "url", "content_hash", "change"]
221
- writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
222
- if not file_exists:
223
- writer.writeheader()
224
- writer.writerow(
225
- {
226
- "date": change_detected.split()[0],
227
- "time": change_detected.split()[1],
228
- "url": url,
229
- "content_hash": content_hash,
230
- "change": "Content changed",
231
- }
232
- )
233
- logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
234
- except Exception as e:
235
- logging.error(f"Error logging data to CSV: {e}")
236
-
237
- # Function to create WebDriver
238
- def create_driver(options: Options) -> webdriver.Chrome:
239
- """
240
- Initializes and returns a Selenium Chrome WebDriver instance.
241
- """
242
- try:
243
- driver = webdriver.Chrome(
244
- service=Service(ChromeDriverManager().install()), options=options
245
- )
246
- logging.info("ChromeDriver initialized successfully.")
247
- return driver
248
- except Exception as exception:
249
- logging.error(f"Error initializing ChromeDriver: {exception}")
250
- return None
251
-
252
- # Function to get initial observation
253
- def get_initial_observation(
254
- driver: webdriver.Chrome, url: str, content_type: str, selector: str = None
255
- ) -> str:
256
- """
257
- Retrieves the initial content from the URL and returns its MD5 hash.
258
- """
259
- try:
260
- driver.get(url)
261
- WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
262
- time.sleep(2) # Additional wait for dynamic content
263
-
264
- if content_type == "text":
265
- initial_content = driver.page_source
266
- elif content_type == "media":
267
- if selector:
268
- try:
269
- elements = WebDriverWait(driver, 5).until(
270
- EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
271
- )
272
- initial_content = [element.get_attribute("src") for element in elements]
273
- except TimeoutException:
274
- logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
275
- initial_content = []
276
- else:
277
- elements = driver.find_elements(By.TAG_NAME, "img")
278
- initial_content = [element.get_attribute("src") for element in elements]
279
- else:
280
- initial_content = driver.page_source
281
-
282
- initial_hash = hashlib.md5(str(initial_content).encode("utf-8")).hexdigest()
283
- logging.info(f"Initial hash for {url}: {initial_hash}")
284
- return initial_hash
285
- except Exception as exception:
286
- logging.error(f"Error accessing {url}: {exception}")
287
- return None
288
-
289
  # Function to start scraping
290
  def start_scraping(
291
  storage_location: str,
@@ -293,9 +325,10 @@ def start_scraping(
293
  scrape_interval: int,
294
  content_type: str,
295
  selector: str = None,
 
296
  ) -> str:
297
  """
298
- Starts the scraping process in a separate thread.
299
  """
300
  global CURRENT_TASK, HISTORY, STOP_THREADS
301
 
@@ -310,60 +343,61 @@ def start_scraping(
310
  # Initialize database tables
311
  initialize_database()
312
 
313
- for url in url_list:
314
- # Create a folder for the URL (if still needed for CSVs)
315
- hostname = urlparse(url).hostname
316
- folder_path = os.path.join(storage_location, hostname)
317
- os.makedirs(folder_path, exist_ok=True)
 
318
 
319
- # Log the initial observation
320
- try:
321
- options = Options()
322
- options.add_argument("--headless")
323
- options.add_argument("--no-sandbox")
324
- options.add_argument("--disable-dev-shm-usage")
325
-
326
- driver = create_driver(options)
327
- if driver is None:
328
- continue
329
-
330
- initial_hash = get_initial_observation(driver, url, content_type, selector)
331
- if initial_hash:
332
- HISTORY.append(f"Initial observation at {url}: {initial_hash}")
333
-
334
- # Attempt to log to database
335
- connection = get_db_connection()
336
- if connection:
337
- try:
338
- cursor = connection.cursor()
339
- insert_query = """
340
- INSERT INTO scraped_data (url, content_hash, change_detected)
341
- VALUES (%s, %s, %s)
342
- """
343
- cursor.execute(insert_query, (url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
344
- connection.commit()
345
- logging.info(f"Initial observation logged for {url} in database.")
346
- except mysql.connector.Error as err:
347
- logging.error(f"Error inserting initial observation into database: {err}")
348
  # Fallback to CSV
349
- log_to_csv(storage_location, url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
350
- finally:
351
- cursor.close()
352
- connection.close()
353
- else:
354
- # Fallback to CSV
355
- log_to_csv(storage_location, url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
356
 
357
- except Exception as e:
358
- HISTORY.append(f"Error accessing {url}: {e}")
359
- logging.error(f"Error accessing {url}: {e}")
360
- finally:
361
- driver.quit()
362
 
363
- # Start the monitoring thread
364
  monitor_thread = threading.Thread(
365
  target=monitor_urls,
366
- args=(storage_location, url_list, scrape_interval, content_type, selector),
367
  daemon=True,
368
  )
369
  monitor_thread.start()
@@ -533,51 +567,52 @@ def generate_rss_feed(storage_location: str, url: str) -> str:
533
  logging.error(f"Error generating RSS feed for {url}: {e}")
534
  return f"Error generating RSS feed for {url}: {e}"
535
 
536
- # Function to load the Mistral model
537
- def load_model():
538
- """
539
- Loads the Mistral model and tokenizer once and returns the pipeline.
540
- """
541
- model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
542
- try:
543
- tokenizer = AutoTokenizer.from_pretrained(model_name)
544
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
545
- pipe = pipeline(
546
- "text-generation",
547
- model=model,
548
- tokenizer=tokenizer,
549
- device=0 if torch.cuda.is_available() else -1,
550
- )
551
- logging.info("Mistral model loaded successfully.")
552
- return pipe
553
- except Exception as e:
554
- logging.error(f"Error loading Mistral model: {e}")
555
- return None
556
-
557
- # Load the model once at the start
558
- chat_pipeline = load_model()
559
-
560
- # Function to parse user commands
561
  def parse_command(message: str) -> tuple:
562
  """
563
- Parses the user message to identify if it contains a command.
564
  Returns the command and its parameters if found, else (None, None).
565
  """
566
  # Define command patterns
567
- patterns = {
568
- "filter": r"filter\s+(?P<words>[\w\s,]+)\s+in\s+column\s+(?P<column>\w+)",
569
- "sort": r"sort\s+(?P<column>\w+)\s+(?P<order>ascending|descending)",
570
- "export": r"export\s+to\s+csv\s+as\s+(?P<filename>\w+\.csv)",
571
- "log": r"log\s+action\s+(?P<action>.+)",
572
- }
573
 
574
- for command, pattern in patterns.items():
575
- match = re.search(pattern, message, re.IGNORECASE)
 
576
  if match:
577
- params = match.groupdict()
578
- return command, params
 
579
 
580
- return None, None
581
 
582
  # Function to execute parsed commands
583
  def execute_command(command: str, params: dict) -> str:
@@ -585,7 +620,7 @@ def execute_command(command: str, params: dict) -> str:
585
  Executes the corresponding function based on the command and parameters.
586
  """
587
  if command == "filter":
588
- words = [word.strip() for word in params["words"].split(",")]
589
  column = params["column"]
590
  return filter_data(column, words)
591
  elif command == "sort":
@@ -609,7 +644,6 @@ def filter_data(column: str, words: list) -> str:
609
  """
610
  try:
611
  storage_location = DEFAULT_FILE_PATH
612
- url = "" # Placeholder since filtering isn't URL-specific here
613
 
614
  connection = get_db_connection()
615
  if connection:
@@ -635,7 +669,8 @@ def filter_data(column: str, words: list) -> str:
635
  return f"No records found with words {words} in column '{column}'."
636
 
637
  # Save the filtered data to a new CSV
638
- filtered_csv = os.path.join(storage_location, f"filtered_data_{int(time.time())}.csv")
 
639
  filtered_df.to_csv(filtered_csv, index=False)
640
  logging.info(f"Data filtered on column '{column}' for words {words}.")
641
  return f"Data filtered and saved to {filtered_csv}."
@@ -663,7 +698,8 @@ def filter_data(column: str, words: list) -> str:
663
  return f"No records found with words {words} in column '{column}'."
664
 
665
  # Save the filtered data to a new CSV
666
- filtered_csv = latest_csv.replace(".csv", f"_filtered_{int(time.time())}.csv")
 
667
  filtered_df.to_csv(filtered_csv, index=False)
668
  logging.info(f"Data filtered on column '{column}' for words {words}.")
669
  return f"Data filtered and saved to {filtered_csv}."
@@ -678,7 +714,6 @@ def sort_data(column: str, order: str) -> str:
678
  """
679
  try:
680
  storage_location = DEFAULT_FILE_PATH
681
- url = "" # Placeholder since sorting isn't URL-specific here
682
 
683
  connection = get_db_connection()
684
  if connection:
@@ -700,7 +735,8 @@ def sort_data(column: str, order: str) -> str:
700
  sorted_df = df.sort_values(by=column, ascending=ascending)
701
 
702
  # Save the sorted data to a new CSV
703
- sorted_csv = os.path.join(storage_location, f"sorted_data_{column}_{order.lower()}_{int(time.time())}.csv")
 
704
  sorted_df.to_csv(sorted_csv, index=False)
705
  logging.info(f"Data sorted on column '{column}' in {order} order.")
706
  return f"Data sorted and saved to {sorted_csv}."
@@ -726,7 +762,8 @@ def sort_data(column: str, order: str) -> str:
726
  sorted_df = df.sort_values(by=column, ascending=ascending)
727
 
728
  # Save the sorted data to a new CSV
729
- sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}_{int(time.time())}.csv")
 
730
  sorted_df.to_csv(sorted_csv, index=False)
731
  logging.info(f"Data sorted on column '{column}' in {order} order.")
732
  return f"Data sorted and saved to {sorted_csv}."
@@ -988,6 +1025,23 @@ def create_interface() -> gr.Blocks:
988
  label="RSS Feed Output", interactive=False, lines=20
989
  )
990
 
991
  # Connect buttons to their respective functions
992
  start_button.click(
993
  fn=start_scraping,
@@ -997,6 +1051,7 @@ def create_interface() -> gr.Blocks:
997
  scrape_interval,
998
  content_type,
999
  selector,
 
1000
  ],
1001
  outputs=status_output,
1002
  )
@@ -1015,6 +1070,12 @@ def create_interface() -> gr.Blocks:
1015
  outputs=rss_output,
1016
  )
1017
 
1018
  # Connect message submission to the chat interface
1019
  def update_chat(message_input, history, system_msg, max_toks, temp, top_p_val):
1020
  if not message_input.strip():
@@ -1046,9 +1107,114 @@ def create_interface() -> gr.Blocks:
1046
 
1047
  return demo
1048
 
1049
- # Initialize database on script start
1050
- initialize_database()
1051
 
 
1052
  if __name__ == "__main__":
1053
  demo = create_interface()
1054
- demo.launch()
 
26
  import xml.etree.ElementTree as ET
27
  import torch
28
  import mysql.connector
29
+ from mysql.connector import errorcode, pooling
30
  from dotenv import load_dotenv
31
+ import spacy
32
+ import unittest
33
 
34
  # Load environment variables from .env file
35
  load_dotenv()
 
39
  level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
40
  )
41
 
42
+ # Initialize spaCy
43
+ nlp = spacy.load("en_core_web_sm")
44
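The new import block wires in spaCy for command parsing. As a side note, spacy.load() raises OSError when the en_core_web_sm package is not installed; a minimal standalone sketch of a download-on-first-run guard (not part of this commit):

# Sketch (not from app.py): load en_core_web_sm, downloading it on first run.
import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

doc = nlp("Filter apples, oranges in column Description")
print([token.text for token in doc])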
+
45
  # Define constants
46
  DEFAULT_FILE_PATH = "scraped_data"
47
  PURPOSE = (
 
54
  CURRENT_TASK = None
55
  STOP_THREADS = False # Flag to stop scraping threads
56
 
57
+ # Database Pooling Configuration
58
+ DB_POOL_NAME = "mypool"
59
+ DB_POOL_SIZE = 5 # Adjust based on expected load
60
+
61
+ try:
62
+ dbconfig = {
63
+ "host": os.getenv("DB_HOST"),
64
+ "user": os.getenv("DB_USER"),
65
+ "password": os.getenv("DB_PASSWORD"),
66
+ "database": os.getenv("DB_NAME"),
67
+ }
68
+ connection_pool = mysql.connector.pooling.MySQLConnectionPool(
69
+ pool_name=DB_POOL_NAME,
70
+ pool_size=DB_POOL_SIZE,
71
+ pool_reset_session=True,
72
+ **dbconfig
73
+ )
74
+ logging.info("Database connection pool created successfully.")
75
+ except mysql.connector.Error as err:
76
+ logging.warning(f"Database connection pool creation failed: {err}")
77
+ connection_pool = None # Will use CSV as fallback
78
+
79
+ # Function to get a database connection from the pool
80
  def get_db_connection():
81
  """
82
+ Retrieves a connection from the pool. Returns None if pool is not available.
 
83
  """
84
+ if connection_pool:
85
+ try:
86
+ connection = connection_pool.get_connection()
87
+ if connection.is_connected():
88
+ return connection
89
+ except mysql.connector.Error as err:
90
+ logging.error(f"Error getting connection from pool: {err}")
91
  return None
92
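For readers unfamiliar with mysql-connector pooling, here is a small self-contained sketch of the pattern the new get_db_connection() relies on. The credentials are placeholders, not the app's .env values:

# Sketch (placeholder credentials): the pooling pattern behind get_db_connection().
import mysql.connector
from mysql.connector import pooling

dbconfig = {
    "host": "localhost",        # placeholder
    "user": "scraper",          # placeholder
    "password": "secret",       # placeholder
    "database": "scraper_db",   # placeholder
}
pool = pooling.MySQLConnectionPool(pool_name="demo_pool", pool_size=3, **dbconfig)

conn = pool.get_connection()    # borrow a connection from the pool
try:
    cur = conn.cursor()
    cur.execute("SELECT 1")
    print(cur.fetchone())
    cur.close()
finally:
    conn.close()                # returns the connection to the pool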
 
93
+ # Initialize Database: Create tables and indexes
94
  def initialize_database():
95
  """
96
+ Initializes the database by creating necessary tables and indexes if they do not exist.
97
  """
98
  connection = get_db_connection()
99
  if connection is None:
 
114
  cursor.execute(create_scraped_data_table)
115
  logging.info("Table 'scraped_data' is ready.")
116
 
117
+ # Create indexes for performance
118
+ create_index_url = "CREATE INDEX IF NOT EXISTS idx_url ON scraped_data(url)"
119
+ create_index_change = "CREATE INDEX IF NOT EXISTS idx_change_detected ON scraped_data(change_detected)"
120
+ cursor.execute(create_index_url)
121
+ cursor.execute(create_index_change)
122
+ logging.info("Indexes on 'url' and 'change_detected' columns created.")
123
+
124
  # Create table for action logs
125
  create_action_logs_table = """
126
  CREATE TABLE IF NOT EXISTS action_logs (
 
133
  logging.info("Table 'action_logs' is ready.")
134
 
135
  except mysql.connector.Error as err:
136
+ logging.error(f"Error initializing database: {err}")
137
  finally:
138
  cursor.close()
139
  connection.close()
140
  logging.info("Database initialization complete.")
141
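One caveat on the index statements above: stock MySQL (unlike MariaDB) does not accept IF NOT EXISTS on CREATE INDEX, so a common workaround is to consult information_schema first. A rough sketch, reusing the table and index names from initialize_database():

# Sketch: create an index only if it is missing (plain MySQL has no
# CREATE INDEX IF NOT EXISTS). Table and index names mirror the ones above.
def ensure_index(cursor, table: str, index: str, column: str) -> None:
    cursor.execute(
        """
        SELECT COUNT(*) FROM information_schema.statistics
        WHERE table_schema = DATABASE() AND table_name = %s AND index_name = %s
        """,
        (table, index),
    )
    (count,) = cursor.fetchone()
    if count == 0:
        cursor.execute(f"CREATE INDEX {index} ON {table}({column})")

# ensure_index(cursor, "scraped_data", "idx_url", "url")
# ensure_index(cursor, "scraped_data", "idx_change_detected", "change_detected")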
 
142
+ # Function to create WebDriver
143
+ def create_driver(options: Options) -> webdriver.Chrome:
144
+ """
145
+ Initializes and returns a Selenium Chrome WebDriver instance.
146
+ """
147
+ try:
148
+ driver = webdriver.Chrome(
149
+ service=Service(ChromeDriverManager().install()), options=options
150
+ )
151
+ logging.info("ChromeDriver initialized successfully.")
152
+ return driver
153
+ except Exception as exception:
154
+ logging.error(f"Error initializing ChromeDriver: {exception}")
155
+ return None
156
+
157
+ # Function to log changes to CSV
158
+ def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
159
+ """
160
+ Logs the change to a CSV file in the storage_location.
161
+ """
162
+ try:
163
+ os.makedirs(storage_location, exist_ok=True)
164
+ csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
165
+ file_exists = os.path.isfile(csv_file_path)
166
+
167
+ with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
168
+ fieldnames = ["date", "time", "url", "content_hash", "change"]
169
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
170
+ if not file_exists:
171
+ writer.writeheader()
172
+ writer.writerow(
173
+ {
174
+ "date": change_detected.split()[0],
175
+ "time": change_detected.split()[1],
176
+ "url": url,
177
+ "content_hash": content_hash,
178
+ "change": "Content changed",
179
+ }
180
+ )
181
+ logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
182
+ except Exception as e:
183
+ logging.error(f"Error logging data to CSV: {e}")
184
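An illustrative call to the log_to_csv() helper above; the hash and timestamp are fabricated sample values:

# Illustrative call to log_to_csv() with fabricated sample values.
import datetime
import hashlib

sample_hash = hashlib.md5(b"<html>...</html>").hexdigest()
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
log_to_csv("scraped_data", "https://example.com", sample_hash, now)
# appends a row to scraped_data/example.com_changes.csv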
+
185
+ # Function to get initial observation
186
+ def get_initial_observation(
187
+ driver: webdriver.Chrome, url: str, content_type: str, selector: str = None
188
+ ) -> str:
189
+ """
190
+ Retrieves the initial content from the URL and returns its MD5 hash.
191
+ """
192
+ try:
193
+ driver.get(url)
194
+ WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
195
+ time.sleep(2) # Additional wait for dynamic content
196
+
197
+ if content_type == "text":
198
+ initial_content = driver.page_source
199
+ elif content_type == "media":
200
+ if selector:
201
+ try:
202
+ elements = WebDriverWait(driver, 5).until(
203
+ EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
204
+ )
205
+ initial_content = [element.get_attribute("src") for element in elements]
206
+ except TimeoutException:
207
+ logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
208
+ initial_content = []
209
+ else:
210
+ elements = driver.find_elements(By.TAG_NAME, "img")
211
+ initial_content = [element.get_attribute("src") for element in elements]
212
+ else:
213
+ initial_content = driver.page_source
214
+
215
+ initial_hash = hashlib.md5(str(initial_content).encode("utf-8")).hexdigest()
216
+ logging.info(f"Initial hash for {url}: {initial_hash}")
217
+ return initial_hash
218
+ except Exception as exception:
219
+ logging.error(f"Error accessing {url}: {exception}")
220
+ return None
221
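The MD5 fingerprinting used by get_initial_observation() does not depend on Selenium; for static pages the same check can be sketched with a plain HTTP fetch (an illustration, not the app's code path):

# Sketch: same MD5 fingerprint idea, but with requests instead of a headless browser.
import hashlib
from typing import Optional

import requests

def page_hash(url: str) -> Optional[str]:
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return hashlib.md5(response.text.encode("utf-8")).hexdigest()
    except requests.RequestException:
        return None

print(page_hash("https://example.com"))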
+
222
  # Function to monitor URLs for changes
223
  def monitor_urls(
224
  storage_location: str,
 
226
  scrape_interval: int,
227
  content_type: str,
228
  selector: str = None,
229
+ progress: gr.Progress = None
230
  ):
231
  """
232
  Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
 
247
  try:
248
  while not STOP_THREADS:
249
  for url in urls:
250
+ if STOP_THREADS:
251
+ break
252
  try:
253
  driver.get(url)
254
  WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
 
301
  # Fallback to CSV
302
  log_to_csv(storage_location, url, current_hash, date_time_str)
303
 
304
+ # Update progress
305
+ if progress:
306
+ progress(1)
307
  except (
308
  NoSuchElementException,
309
  StaleElementReferenceException,
 
311
  Exception,
312
  ) as e:
313
  logging.error(f"Error accessing {url}: {e}")
314
+ if progress:
315
+ progress(1)
316
  time.sleep(scrape_interval * 60) # Wait for the next scrape interval
317
  finally:
318
  driver.quit()
319
  logging.info("ChromeDriver session ended.")
320
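monitor_urls() relies on the global STOP_THREADS flag for cooperative shutdown. The same pattern, sketched with threading.Event, which also lets the sleep wake early when a stop is requested:

# Sketch: the cooperative-stop loop with threading.Event instead of a global flag.
import threading

stop_event = threading.Event()

def poll(urls, interval_seconds: int) -> None:
    while not stop_event.is_set():
        for url in urls:
            if stop_event.is_set():
                break
            print(f"checking {url}")          # real code would fetch and hash here
        stop_event.wait(interval_seconds)     # sleeps, but wakes early when stop is set

worker = threading.Thread(target=poll, args=(["https://example.com"], 60), daemon=True)
worker.start()
# ... later, to shut down:
stop_event.set()
worker.join()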
321
  # Function to start scraping
322
  def start_scraping(
323
  storage_location: str,
 
325
  scrape_interval: int,
326
  content_type: str,
327
  selector: str = None,
328
+ progress: gr.Progress = None
329
  ) -> str:
330
  """
331
+ Starts the scraping process in a separate thread with progress indication.
332
  """
333
  global CURRENT_TASK, HISTORY, STOP_THREADS
334
 
 
343
  # Initialize database tables
344
  initialize_database()
345
 
346
+ # Log initial observations
347
+ def log_initial_observations():
348
+ options = Options()
349
+ options.add_argument("--headless")
350
+ options.add_argument("--no-sandbox")
351
+ options.add_argument("--disable-dev-shm-usage")
352
 
353
+ driver = create_driver(options)
354
+ if driver is None:
355
+ return
356
+
357
+ for url in url_list:
358
+ if STOP_THREADS:
359
+ break
360
+ try:
361
+ initial_hash = get_initial_observation(driver, url, content_type, selector)
362
+ if initial_hash:
363
+ date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
364
+ HISTORY.append(f"Initial observation at {url}: {initial_hash}")
365
+
366
+ # Attempt to log to database
367
+ connection = get_db_connection()
368
+ if connection:
369
+ try:
370
+ cursor = connection.cursor()
371
+ insert_query = """
372
+ INSERT INTO scraped_data (url, content_hash, change_detected)
373
+ VALUES (%s, %s, %s)
374
+ """
375
+ cursor.execute(insert_query, (url, initial_hash, date_time_str))
376
+ connection.commit()
377
+ logging.info(f"Initial observation logged for {url} in database.")
378
+ except mysql.connector.Error as err:
379
+ logging.error(f"Error inserting initial observation into database: {err}")
380
+ # Fallback to CSV
381
+ log_to_csv(storage_location, url, initial_hash, date_time_str)
382
+ finally:
383
+ cursor.close()
384
+ connection.close()
385
+ else:
386
  # Fallback to CSV
387
+ log_to_csv(storage_location, url, initial_hash, date_time_str)
388
+ except Exception as e:
389
+ HISTORY.append(f"Error accessing {url}: {e}")
390
+ logging.error(f"Error accessing {url}: {e}")
391
+ driver.quit()
 
 
392
 
393
+ # Start logging initial observations
394
+ initial_thread = threading.Thread(target=log_initial_observations, daemon=True)
395
+ initial_thread.start()
 
 
396
 
397
+ # Start the monitoring thread with progress
398
  monitor_thread = threading.Thread(
399
  target=monitor_urls,
400
+ args=(storage_location, url_list, scrape_interval, content_type, selector, progress),
401
  daemon=True,
402
  )
403
  monitor_thread.start()
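The insert-with-CSV-fallback step inside log_initial_observations() can be read in isolation as follows; this sketch reuses the get_db_connection() and log_to_csv() helpers defined earlier and is only an illustration:

# Sketch: the insert-with-CSV-fallback step in isolation.
import datetime
import mysql.connector

def record_observation(storage_location: str, url: str, content_hash: str) -> None:
    when = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    connection = get_db_connection()
    if connection is None:
        log_to_csv(storage_location, url, content_hash, when)
        return
    cursor = connection.cursor()
    try:
        cursor.execute(
            "INSERT INTO scraped_data (url, content_hash, change_detected) VALUES (%s, %s, %s)",
            (url, content_hash, when),
        )
        connection.commit()
    except mysql.connector.Error:
        log_to_csv(storage_location, url, content_hash, when)
    finally:
        cursor.close()
        connection.close()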
 
567
  logging.error(f"Error generating RSS feed for {url}: {e}")
568
  return f"Error generating RSS feed for {url}: {e}"
569
 
570
+ # Function to parse user commands using spaCy
571
  def parse_command(message: str) -> tuple:
572
  """
573
+ Parses the user message using spaCy to identify if it contains a command.
574
  Returns the command and its parameters if found, else (None, None).
575
  """
576
+ doc = nlp(message.lower())
577
+ command = None
578
+ params = {}
579
+
580
  # Define command patterns
581
+ if "filter" in message.lower():
582
+ # Example: "Filter apples, oranges in column Description"
583
+ match = re.search(r"filter\s+([\w\s,]+)\s+in\s+column\s+(\w+)", message, re.IGNORECASE)
584
+ if match:
585
+ words = [word.strip() for word in match.group(1).split(",")]
586
+ column = match.group(2)
587
+ command = "filter"
588
+ params = {"words": words, "column": column}
589
+
590
+ elif "sort" in message.lower():
591
+ # Example: "Sort Price ascending"
592
+ match = re.search(r"sort\s+(\w+)\s+(ascending|descending)", message, re.IGNORECASE)
593
+ if match:
594
+ column = match.group(1)
595
+ order = match.group(2)
596
+ command = "sort"
597
+ params = {"column": column, "order": order}
598
+
599
+ elif "export to csv as" in message.lower():
600
+ # Example: "Export to CSV as filtered_data.csv"
601
+ match = re.search(r"export\s+to\s+csv\s+as\s+([\w\-]+\.csv)", message, re.IGNORECASE)
602
+ if match:
603
+ filename = match.group(1)
604
+ command = "export"
605
+ params = {"filename": filename}
606
 
607
+ elif "log action" in message.lower():
608
+ # Example: "Log action Filtered data for specific fruits"
609
+ match = re.search(r"log\s+action\s+(.+)", message, re.IGNORECASE)
610
  if match:
611
+ action = match.group(1)
612
+ command = "log"
613
+ params = {"action": action}
614
 
615
+ return command, params
616
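Illustrative inputs and the tuples the parse_command() defined above would return:

print(parse_command("Filter apples, oranges in column Description"))
# ('filter', {'words': ['apples', 'oranges'], 'column': 'Description'})

print(parse_command("Sort Price ascending"))
# ('sort', {'column': 'Price', 'order': 'ascending'})

print(parse_command("Export to CSV as filtered_data.csv"))
# ('export', {'filename': 'filtered_data.csv'})

print(parse_command("What changed today?"))
# (None, {})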
 
617
  # Function to execute parsed commands
618
  def execute_command(command: str, params: dict) -> str:
 
620
  Executes the corresponding function based on the command and parameters.
621
  """
622
  if command == "filter":
623
+ words = params["words"]
624
  column = params["column"]
625
  return filter_data(column, words)
626
  elif command == "sort":
 
644
  """
645
  try:
646
  storage_location = DEFAULT_FILE_PATH
 
647
 
648
  connection = get_db_connection()
649
  if connection:
 
669
  return f"No records found with words {words} in column '{column}'."
670
 
671
  # Save the filtered data to a new CSV
672
+ timestamp = int(time.time())
673
+ filtered_csv = os.path.join(storage_location, f"filtered_data_{timestamp}.csv")
674
  filtered_df.to_csv(filtered_csv, index=False)
675
  logging.info(f"Data filtered on column '{column}' for words {words}.")
676
  return f"Data filtered and saved to {filtered_csv}."
 
698
  return f"No records found with words {words} in column '{column}'."
699
 
700
  # Save the filtered data to a new CSV
701
+ timestamp = int(time.time())
702
+ filtered_csv = latest_csv.replace(".csv", f"_filtered_{timestamp}.csv")
703
  filtered_df.to_csv(filtered_csv, index=False)
704
  logging.info(f"Data filtered on column '{column}' for words {words}.")
705
  return f"Data filtered and saved to {filtered_csv}."
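The kind of case-insensitive substring filter filter_data() applies can be sketched on a toy DataFrame; this is an illustration, as the exact matching logic sits in unchanged context not shown in this diff:

# Sketch: case-insensitive substring filter on one column.
import pandas as pd

df = pd.DataFrame({"Description": ["Fresh apples", "Ripe bananas", "Juicy oranges"]})
words = ["apples", "oranges"]

pattern = "|".join(words)
filtered_df = df[df["Description"].astype(str).str.contains(pattern, case=False, na=False)]
print(filtered_df)   # rows 0 ("Fresh apples") and 2 ("Juicy oranges") remain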
 
714
  """
715
  try:
716
  storage_location = DEFAULT_FILE_PATH
 
717
 
718
  connection = get_db_connection()
719
  if connection:
 
735
  sorted_df = df.sort_values(by=column, ascending=ascending)
736
 
737
  # Save the sorted data to a new CSV
738
+ timestamp = int(time.time())
739
+ sorted_csv = os.path.join(storage_location, f"sorted_data_{column}_{order.lower()}_{timestamp}.csv")
740
  sorted_df.to_csv(sorted_csv, index=False)
741
  logging.info(f"Data sorted on column '{column}' in {order} order.")
742
  return f"Data sorted and saved to {sorted_csv}."
 
762
  sorted_df = df.sort_values(by=column, ascending=ascending)
763
 
764
  # Save the sorted data to a new CSV
765
+ timestamp = int(time.time())
766
+ sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}_{timestamp}.csv")
767
  sorted_df.to_csv(sorted_csv, index=False)
768
  logging.info(f"Data sorted on column '{column}' in {order} order.")
769
  return f"Data sorted and saved to {sorted_csv}."
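Likewise, the sort-and-save step reduces to sort_values() plus the timestamped file name pattern introduced above; a small sketch:

# Sketch: sort a column and save with a timestamped file name.
import time
import pandas as pd

df = pd.DataFrame({"Price": [3.5, 1.2, 2.8]})
order = "ascending"

sorted_df = df.sort_values(by="Price", ascending=(order == "ascending"))
sorted_csv = f"sorted_data_Price_{order}_{int(time.time())}.csv"
sorted_df.to_csv(sorted_csv, index=False)
print(sorted_csv)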
 
1025
  label="RSS Feed Output", interactive=False, lines=20
1026
  )
1027
 
1028
+ # Historical Data View
1029
+ with gr.Row():
1030
+ historical_view_url = gr.Textbox(
1031
+ label="Select URL for Historical Data",
1032
+ placeholder="https://example.com",
1033
+ )
1034
+ historical_button = gr.Button("View Historical Data")
1035
+ historical_output = gr.Dataframe(
1036
+ headers=["ID", "URL", "Content Hash", "Change Detected"],
1037
+ label="Historical Data",
1038
+ interactive=False
1039
+ )
1040
+
1041
+ # Progress Indicator
1042
+ with gr.Row():
1043
+ progress = gr.Progress(label="Scraping Progress")
1044
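A hedged aside on gr.Progress: in recent Gradio releases the documented pattern injects gr.Progress() as a default argument of the event handler and reports progress from inside it, rather than instantiating it like a layout component. A minimal sketch of that pattern, separate from the wiring in this commit:

# Sketch: Gradio progress reporting via a handler default argument.
import time
import gradio as gr

def slow_task(n_steps, progress=gr.Progress()):
    for _ in progress.tqdm(range(int(n_steps)), desc="Scraping"):
        time.sleep(0.1)
    return f"Finished {int(n_steps)} steps"

with gr.Blocks() as demo:
    steps = gr.Slider(1, 50, value=10, step=1, label="Steps")
    run = gr.Button("Run")
    out = gr.Textbox(label="Status")
    run.click(fn=slow_task, inputs=steps, outputs=out)

# demo.launch()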
+
1045
  # Connect buttons to their respective functions
1046
  start_button.click(
1047
  fn=start_scraping,
 
1051
  scrape_interval,
1052
  content_type,
1053
  selector,
1054
+ progress,
1055
  ],
1056
  outputs=status_output,
1057
  )
 
1070
  outputs=rss_output,
1071
  )
1072
 
1073
+ historical_button.click(
1074
+ fn=display_historical_data,
1075
+ inputs=[storage_location, historical_view_url],
1076
+ outputs=historical_output,
1077
+ )
1078
+
1079
  # Connect message submission to the chat interface
1080
  def update_chat(message_input, history, system_msg, max_toks, temp, top_p_val):
1081
  if not message_input.strip():
 
1107
 
1108
  return demo
1109
 
1110
+ # Function to display historical data
1111
+ def display_historical_data(storage_location: str, url: str):
1112
+ """
1113
+ Retrieves and displays historical scraping data for a given URL.
1114
+ """
1115
+ try:
1116
+ connection = get_db_connection()
1117
+ if connection:
1118
+ try:
1119
+ cursor = connection.cursor(dictionary=True)
1120
+ query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
1121
+ cursor.execute(query, (url,))
1122
+ results = cursor.fetchall()
1123
+
1124
+ if not results:
1125
+ return pd.DataFrame()
1126
+
1127
+ df = pd.DataFrame(results)
1128
+ cursor.close()
1129
+ connection.close()
1130
+ return df
1131
+ except mysql.connector.Error as err:
1132
+ logging.error(f"Error fetching historical data from database: {err}")
1133
+ # Fallback to CSV
1134
+ else:
1135
+ logging.info("No database connection. Fetching historical data from CSV.")
1136
+
1137
+ # Fallback to CSV
1138
+ hostname = urlparse(url).hostname
1139
+ csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
1140
+ if os.path.exists(csv_path):
1141
+ df = pd.read_csv(csv_path)
1142
+ return df
1143
+ else:
1144
+ return pd.DataFrame()
1145
+ except Exception as e:
1146
+ logging.error(f"Error fetching historical data for {url}: {e}")
1147
+ return pd.DataFrame()
1148
+
1149
+ # Function to load the Mistral model
1150
+ def load_model():
1151
+ """
1152
+ Loads the Mistral model and tokenizer once and returns the pipeline.
1153
+ """
1154
+ model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
1155
+ try:
1156
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
1157
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
1158
+ pipe = pipeline(
1159
+ "text-generation",
1160
+ model=model,
1161
+ tokenizer=tokenizer,
1162
+ device=0 if torch.cuda.is_available() else -1,
1163
+ )
1164
+ logging.info("Mistral model loaded successfully.")
1165
+ return pipe
1166
+ except Exception as e:
1167
+ logging.error(f"Error loading Mistral model: {e}")
1168
+ return None
1169
+
1170
+ # Load the model once at the start
1171
+ chat_pipeline = load_model()
1172
+
1173
+ # Automated Testing using unittest
1174
+ class TestApp(unittest.TestCase):
1175
+ def test_parse_command_filter(self):
1176
+ command = "Filter apples, oranges in column Description"
1177
+ parsed_command = parse_command(command)
1178
+ self.assertEqual(parsed_command[0], "filter")
1179
+ self.assertListEqual(parsed_command[1]["words"], ["apples", "oranges"])
1180
+ self.assertEqual(parsed_command[1]["column"], "Description")
1181
+
1182
+ def test_parse_command_sort(self):
1183
+ command = "Sort Price ascending"
1184
+ parsed_command = parse_command(command)
1185
+ self.assertEqual(parsed_command[0], "sort")
1186
+ self.assertEqual(parsed_command[1]["column"], "Price")
1187
+ self.assertEqual(parsed_command[1]["order"], "ascending")
1188
+
1189
+ def test_parse_command_export(self):
1190
+ command = "Export to CSV as filtered_data.csv"
1191
+ parsed_command = parse_command(command)
1192
+ self.assertEqual(parsed_command[0], "export")
1193
+ self.assertEqual(parsed_command[1]["filename"], "filtered_data.csv")
1194
+
1195
+ def test_parse_command_log(self):
1196
+ command = "Log action Filtered data for specific fruits"
1197
+ parsed_command = parse_command(command)
1198
+ self.assertEqual(parsed_command[0], "log")
1199
+ self.assertEqual(parsed_command[1]["action"], "Filtered data for specific fruits")
1200
+
1201
+ def test_database_connection(self):
1202
+ connection = get_db_connection()
1203
+ # Connection may be None if not configured; adjust the test accordingly
1204
+ if connection:
1205
+ self.assertTrue(connection.is_connected())
1206
+ connection.close()
1207
+ else:
1208
+ self.assertIsNone(connection)
1209
 
1210
+ # Main execution
1211
  if __name__ == "__main__":
1212
+ # Initialize database
1213
+ initialize_database()
1214
+
1215
+ # Create and launch Gradio interface
1216
  demo = create_interface()
1217
+ demo.launch()
1218
+
1219
+ # Run automated tests
1220
+ unittest.main(argv=[''], exit=False)