acecalisto3 committed on
Commit deaafee
1 Parent(s): ab65de3

Update app.py

Files changed (1):
  1. app.py +360 -148

app.py CHANGED
@@ -53,6 +53,7 @@ STOP_THREADS = False  # Flag to stop scraping threads
 def get_db_connection():
     """
     Establishes and returns a MySQL database connection using environment variables.
     """
     try:
         connection = mysql.connector.connect(
@@ -66,11 +67,11 @@ def get_db_connection():
         return connection
     except mysql.connector.Error as err:
         if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
-            logging.error("Invalid database credentials.")
         elif err.errno == errorcode.ER_BAD_DB_ERROR:
-            logging.error("Database does not exist.")
         else:
-            logging.error(err)
         return None

 # Initialize Database
@@ -80,7 +81,7 @@ def initialize_database():
     """
     connection = get_db_connection()
     if connection is None:
-        logging.error("Failed to connect to the database. Initialization aborted.")
         return

     cursor = connection.cursor()
@@ -124,7 +125,7 @@ def monitor_urls(
     selector: str = None,
 ):
     """
-    Monitors the specified URLs for changes and logs any detected changes to the database.
     """
     global HISTORY, STOP_THREADS
     previous_hashes = {url: "" for url in urls}
@@ -171,21 +172,28 @@ def monitor_urls(
                 date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                 HISTORY.append(f"Change detected at {url} on {date_time_str}")

-                # Insert change into MySQL database
                 connection = get_db_connection()
                 if connection:
-                    cursor = connection.cursor()
-                    insert_query = """
-                    INSERT INTO scraped_data (url, content_hash, change_detected)
-                    VALUES (%s, %s, %s)
-                    """
-                    cursor.execute(insert_query, (url, current_hash, date_time_str))
-                    connection.commit()
-                    cursor.close()
-                    connection.close()
-                    logging.info(f"Change detected at {url} on {date_time_str} and logged to database.")
                 else:
-                    logging.error("Failed to connect to database. Change not logged.")

         except (
             NoSuchElementException,
@@ -199,6 +207,33 @@ def monitor_urls(
         driver.quit()
         logging.info("ChromeDriver session ended.")

 # Function to create WebDriver
 def create_driver(options: Options) -> webdriver.Chrome:
     """
@@ -296,21 +331,28 @@ def start_scraping(
             if initial_hash:
                 HISTORY.append(f"Initial observation at {url}: {initial_hash}")

-                # Insert initial observation into MySQL database
                 connection = get_db_connection()
                 if connection:
-                    cursor = connection.cursor()
-                    insert_query = """
-                    INSERT INTO scraped_data (url, content_hash, change_detected)
-                    VALUES (%s, %s, %s)
-                    """
-                    cursor.execute(insert_query, (url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
-                    connection.commit()
-                    cursor.close()
-                    connection.close()
-                    logging.info(f"Initial observation logged for {url}")
                 else:
-                    logging.error("Failed to connect to database. Initial observation not logged.")

         except Exception as e:
             HISTORY.append(f"Error accessing {url}: {e}")
@@ -339,91 +381,157 @@ def stop_scraping() -> str:
     logging.info("Scraping stop signal sent.")
     return "Scraping has been stopped."

-# Function to display CSV content from MySQL
 def display_csv(storage_location: str, url: str) -> str:
     """
-    Fetches and returns the scraped data for a given URL from the MySQL database.
     """
     try:
         connection = get_db_connection()
-        if not connection:
-            return "Failed to connect to the database."
-
-        cursor = connection.cursor(dictionary=True)
-        query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
-        cursor.execute(query, (url,))
-        results = cursor.fetchall()
-
-        if not results:
-            return "No data available for the selected URL."
-
-        df = pd.DataFrame(results)
-        cursor.close()
-        connection.close()
-        return df.to_string(index=False)
     except Exception as e:
         logging.error(f"Error fetching data for {url}: {e}")
         return f"Error fetching data for {url}: {e}"

-# Function to generate RSS feed from MySQL data
 def generate_rss_feed(storage_location: str, url: str) -> str:
     """
-    Generates an RSS feed for the latest changes detected on a given URL from the MySQL database.
     """
     try:
         connection = get_db_connection()
-        if not connection:
-            return "Failed to connect to the database."
-
-        cursor = connection.cursor(dictionary=True)
-        query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC LIMIT 10"
-        cursor.execute(query, (url,))
-        results = cursor.fetchall()
-
-        if not results:
-            return "No changes detected to include in RSS feed."
-
-        # Create the root RSS element
-        rss = ET.Element("rss", version="2.0")
-        channel = ET.SubElement(rss, "channel")
-
-        # Add channel elements
-        title = ET.SubElement(channel, "title")
-        title.text = f"RSS Feed for {urlparse(url).hostname}"
-
-        link = ET.SubElement(channel, "link")
-        link.text = url
-
-        description = ET.SubElement(channel, "description")
-        description.text = "Recent changes detected on the website."
-
-        # Add items to the feed
-        for row in results:
-            item = ET.SubElement(channel, "item")
-
-            item_title = ET.SubElement(item, "title")
-            item_title.text = f"Change detected at {row['url']}"
-
-            item_link = ET.SubElement(item, "link")
-            item_link.text = row["url"]
-
-            item_description = ET.SubElement(item, "description")
-            item_description.text = f"Content changed on {row['change_detected']}"
-
-            pub_date = ET.SubElement(item, "pubDate")
-            pub_date.text = datetime.datetime.strptime(
-                str(row['change_detected']), "%Y-%m-%d %H:%M:%S"
-            ).strftime("%a, %d %b %Y %H:%M:%S +0000")
-
-        # Generate the XML string
-        rss_feed = ET.tostring(rss, encoding="utf-8", method="xml")
-        return rss_feed.decode("utf-8")
     except Exception as e:
         logging.error(f"Error generating RSS feed for {url}: {e}")
         return f"Error generating RSS feed for {url}: {e}"
-    finally:
-        cursor.close()
-        connection.close()

 # Function to load the Mistral model
 def load_model():
@@ -500,20 +608,62 @@ def filter_data(column: str, words: list) -> str:
     Saves the filtered data to a new CSV file.
     """
     try:
-        latest_csv = get_latest_csv()
-        if not latest_csv:
             return "No CSV files found to filter."

         df = pd.read_csv(latest_csv)
-        # Create a regex pattern to match any of the words
-        pattern = '|'.join(words)
-        filtered_df = df[df[column].astype(str).str.contains(pattern, case=False, na=False)]

         if filtered_df.empty:
             return f"No records found with words {words} in column '{column}'."

         # Save the filtered data to a new CSV
-        filtered_csv = latest_csv.replace(".csv", "_filtered.csv")
         filtered_df.to_csv(filtered_csv, index=False)
         logging.info(f"Data filtered on column '{column}' for words {words}.")
         return f"Data filtered and saved to {filtered_csv}."
@@ -527,16 +677,56 @@ def sort_data(column: str, order: str) -> str:
     Saves the sorted data to a new CSV file.
     """
     try:
-        latest_csv = get_latest_csv()
-        if not latest_csv:
             return "No CSV files found to sort."

         df = pd.read_csv(latest_csv)
         ascending = True if order.lower() == "ascending" else False
         sorted_df = df.sort_values(by=column, ascending=ascending)

         # Save the sorted data to a new CSV
-        sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}.csv")
         sorted_df.to_csv(sorted_csv, index=False)
         logging.info(f"Data sorted on column '{column}' in {order} order.")
         return f"Data sorted and saved to {sorted_csv}."
@@ -549,12 +739,40 @@ def export_csv(filename: str) -> str:
     Exports the latest scraped data to a specified CSV filename.
     """
    try:
-        latest_csv = get_latest_csv()
-        if not latest_csv:
             return "No CSV files found to export."

-        export_path = os.path.join(os.path.dirname(latest_csv), filename)
         df = pd.read_csv(latest_csv)
         df.to_csv(export_path, index=False)
         logging.info(f"Data exported to {export_path}.")
         return f"Data exported to {export_path}."
@@ -564,50 +782,68 @@ def export_csv(filename: str) -> str:

 def log_action(action: str) -> str:
     """
-    Logs a custom action message to the MySQL database.
     """
     try:
         connection = get_db_connection()
-        if not connection:
-            return "Failed to connect to the database."
-
-        cursor = connection.cursor()
-        insert_query = """
-        INSERT INTO action_logs (action)
-        VALUES (%s)
-        """
-        cursor.execute(insert_query, (action,))
-        connection.commit()
-        cursor.close()
-        connection.close()

-        HISTORY.append(f"User Action Logged: {action}")
-        logging.info(f"Action logged: {action}")
-        return f"Action logged: {action}"
     except Exception as e:
         logging.error(f"Error logging action: {e}")
         return f"Error logging action: {e}"

 def get_latest_csv() -> str:
     """
     Retrieves the latest CSV file from the storage directory based on modification time.
     """
     try:
-        storage_dirs = [d for d in os.listdir(DEFAULT_FILE_PATH) if os.path.isdir(os.path.join(DEFAULT_FILE_PATH, d))]
-        if not storage_dirs:
             return None

-        latest_csv = None
-        latest_time = 0
-        for dir_name in storage_dirs:
-            dir_path = os.path.join(DEFAULT_FILE_PATH, dir_name)
-            csv_files = [f for f in os.listdir(dir_path) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
-            for csv_file in csv_files:
-                csv_path = os.path.join(dir_path, csv_file)
-                mod_time = os.path.getmtime(csv_path)
-                if mod_time > latest_time:
-                    latest_time = mod_time
-                    latest_csv = csv_path
         return latest_csv
     except Exception as e:
         logging.error(f"Error retrieving latest CSV: {e}")
@@ -658,30 +894,6 @@ def respond(
         logging.error(f"Error generating response: {e}")
         return "Error generating response."

-# Function to load the Mistral model
-def load_model():
-    """
-    Loads the Mistral model and tokenizer once and returns the pipeline.
-    """
-    model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-        pipe = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            device=0 if torch.cuda.is_available() else -1,
-        )
-        logging.info("Mistral model loaded successfully.")
-        return pipe
-    except Exception as e:
-        logging.error(f"Error loading Mistral model: {e}")
-        return None
-
-# Load the model once at the start
-chat_pipeline = load_model()
-
 # Define the Gradio interface
 def create_interface() -> gr.Blocks:
     """
 
 def get_db_connection():
     """
     Establishes and returns a MySQL database connection using environment variables.
+    Returns None if connection fails.
     """
     try:
         connection = mysql.connector.connect(
 
         return connection
     except mysql.connector.Error as err:
         if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
+            logging.warning("Invalid database credentials. Falling back to CSV storage.")
         elif err.errno == errorcode.ER_BAD_DB_ERROR:
+            logging.warning("Database does not exist. Falling back to CSV storage.")
         else:
+            logging.warning(f"MySQL connection error: {err}. Falling back to CSV storage.")
         return None

 # Initialize Database
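The connect() arguments themselves are cut off in this hunk. For context, a minimal sketch of an environment-variable-driven connection of this shape; the variable names below are assumptions, not the committed ones:

    import os
    import logging
    import mysql.connector

    def get_db_connection_sketch():
        # Hypothetical sketch: the committed connect() arguments are not visible in this diff.
        try:
            return mysql.connector.connect(
                host=os.environ.get("DB_HOST", "localhost"),    # assumed env var name
                user=os.environ.get("DB_USER", "root"),         # assumed env var name
                password=os.environ.get("DB_PASSWORD", ""),     # assumed env var name
                database=os.environ.get("DB_NAME", "scraper"),  # assumed env var name
            )
        except mysql.connector.Error as err:
            logging.warning(f"MySQL connection error: {err}. Falling back to CSV storage.")
            return None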
 
     """
     connection = get_db_connection()
     if connection is None:
+        logging.info("Database initialization skipped. Using CSV storage.")
         return

     cursor = connection.cursor()
 
     selector: str = None,
 ):
     """
+    Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
     """
     global HISTORY, STOP_THREADS
     previous_hashes = {url: "" for url in urls}
 
                 date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                 HISTORY.append(f"Change detected at {url} on {date_time_str}")

+                # Attempt to log to database
                 connection = get_db_connection()
                 if connection:
+                    try:
+                        cursor = connection.cursor()
+                        insert_query = """
+                        INSERT INTO scraped_data (url, content_hash, change_detected)
+                        VALUES (%s, %s, %s)
+                        """
+                        cursor.execute(insert_query, (url, current_hash, date_time_str))
+                        connection.commit()
+                        logging.info(f"Change detected at {url} on {date_time_str} and logged to database.")
+                    except mysql.connector.Error as err:
+                        logging.error(f"Error inserting data into database: {err}")
+                        # Fallback to CSV
+                        log_to_csv(storage_location, url, current_hash, date_time_str)
+                    finally:
+                        cursor.close()
+                        connection.close()
                 else:
+                    # Fallback to CSV
+                    log_to_csv(storage_location, url, current_hash, date_time_str)

         except (
             NoSuchElementException,
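The same DB-first, CSV-fallback write appears twice in this commit (here and in start_scraping). A condensed sketch of the pattern, assuming the helpers defined in this file; it initializes cursor before the try so the finally block cannot hit an unbound name if connection.cursor() itself raises (the committed version closes inside finally directly):

    # Sketch only; assumes app.py's get_db_connection, log_to_csv, logging, mysql.connector.
    def record_change(storage_location: str, url: str, content_hash: str, when: str) -> None:
        connection = get_db_connection()
        if connection is None:
            log_to_csv(storage_location, url, content_hash, when)
            return
        cursor = None
        try:
            cursor = connection.cursor()
            cursor.execute(
                "INSERT INTO scraped_data (url, content_hash, change_detected) VALUES (%s, %s, %s)",
                (url, content_hash, when),
            )
            connection.commit()
        except mysql.connector.Error as err:
            logging.error(f"DB insert failed, falling back to CSV: {err}")
            log_to_csv(storage_location, url, content_hash, when)
        finally:
            if cursor is not None:
                cursor.close()
            connection.close()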
 
         driver.quit()
         logging.info("ChromeDriver session ended.")

+def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
+    """
+    Logs the change to a CSV file in the storage_location.
+    """
+    try:
+        os.makedirs(storage_location, exist_ok=True)
+        csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
+        file_exists = os.path.isfile(csv_file_path)
+
+        with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
+            fieldnames = ["date", "time", "url", "content_hash", "change"]
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            if not file_exists:
+                writer.writeheader()
+            writer.writerow(
+                {
+                    "date": change_detected.split()[0],
+                    "time": change_detected.split()[1],
+                    "url": url,
+                    "content_hash": content_hash,
+                    "change": "Content changed",
+                }
+            )
+        logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
+    except Exception as e:
+        logging.error(f"Error logging data to CSV: {e}")
+
 # Function to create WebDriver
 def create_driver(options: Options) -> webdriver.Chrome:
     """
 
             if initial_hash:
                 HISTORY.append(f"Initial observation at {url}: {initial_hash}")

+                # Attempt to log to database
                 connection = get_db_connection()
                 if connection:
+                    try:
+                        cursor = connection.cursor()
+                        insert_query = """
+                        INSERT INTO scraped_data (url, content_hash, change_detected)
+                        VALUES (%s, %s, %s)
+                        """
+                        cursor.execute(insert_query, (url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
+                        connection.commit()
+                        logging.info(f"Initial observation logged for {url} in database.")
+                    except mysql.connector.Error as err:
+                        logging.error(f"Error inserting initial observation into database: {err}")
+                        # Fallback to CSV
+                        log_to_csv(storage_location, url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
+                    finally:
+                        cursor.close()
+                        connection.close()
                 else:
+                    # Fallback to CSV
+                    log_to_csv(storage_location, url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

         except Exception as e:
             HISTORY.append(f"Error accessing {url}: {e}")
 
     logging.info("Scraping stop signal sent.")
     return "Scraping has been stopped."

+# Function to display CSV content from MySQL or CSV
 def display_csv(storage_location: str, url: str) -> str:
     """
+    Fetches and returns the scraped data for a given URL from the MySQL database or CSV.
     """
     try:
         connection = get_db_connection()
+        if connection:
+            try:
+                cursor = connection.cursor(dictionary=True)
+                query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
+                cursor.execute(query, (url,))
+                results = cursor.fetchall()
+
+                if not results:
+                    return "No data available for the selected URL."
+
+                df = pd.DataFrame(results)
+                cursor.close()
+                connection.close()
+                return df.to_string(index=False)
+            except mysql.connector.Error as err:
+                logging.error(f"Error fetching data from database: {err}")
+                # Fallback to CSV
+        else:
+            logging.info("No database connection. Fetching data from CSV.")

+        # Fallback to CSV
+        hostname = urlparse(url).hostname
+        csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
+        if os.path.exists(csv_path):
+            df = pd.read_csv(csv_path)
+            return df.to_string(index=False)
+        else:
+            return "No data available."

     except Exception as e:
         logging.error(f"Error fetching data for {url}: {e}")
         return f"Error fetching data for {url}: {e}"

+# Function to generate RSS feed from MySQL or CSV data
 def generate_rss_feed(storage_location: str, url: str) -> str:
     """
+    Generates an RSS feed for the latest changes detected on a given URL from the MySQL database or CSV.
     """
     try:
         connection = get_db_connection()
+        rss_feed = ""
+
+        if connection:
+            try:
+                cursor = connection.cursor(dictionary=True)
+                query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC LIMIT 10"
+                cursor.execute(query, (url,))
+                results = cursor.fetchall()
+
+                if not results:
+                    return "No changes detected to include in RSS feed."
+
+                # Create the root RSS element
+                rss = ET.Element("rss", version="2.0")
+                channel = ET.SubElement(rss, "channel")
+
+                # Add channel elements
+                title = ET.SubElement(channel, "title")
+                title.text = f"RSS Feed for {urlparse(url).hostname}"
+
+                link = ET.SubElement(channel, "link")
+                link.text = url
+
+                description = ET.SubElement(channel, "description")
+                description.text = "Recent changes detected on the website."
+
+                # Add items to the feed
+                for row in results:
+                    item = ET.SubElement(channel, "item")
+
+                    item_title = ET.SubElement(item, "title")
+                    item_title.text = f"Change detected at {row['url']}"
+
+                    item_link = ET.SubElement(item, "link")
+                    item_link.text = row["url"]
+
+                    item_description = ET.SubElement(item, "description")
+                    item_description.text = f"Content changed on {row['change_detected']}"
+
+                    pub_date = ET.SubElement(item, "pubDate")
+                    pub_date.text = datetime.datetime.strptime(
+                        str(row['change_detected']), "%Y-%m-%d %H:%M:%S"
+                    ).strftime("%a, %d %b %Y %H:%M:%S +0000")
+
+                # Generate the XML string
+                rss_feed = ET.tostring(rss, encoding="utf-8", method="xml").decode("utf-8")
+                cursor.close()
+                connection.close()
+                return rss_feed
+            except mysql.connector.Error as err:
+                logging.error(f"Error fetching data from database: {err}")
+                # Fallback to CSV
+        else:
+            logging.info("No database connection. Generating RSS feed from CSV.")

+        # Fallback to CSV
+        hostname = urlparse(url).hostname
+        csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
+        if os.path.exists(csv_path):
+            df = pd.read_csv(csv_path).tail(10)
+            if df.empty:
+                return "No changes detected to include in RSS feed."
+
+            # Create the root RSS element
+            rss = ET.Element("rss", version="2.0")
+            channel = ET.SubElement(rss, "channel")
+
+            # Add channel elements
+            title = ET.SubElement(channel, "title")
+            title.text = f"RSS Feed for {hostname}"
+
+            link = ET.SubElement(channel, "link")
+            link.text = url
+
+            description = ET.SubElement(channel, "description")
+            description.text = "Recent changes detected on the website."
+
+            # Add items to the feed
+            for _, row in df.iterrows():
+                item = ET.SubElement(channel, "item")
+
+                item_title = ET.SubElement(item, "title")
+                item_title.text = f"Change detected at {row['url']}"
+
+                item_link = ET.SubElement(item, "link")
+                item_link.text = row["url"]
+
+                item_description = ET.SubElement(item, "description")
+                item_description.text = f"Content changed on {row['date']} at {row['time']}"
+
+                pub_date = ET.SubElement(item, "pubDate")
+                pub_date.text = datetime.datetime.strptime(
+                    f"{row['date']} {row['time']}", "%Y-%m-%d %H:%M:%S"
+                ).strftime("%a, %d %b %Y %H:%M:%S +0000")
+
+            # Generate the XML string
+            rss_feed = ET.tostring(rss, encoding="utf-8", method="xml").decode("utf-8")
+            return rss_feed
+        else:
+            return "No data available."

     except Exception as e:
         logging.error(f"Error generating RSS feed for {url}: {e}")
         return f"Error generating RSS feed for {url}: {e}"

 # Function to load the Mistral model
 def load_model():
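The pubDate conversion both branches rely on can be checked standalone (RFC 822 style with a fixed +0000 offset):

    import datetime

    stamp = "2024-01-01 12:00:00"
    pub_date = datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S").strftime(
        "%a, %d %b %Y %H:%M:%S +0000"
    )
    print(pub_date)  # Mon, 01 Jan 2024 12:00:00 +0000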
 
     Saves the filtered data to a new CSV file.
     """
     try:
+        storage_location = DEFAULT_FILE_PATH
+        url = ""  # Placeholder since filtering isn't URL-specific here
+
+        connection = get_db_connection()
+        if connection:
+            try:
+                cursor = connection.cursor(dictionary=True)
+                # Fetch all data
+                query = "SELECT * FROM scraped_data"
+                cursor.execute(query)
+                results = cursor.fetchall()
+
+                if not results:
+                    return "No data available to filter."
+
+                df = pd.DataFrame(results)
+                # Create a regex pattern to match any of the words
+                pattern = '|'.join(words)
+                if column not in df.columns:
+                    return f"Column '{column}' does not exist in the data."
+
+                filtered_df = df[df[column].astype(str).str.contains(pattern, case=False, na=False)]
+
+                if filtered_df.empty:
+                    return f"No records found with words {words} in column '{column}'."
+
+                # Save the filtered data to a new CSV
+                filtered_csv = os.path.join(storage_location, f"filtered_data_{int(time.time())}.csv")
+                filtered_df.to_csv(filtered_csv, index=False)
+                logging.info(f"Data filtered on column '{column}' for words {words}.")
+                return f"Data filtered and saved to {filtered_csv}."
+            except mysql.connector.Error as err:
+                logging.error(f"Error fetching data from database: {err}")
+                # Fallback to CSV
+        else:
+            logging.info("No database connection. Filtering data from CSV.")
+
+        # Fallback to CSV
+        csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
+        if not csv_files:
             return "No CSV files found to filter."

+        # Assume the latest CSV is the target
+        latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
         df = pd.read_csv(latest_csv)
+
+        if column not in df.columns:
+            return f"Column '{column}' does not exist in the data."
+
+        filtered_df = df[df[column].astype(str).str.contains('|'.join(words), case=False, na=False)]

         if filtered_df.empty:
             return f"No records found with words {words} in column '{column}'."

         # Save the filtered data to a new CSV
+        filtered_csv = latest_csv.replace(".csv", f"_filtered_{int(time.time())}.csv")
         filtered_df.to_csv(filtered_csv, index=False)
         logging.info(f"Data filtered on column '{column}' for words {words}.")
         return f"Data filtered and saved to {filtered_csv}."
 
     Saves the sorted data to a new CSV file.
     """
     try:
+        storage_location = DEFAULT_FILE_PATH
+        url = ""  # Placeholder since sorting isn't URL-specific here
+
+        connection = get_db_connection()
+        if connection:
+            try:
+                cursor = connection.cursor(dictionary=True)
+                # Fetch all data
+                query = "SELECT * FROM scraped_data"
+                cursor.execute(query)
+                results = cursor.fetchall()
+
+                if not results:
+                    return "No data available to sort."
+
+                df = pd.DataFrame(results)
+                if column not in df.columns:
+                    return f"Column '{column}' does not exist in the data."
+
+                ascending = True if order.lower() == "ascending" else False
+                sorted_df = df.sort_values(by=column, ascending=ascending)
+
+                # Save the sorted data to a new CSV
+                sorted_csv = os.path.join(storage_location, f"sorted_data_{column}_{order.lower()}_{int(time.time())}.csv")
+                sorted_df.to_csv(sorted_csv, index=False)
+                logging.info(f"Data sorted on column '{column}' in {order} order.")
+                return f"Data sorted and saved to {sorted_csv}."
+            except mysql.connector.Error as err:
+                logging.error(f"Error fetching data from database: {err}")
+                # Fallback to CSV
+        else:
+            logging.info("No database connection. Sorting data from CSV.")
+
+        # Fallback to CSV
+        csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
+        if not csv_files:
             return "No CSV files found to sort."

+        # Assume the latest CSV is the target
+        latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
         df = pd.read_csv(latest_csv)
+
+        if column not in df.columns:
+            return f"Column '{column}' does not exist in the data."
+
         ascending = True if order.lower() == "ascending" else False
         sorted_df = df.sort_values(by=column, ascending=ascending)

         # Save the sorted data to a new CSV
+        sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}_{int(time.time())}.csv")
         sorted_df.to_csv(sorted_csv, index=False)
         logging.info(f"Data sorted on column '{column}' in {order} order.")
         return f"Data sorted and saved to {sorted_csv}."
 
     Exports the latest scraped data to a specified CSV filename.
     """
     try:
+        storage_location = DEFAULT_FILE_PATH
+
+        connection = get_db_connection()
+        if connection:
+            try:
+                cursor = connection.cursor(dictionary=True)
+                # Fetch all data
+                query = "SELECT * FROM scraped_data"
+                cursor.execute(query)
+                results = cursor.fetchall()
+
+                if not results:
+                    return "No data available to export."
+
+                df = pd.DataFrame(results)
+                export_path = os.path.join(storage_location, filename)
+                df.to_csv(export_path, index=False)
+                logging.info(f"Data exported to {export_path}.")
+                return f"Data exported to {export_path}."
+            except mysql.connector.Error as err:
+                logging.error(f"Error exporting data from database: {err}")
+                # Fallback to CSV
+        else:
+            logging.info("No database connection. Exporting data from CSV.")
+
+        # Fallback to CSV
+        csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
+        if not csv_files:
             return "No CSV files found to export."

+        # Assume the latest CSV is the target
+        latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
         df = pd.read_csv(latest_csv)
+        export_path = os.path.join(storage_location, filename)
         df.to_csv(export_path, index=False)
         logging.info(f"Data exported to {export_path}.")
         return f"Data exported to {export_path}."
 
 def log_action(action: str) -> str:
     """
+    Logs a custom action message to the MySQL database or CSV.
     """
     try:
         connection = get_db_connection()
+        if connection:
+            try:
+                cursor = connection.cursor()
+                insert_query = """
+                INSERT INTO action_logs (action)
+                VALUES (%s)
+                """
+                cursor.execute(insert_query, (action,))
+                connection.commit()
+                logging.info(f"Action logged in database: {action}")
+                cursor.close()
+                connection.close()
+                return f"Action logged: {action}"
+            except mysql.connector.Error as err:
+                logging.error(f"Error logging action to database: {err}")
+                # Fallback to CSV
+        else:
+            logging.info("No database connection. Logging action to CSV.")

+        # Fallback to CSV
+        storage_location = DEFAULT_FILE_PATH
+        try:
+            os.makedirs(storage_location, exist_ok=True)
+            csv_file_path = os.path.join(storage_location, "action_logs.csv")
+            file_exists = os.path.isfile(csv_file_path)
+
+            with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
+                fieldnames = ["timestamp", "action"]
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                if not file_exists:
+                    writer.writeheader()
+                writer.writerow(
+                    {
+                        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                        "action": action,
+                    }
+                )
+            logging.info(f"Action logged to CSV: {action}")
+            return f"Action logged: {action}"
+        except Exception as e:
+            logging.error(f"Error logging action to CSV: {e}")
+            return f"Error logging action: {e}"
     except Exception as e:
         logging.error(f"Error logging action: {e}")
         return f"Error logging action: {e}"

+# Function to get the latest CSV file based on modification time
 def get_latest_csv() -> str:
     """
     Retrieves the latest CSV file from the storage directory based on modification time.
     """
     try:
+        storage_location = DEFAULT_FILE_PATH
+        csv_files = [f for f in os.listdir(storage_location) if f.endswith(".csv")]
+        if not csv_files:
             return None

+        latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
         return latest_csv
     except Exception as e:
         logging.error(f"Error retrieving latest CSV: {e}")
 
         logging.error(f"Error generating response: {e}")
         return "Error generating response."

 # Define the Gradio interface
 def create_interface() -> gr.Blocks:
     """