acecalisto3 committed on
Commit d45cc49
1 Parent(s): deaafee

Update app.py

Files changed (1)
  1. app.py +365 -199
app.py CHANGED
@@ -26,8 +26,10 @@ import gradio as gr
26
  import xml.etree.ElementTree as ET
27
  import torch
28
  import mysql.connector
29
- from mysql.connector import errorcode
30
  from dotenv import load_dotenv
 
 
31
 
32
  # Load environment variables from .env file
33
  load_dotenv()
@@ -37,6 +39,9 @@ logging.basicConfig(
37
  level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
38
  )
39
40
  # Define constants
41
  DEFAULT_FILE_PATH = "scraped_data"
42
  PURPOSE = (
@@ -49,35 +54,46 @@ HISTORY = []
49
  CURRENT_TASK = None
50
  STOP_THREADS = False # Flag to stop scraping threads
51
 
52
- # MySQL Database Connection
53
  def get_db_connection():
54
  """
55
- Establishes and returns a MySQL database connection using environment variables.
56
- Returns None if connection fails.
57
  """
58
- try:
59
- connection = mysql.connector.connect(
60
- host=os.getenv("DB_HOST"),
61
- user=os.getenv("DB_USER"),
62
- password=os.getenv("DB_PASSWORD"),
63
- database=os.getenv("DB_NAME")
64
- )
65
- if connection.is_connected():
66
- logging.info("Connected to MySQL database.")
67
- return connection
68
- except mysql.connector.Error as err:
69
- if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
70
- logging.warning("Invalid database credentials. Falling back to CSV storage.")
71
- elif err.errno == errorcode.ER_BAD_DB_ERROR:
72
- logging.warning("Database does not exist. Falling back to CSV storage.")
73
- else:
74
- logging.warning(f"MySQL connection error: {err}. Falling back to CSV storage.")
75
  return None
76
 
77
- # Initialize Database
78
  def initialize_database():
79
  """
80
- Initializes the database by creating necessary tables if they do not exist.
81
  """
82
  connection = get_db_connection()
83
  if connection is None:
@@ -98,6 +114,13 @@ def initialize_database():
98
  cursor.execute(create_scraped_data_table)
99
  logging.info("Table 'scraped_data' is ready.")
100
101
  # Create table for action logs
102
  create_action_logs_table = """
103
  CREATE TABLE IF NOT EXISTS action_logs (
@@ -110,12 +133,92 @@ def initialize_database():
110
  logging.info("Table 'action_logs' is ready.")
111
 
112
  except mysql.connector.Error as err:
113
- logging.error(f"Error creating tables: {err}")
114
  finally:
115
  cursor.close()
116
  connection.close()
117
  logging.info("Database initialization complete.")
118
 
119
  # Function to monitor URLs for changes
120
  def monitor_urls(
121
  storage_location: str,
@@ -123,6 +226,7 @@ def monitor_urls(
123
  scrape_interval: int,
124
  content_type: str,
125
  selector: str = None,
 
126
  ):
127
  """
128
  Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
@@ -143,6 +247,8 @@ def monitor_urls(
143
  try:
144
  while not STOP_THREADS:
145
  for url in urls:
 
 
146
  try:
147
  driver.get(url)
148
  WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
@@ -195,6 +301,9 @@ def monitor_urls(
195
  # Fallback to CSV
196
  log_to_csv(storage_location, url, current_hash, date_time_str)
197
198
  except (
199
  NoSuchElementException,
200
  StaleElementReferenceException,
@@ -202,90 +311,13 @@ def monitor_urls(
202
  Exception,
203
  ) as e:
204
  logging.error(f"Error accessing {url}: {e}")
 
 
205
  time.sleep(scrape_interval * 60) # Wait for the next scrape interval
206
  finally:
207
  driver.quit()
208
  logging.info("ChromeDriver session ended.")
209
 
210
- def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
211
- """
212
- Logs the change to a CSV file in the storage_location.
213
- """
214
- try:
215
- os.makedirs(storage_location, exist_ok=True)
216
- csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
217
- file_exists = os.path.isfile(csv_file_path)
218
-
219
- with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
220
- fieldnames = ["date", "time", "url", "content_hash", "change"]
221
- writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
222
- if not file_exists:
223
- writer.writeheader()
224
- writer.writerow(
225
- {
226
- "date": change_detected.split()[0],
227
- "time": change_detected.split()[1],
228
- "url": url,
229
- "content_hash": content_hash,
230
- "change": "Content changed",
231
- }
232
- )
233
- logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
234
- except Exception as e:
235
- logging.error(f"Error logging data to CSV: {e}")
236
-
237
- # Function to create WebDriver
238
- def create_driver(options: Options) -> webdriver.Chrome:
239
- """
240
- Initializes and returns a Selenium Chrome WebDriver instance.
241
- """
242
- try:
243
- driver = webdriver.Chrome(
244
- service=Service(ChromeDriverManager().install()), options=options
245
- )
246
- logging.info("ChromeDriver initialized successfully.")
247
- return driver
248
- except Exception as exception:
249
- logging.error(f"Error initializing ChromeDriver: {exception}")
250
- return None
251
-
252
- # Function to get initial observation
253
- def get_initial_observation(
254
- driver: webdriver.Chrome, url: str, content_type: str, selector: str = None
255
- ) -> str:
256
- """
257
- Retrieves the initial content from the URL and returns its MD5 hash.
258
- """
259
- try:
260
- driver.get(url)
261
- WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
262
- time.sleep(2) # Additional wait for dynamic content
263
-
264
- if content_type == "text":
265
- initial_content = driver.page_source
266
- elif content_type == "media":
267
- if selector:
268
- try:
269
- elements = WebDriverWait(driver, 5).until(
270
- EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
271
- )
272
- initial_content = [element.get_attribute("src") for element in elements]
273
- except TimeoutException:
274
- logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
275
- initial_content = []
276
- else:
277
- elements = driver.find_elements(By.TAG_NAME, "img")
278
- initial_content = [element.get_attribute("src") for element in elements]
279
- else:
280
- initial_content = driver.page_source
281
-
282
- initial_hash = hashlib.md5(str(initial_content).encode("utf-8")).hexdigest()
283
- logging.info(f"Initial hash for {url}: {initial_hash}")
284
- return initial_hash
285
- except Exception as exception:
286
- logging.error(f"Error accessing {url}: {exception}")
287
- return None
288
-
289
  # Function to start scraping
290
  def start_scraping(
291
  storage_location: str,
@@ -293,9 +325,10 @@ def start_scraping(
293
  scrape_interval: int,
294
  content_type: str,
295
  selector: str = None,
 
296
  ) -> str:
297
  """
298
- Starts the scraping process in a separate thread.
299
  """
300
  global CURRENT_TASK, HISTORY, STOP_THREADS
301
 
@@ -310,60 +343,61 @@ def start_scraping(
310
  # Initialize database tables
311
  initialize_database()
312
 
313
- for url in url_list:
314
- # Create a folder for the URL (if still needed for CSVs)
315
- hostname = urlparse(url).hostname
316
- folder_path = os.path.join(storage_location, hostname)
317
- os.makedirs(folder_path, exist_ok=True)
 
318
 
319
- # Log the initial observation
320
- try:
321
- options = Options()
322
- options.add_argument("--headless")
323
- options.add_argument("--no-sandbox")
324
- options.add_argument("--disable-dev-shm-usage")
325
-
326
- driver = create_driver(options)
327
- if driver is None:
328
- continue
329
-
330
- initial_hash = get_initial_observation(driver, url, content_type, selector)
331
- if initial_hash:
332
- HISTORY.append(f"Initial observation at {url}: {initial_hash}")
333
-
334
- # Attempt to log to database
335
- connection = get_db_connection()
336
- if connection:
337
- try:
338
- cursor = connection.cursor()
339
- insert_query = """
340
- INSERT INTO scraped_data (url, content_hash, change_detected)
341
- VALUES (%s, %s, %s)
342
- """
343
- cursor.execute(insert_query, (url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
344
- connection.commit()
345
- logging.info(f"Initial observation logged for {url} in database.")
346
- except mysql.connector.Error as err:
347
- logging.error(f"Error inserting initial observation into database: {err}")
348
  # Fallback to CSV
349
- log_to_csv(storage_location, url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
350
- finally:
351
- cursor.close()
352
- connection.close()
353
- else:
354
- # Fallback to CSV
355
- log_to_csv(storage_location, url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
356
 
357
- except Exception as e:
358
- HISTORY.append(f"Error accessing {url}: {e}")
359
- logging.error(f"Error accessing {url}: {e}")
360
- finally:
361
- driver.quit()
362
 
363
- # Start the monitoring thread
364
  monitor_thread = threading.Thread(
365
  target=monitor_urls,
366
- args=(storage_location, url_list, scrape_interval, content_type, selector),
367
  daemon=True,
368
  )
369
  monitor_thread.start()
@@ -533,51 +567,52 @@ def generate_rss_feed(storage_location: str, url: str) -> str:
533
  logging.error(f"Error generating RSS feed for {url}: {e}")
534
  return f"Error generating RSS feed for {url}: {e}"
535
 
536
- # Function to load the Mistral model
537
- def load_model():
538
- """
539
- Loads the Mistral model and tokenizer once and returns the pipeline.
540
- """
541
- model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
542
- try:
543
- tokenizer = AutoTokenizer.from_pretrained(model_name)
544
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
545
- pipe = pipeline(
546
- "text-generation",
547
- model=model,
548
- tokenizer=tokenizer,
549
- device=0 if torch.cuda.is_available() else -1,
550
- )
551
- logging.info("Mistral model loaded successfully.")
552
- return pipe
553
- except Exception as e:
554
- logging.error(f"Error loading Mistral model: {e}")
555
- return None
556
-
557
- # Load the model once at the start
558
- chat_pipeline = load_model()
559
-
560
- # Function to parse user commands
561
  def parse_command(message: str) -> tuple:
562
  """
563
- Parses the user message to identify if it contains a command.
564
  Returns the command and its parameters if found, else (None, None).
565
  """
566
  # Define command patterns
567
- patterns = {
568
- "filter": r"filter\s+(?P<words>[\w\s,]+)\s+in\s+column\s+(?P<column>\w+)",
569
- "sort": r"sort\s+(?P<column>\w+)\s+(?P<order>ascending|descending)",
570
- "export": r"export\s+to\s+csv\s+as\s+(?P<filename>\w+\.csv)",
571
- "log": r"log\s+action\s+(?P<action>.+)",
572
- }
573
 
574
- for command, pattern in patterns.items():
575
- match = re.search(pattern, message, re.IGNORECASE)
 
576
  if match:
577
- params = match.groupdict()
578
- return command, params
 
579
 
580
- return None, None
581
 
582
  # Function to execute parsed commands
583
  def execute_command(command: str, params: dict) -> str:
@@ -585,7 +620,7 @@ def execute_command(command: str, params: dict) -> str:
585
  Executes the corresponding function based on the command and parameters.
586
  """
587
  if command == "filter":
588
- words = [word.strip() for word in params["words"].split(",")]
589
  column = params["column"]
590
  return filter_data(column, words)
591
  elif command == "sort":
@@ -609,7 +644,6 @@ def filter_data(column: str, words: list) -> str:
609
  """
610
  try:
611
  storage_location = DEFAULT_FILE_PATH
612
- url = "" # Placeholder since filtering isn't URL-specific here
613
 
614
  connection = get_db_connection()
615
  if connection:
@@ -635,7 +669,8 @@ def filter_data(column: str, words: list) -> str:
635
  return f"No records found with words {words} in column '{column}'."
636
 
637
  # Save the filtered data to a new CSV
638
- filtered_csv = os.path.join(storage_location, f"filtered_data_{int(time.time())}.csv")
 
639
  filtered_df.to_csv(filtered_csv, index=False)
640
  logging.info(f"Data filtered on column '{column}' for words {words}.")
641
  return f"Data filtered and saved to {filtered_csv}."
@@ -663,7 +698,8 @@ def filter_data(column: str, words: list) -> str:
663
  return f"No records found with words {words} in column '{column}'."
664
 
665
  # Save the filtered data to a new CSV
666
- filtered_csv = latest_csv.replace(".csv", f"_filtered_{int(time.time())}.csv")
 
667
  filtered_df.to_csv(filtered_csv, index=False)
668
  logging.info(f"Data filtered on column '{column}' for words {words}.")
669
  return f"Data filtered and saved to {filtered_csv}."
@@ -678,7 +714,6 @@ def sort_data(column: str, order: str) -> str:
678
  """
679
  try:
680
  storage_location = DEFAULT_FILE_PATH
681
- url = "" # Placeholder since sorting isn't URL-specific here
682
 
683
  connection = get_db_connection()
684
  if connection:
@@ -700,7 +735,8 @@ def sort_data(column: str, order: str) -> str:
700
  sorted_df = df.sort_values(by=column, ascending=ascending)
701
 
702
  # Save the sorted data to a new CSV
703
- sorted_csv = os.path.join(storage_location, f"sorted_data_{column}_{order.lower()}_{int(time.time())}.csv")
 
704
  sorted_df.to_csv(sorted_csv, index=False)
705
  logging.info(f"Data sorted on column '{column}' in {order} order.")
706
  return f"Data sorted and saved to {sorted_csv}."
@@ -726,7 +762,8 @@ def sort_data(column: str, order: str) -> str:
726
  sorted_df = df.sort_values(by=column, ascending=ascending)
727
 
728
  # Save the sorted data to a new CSV
729
- sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}_{int(time.time())}.csv")
 
730
  sorted_df.to_csv(sorted_csv, index=False)
731
  logging.info(f"Data sorted on column '{column}' in {order} order.")
732
  return f"Data sorted and saved to {sorted_csv}."
@@ -988,6 +1025,23 @@ def create_interface() -> gr.Blocks:
988
  label="RSS Feed Output", interactive=False, lines=20
989
  )
990
 
991
  # Connect buttons to their respective functions
992
  start_button.click(
993
  fn=start_scraping,
@@ -997,6 +1051,7 @@ def create_interface() -> gr.Blocks:
997
  scrape_interval,
998
  content_type,
999
  selector,
 
1000
  ],
1001
  outputs=status_output,
1002
  )
@@ -1015,6 +1070,12 @@ def create_interface() -> gr.Blocks:
1015
  outputs=rss_output,
1016
  )
1017
 
1018
  # Connect message submission to the chat interface
1019
  def update_chat(message_input, history, system_msg, max_toks, temp, top_p_val):
1020
  if not message_input.strip():
@@ -1046,9 +1107,114 @@ def create_interface() -> gr.Blocks:
1046
 
1047
  return demo
1048
 
1049
- # Initialize database on script start
1050
- initialize_database()
1051
 
 
1052
  if __name__ == "__main__":
1053
  demo = create_interface()
1054
- demo.launch()
 
26
  import xml.etree.ElementTree as ET
27
  import torch
28
  import mysql.connector
29
+ from mysql.connector import errorcode, pooling
30
  from dotenv import load_dotenv
31
+ import spacy
32
+ import unittest
33
 
34
  # Load environment variables from .env file
35
  load_dotenv()
 
39
  level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
40
  )
41
 
42
+ # Initialize spaCy
43
+ nlp = spacy.load("en_core_web_sm")
44
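The new import block wires in spaCy for command parsing. As a side note, spacy.load() raises OSError when the en_core_web_sm package is not installed; a minimal standalone sketch of a download-on-first-run guard (not part of this commit):

# Sketch (not from app.py): load en_core_web_sm, downloading it on first run.
import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

doc = nlp("Filter apples, oranges in column Description")
print([token.text for token in doc])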
+
45
  # Define constants
46
  DEFAULT_FILE_PATH = "scraped_data"
47
  PURPOSE = (
 
54
  CURRENT_TASK = None
55
  STOP_THREADS = False # Flag to stop scraping threads
56
 
57
+ # Database Pooling Configuration
58
+ DB_POOL_NAME = "mypool"
59
+ DB_POOL_SIZE = 5 # Adjust based on expected load
60
+
61
+ try:
62
+ dbconfig = {
63
+ "host": os.getenv("DB_HOST"),
64
+ "user": os.getenv("DB_USER"),
65
+ "password": os.getenv("DB_PASSWORD"),
66
+ "database": os.getenv("DB_NAME"),
67
+ }
68
+ connection_pool = mysql.connector.pooling.MySQLConnectionPool(
69
+ pool_name=DB_POOL_NAME,
70
+ pool_size=DB_POOL_SIZE,
71
+ pool_reset_session=True,
72
+ **dbconfig
73
+ )
74
+ logging.info("Database connection pool created successfully.")
75
+ except mysql.connector.Error as err:
76
+ logging.warning(f"Database connection pool creation failed: {err}")
77
+ connection_pool = None # Will use CSV as fallback
78
+
79
+ # Function to get a database connection from the pool
80
  def get_db_connection():
81
  """
82
+ Retrieves a connection from the pool. Returns None if pool is not available.
 
83
  """
84
+ if connection_pool:
85
+ try:
86
+ connection = connection_pool.get_connection()
87
+ if connection.is_connected():
88
+ return connection
89
+ except mysql.connector.Error as err:
90
+ logging.error(f"Error getting connection from pool: {err}")
91
  return None
92
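For readers unfamiliar with mysql-connector pooling, here is a small self-contained sketch of the pattern the new get_db_connection() relies on. The credentials are placeholders, not the app's .env values:

# Sketch (placeholder credentials): the pooling pattern behind get_db_connection().
import mysql.connector
from mysql.connector import pooling

dbconfig = {
    "host": "localhost",        # placeholder
    "user": "scraper",          # placeholder
    "password": "secret",       # placeholder
    "database": "scraper_db",   # placeholder
}
pool = pooling.MySQLConnectionPool(pool_name="demo_pool", pool_size=3, **dbconfig)

conn = pool.get_connection()    # borrow a connection from the pool
try:
    cur = conn.cursor()
    cur.execute("SELECT 1")
    print(cur.fetchone())
    cur.close()
finally:
    conn.close()                # returns the connection to the pool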
 
93
+ # Initialize Database: Create tables and indexes
94
  def initialize_database():
95
  """
96
+ Initializes the database by creating necessary tables and indexes if they do not exist.
97
  """
98
  connection = get_db_connection()
99
  if connection is None:
 
114
  cursor.execute(create_scraped_data_table)
115
  logging.info("Table 'scraped_data' is ready.")
116
 
117
+ # Create indexes for performance
118
+ create_index_url = "CREATE INDEX IF NOT EXISTS idx_url ON scraped_data(url)"
119
+ create_index_change = "CREATE INDEX IF NOT EXISTS idx_change_detected ON scraped_data(change_detected)"
120
+ cursor.execute(create_index_url)
121
+ cursor.execute(create_index_change)
122
+ logging.info("Indexes on 'url' and 'change_detected' columns created.")
123
+
124
  # Create table for action logs
125
  create_action_logs_table = """
126
  CREATE TABLE IF NOT EXISTS action_logs (
 
133
  logging.info("Table 'action_logs' is ready.")
134
 
135
  except mysql.connector.Error as err:
136
+ logging.error(f"Error initializing database: {err}")
137
  finally:
138
  cursor.close()
139
  connection.close()
140
  logging.info("Database initialization complete.")
141
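One caveat on the index statements above: stock MySQL (unlike MariaDB) does not accept IF NOT EXISTS on CREATE INDEX, so a common workaround is to consult information_schema first. A rough sketch, reusing the table and index names from initialize_database():

# Sketch: create an index only if it is missing (plain MySQL has no
# CREATE INDEX IF NOT EXISTS). Table and index names mirror the ones above.
def ensure_index(cursor, table: str, index: str, column: str) -> None:
    cursor.execute(
        """
        SELECT COUNT(*) FROM information_schema.statistics
        WHERE table_schema = DATABASE() AND table_name = %s AND index_name = %s
        """,
        (table, index),
    )
    (count,) = cursor.fetchone()
    if count == 0:
        cursor.execute(f"CREATE INDEX {index} ON {table}({column})")

# ensure_index(cursor, "scraped_data", "idx_url", "url")
# ensure_index(cursor, "scraped_data", "idx_change_detected", "change_detected")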
 
142
+ # Function to create WebDriver
143
+ def create_driver(options: Options) -> webdriver.Chrome:
144
+ """
145
+ Initializes and returns a Selenium Chrome WebDriver instance.
146
+ """
147
+ try:
148
+ driver = webdriver.Chrome(
149
+ service=Service(ChromeDriverManager().install()), options=options
150
+ )
151
+ logging.info("ChromeDriver initialized successfully.")
152
+ return driver
153
+ except Exception as exception:
154
+ logging.error(f"Error initializing ChromeDriver: {exception}")
155
+ return None
156
+
157
+ # Function to log changes to CSV
158
+ def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
159
+ """
160
+ Logs the change to a CSV file in the storage_location.
161
+ """
162
+ try:
163
+ os.makedirs(storage_location, exist_ok=True)
164
+ csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
165
+ file_exists = os.path.isfile(csv_file_path)
166
+
167
+ with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
168
+ fieldnames = ["date", "time", "url", "content_hash", "change"]
169
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
170
+ if not file_exists:
171
+ writer.writeheader()
172
+ writer.writerow(
173
+ {
174
+ "date": change_detected.split()[0],
175
+ "time": change_detected.split()[1],
176
+ "url": url,
177
+ "content_hash": content_hash,
178
+ "change": "Content changed",
179
+ }
180
+ )
181
+ logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
182
+ except Exception as e:
183
+ logging.error(f"Error logging data to CSV: {e}")
184
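An illustrative call to the log_to_csv() helper above; the hash and timestamp are fabricated sample values:

# Illustrative call to log_to_csv() with fabricated sample values.
import datetime
import hashlib

sample_hash = hashlib.md5(b"<html>...</html>").hexdigest()
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
log_to_csv("scraped_data", "https://example.com", sample_hash, now)
# appends a row to scraped_data/example.com_changes.csv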
+
185
+ # Function to get initial observation
186
+ def get_initial_observation(
187
+ driver: webdriver.Chrome, url: str, content_type: str, selector: str = None
188
+ ) -> str:
189
+ """
190
+ Retrieves the initial content from the URL and returns its MD5 hash.
191
+ """
192
+ try:
193
+ driver.get(url)
194
+ WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
195
+ time.sleep(2) # Additional wait for dynamic content
196
+
197
+ if content_type == "text":
198
+ initial_content = driver.page_source
199
+ elif content_type == "media":
200
+ if selector:
201
+ try:
202
+ elements = WebDriverWait(driver, 5).until(
203
+ EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
204
+ )
205
+ initial_content = [element.get_attribute("src") for element in elements]
206
+ except TimeoutException:
207
+ logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
208
+ initial_content = []
209
+ else:
210
+ elements = driver.find_elements(By.TAG_NAME, "img")
211
+ initial_content = [element.get_attribute("src") for element in elements]
212
+ else:
213
+ initial_content = driver.page_source
214
+
215
+ initial_hash = hashlib.md5(str(initial_content).encode("utf-8")).hexdigest()
216
+ logging.info(f"Initial hash for {url}: {initial_hash}")
217
+ return initial_hash
218
+ except Exception as exception:
219
+ logging.error(f"Error accessing {url}: {exception}")
220
+ return None
221
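The MD5 fingerprinting used by get_initial_observation() does not depend on Selenium; for static pages the same check can be sketched with a plain HTTP fetch (an illustration, not the app's code path):

# Sketch: same MD5 fingerprint idea, but with requests instead of a headless browser.
import hashlib
from typing import Optional

import requests

def page_hash(url: str) -> Optional[str]:
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return hashlib.md5(response.text.encode("utf-8")).hexdigest()
    except requests.RequestException:
        return None

print(page_hash("https://example.com"))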
+
222
  # Function to monitor URLs for changes
223
  def monitor_urls(
224
  storage_location: str,
 
226
  scrape_interval: int,
227
  content_type: str,
228
  selector: str = None,
229
+ progress: gr.Progress = None
230
  ):
231
  """
232
  Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
 
247
  try:
248
  while not STOP_THREADS:
249
  for url in urls:
250
+ if STOP_THREADS:
251
+ break
252
  try:
253
  driver.get(url)
254
  WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
 
301
  # Fallback to CSV
302
  log_to_csv(storage_location, url, current_hash, date_time_str)
303
 
304
+ # Update progress
305
+ if progress:
306
+ progress(1)
307
  except (
308
  NoSuchElementException,
309
  StaleElementReferenceException,
 
311
  Exception,
312
  ) as e:
313
  logging.error(f"Error accessing {url}: {e}")
314
+ if progress:
315
+ progress(1)
316
  time.sleep(scrape_interval * 60) # Wait for the next scrape interval
317
  finally:
318
  driver.quit()
319
  logging.info("ChromeDriver session ended.")
320
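monitor_urls() relies on the global STOP_THREADS flag for cooperative shutdown. The same pattern, sketched with threading.Event, which also lets the sleep wake early when a stop is requested:

# Sketch: the cooperative-stop loop with threading.Event instead of a global flag.
import threading

stop_event = threading.Event()

def poll(urls, interval_seconds: int) -> None:
    while not stop_event.is_set():
        for url in urls:
            if stop_event.is_set():
                break
            print(f"checking {url}")          # real code would fetch and hash here
        stop_event.wait(interval_seconds)     # sleeps, but wakes early when stop is set

worker = threading.Thread(target=poll, args=(["https://example.com"], 60), daemon=True)
worker.start()
# ... later, to shut down:
stop_event.set()
worker.join()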
321
  # Function to start scraping
322
  def start_scraping(
323
  storage_location: str,
 
325
  scrape_interval: int,
326
  content_type: str,
327
  selector: str = None,
328
+ progress: gr.Progress = None
329
  ) -> str:
330
  """
331
+ Starts the scraping process in a separate thread with progress indication.
332
  """
333
  global CURRENT_TASK, HISTORY, STOP_THREADS
334
 
 
343
  # Initialize database tables
344
  initialize_database()
345
 
346
+ # Log initial observations
347
+ def log_initial_observations():
348
+ options = Options()
349
+ options.add_argument("--headless")
350
+ options.add_argument("--no-sandbox")
351
+ options.add_argument("--disable-dev-shm-usage")
352
 
353
+ driver = create_driver(options)
354
+ if driver is None:
355
+ return
356
+
357
+ for url in url_list:
358
+ if STOP_THREADS:
359
+ break
360
+ try:
361
+ initial_hash = get_initial_observation(driver, url, content_type, selector)
362
+ if initial_hash:
363
+ date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
364
+ HISTORY.append(f"Initial observation at {url}: {initial_hash}")
365
+
366
+ # Attempt to log to database
367
+ connection = get_db_connection()
368
+ if connection:
369
+ try:
370
+ cursor = connection.cursor()
371
+ insert_query = """
372
+ INSERT INTO scraped_data (url, content_hash, change_detected)
373
+ VALUES (%s, %s, %s)
374
+ """
375
+ cursor.execute(insert_query, (url, initial_hash, date_time_str))
376
+ connection.commit()
377
+ logging.info(f"Initial observation logged for {url} in database.")
378
+ except mysql.connector.Error as err:
379
+ logging.error(f"Error inserting initial observation into database: {err}")
380
+ # Fallback to CSV
381
+ log_to_csv(storage_location, url, initial_hash, date_time_str)
382
+ finally:
383
+ cursor.close()
384
+ connection.close()
385
+ else:
386
  # Fallback to CSV
387
+ log_to_csv(storage_location, url, initial_hash, date_time_str)
388
+ except Exception as e:
389
+ HISTORY.append(f"Error accessing {url}: {e}")
390
+ logging.error(f"Error accessing {url}: {e}")
391
+ driver.quit()
 
 
392
 
393
+ # Start logging initial observations
394
+ initial_thread = threading.Thread(target=log_initial_observations, daemon=True)
395
+ initial_thread.start()
 
 
396
 
397
+ # Start the monitoring thread with progress
398
  monitor_thread = threading.Thread(
399
  target=monitor_urls,
400
+ args=(storage_location, url_list, scrape_interval, content_type, selector, progress),
401
  daemon=True,
402
  )
403
  monitor_thread.start()
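The insert-with-CSV-fallback step inside log_initial_observations() can be read in isolation as follows; this sketch reuses the get_db_connection() and log_to_csv() helpers defined earlier and is only an illustration:

# Sketch: the insert-with-CSV-fallback step in isolation.
import datetime
import mysql.connector

def record_observation(storage_location: str, url: str, content_hash: str) -> None:
    when = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    connection = get_db_connection()
    if connection is None:
        log_to_csv(storage_location, url, content_hash, when)
        return
    cursor = connection.cursor()
    try:
        cursor.execute(
            "INSERT INTO scraped_data (url, content_hash, change_detected) VALUES (%s, %s, %s)",
            (url, content_hash, when),
        )
        connection.commit()
    except mysql.connector.Error:
        log_to_csv(storage_location, url, content_hash, when)
    finally:
        cursor.close()
        connection.close()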
 
567
  logging.error(f"Error generating RSS feed for {url}: {e}")
568
  return f"Error generating RSS feed for {url}: {e}"
569
 
570
+ # Function to parse user commands using spaCy
571
  def parse_command(message: str) -> tuple:
572
  """
573
+ Parses the user message using spaCy to identify if it contains a command.
574
  Returns the command and its parameters if found, else (None, None).
575
  """
576
+ doc = nlp(message.lower())
577
+ command = None
578
+ params = {}
579
+
580
  # Define command patterns
581
+ if "filter" in message.lower():
582
+ # Example: "Filter apples, oranges in column Description"
583
+ match = re.search(r"filter\s+([\w\s,]+)\s+in\s+column\s+(\w+)", message, re.IGNORECASE)
584
+ if match:
585
+ words = [word.strip() for word in match.group(1).split(",")]
586
+ column = match.group(2)
587
+ command = "filter"
588
+ params = {"words": words, "column": column}
589
+
590
+ elif "sort" in message.lower():
591
+ # Example: "Sort Price ascending"
592
+ match = re.search(r"sort\s+(\w+)\s+(ascending|descending)", message, re.IGNORECASE)
593
+ if match:
594
+ column = match.group(1)
595
+ order = match.group(2)
596
+ command = "sort"
597
+ params = {"column": column, "order": order}
598
+
599
+ elif "export to csv as" in message.lower():
600
+ # Example: "Export to CSV as filtered_data.csv"
601
+ match = re.search(r"export\s+to\s+csv\s+as\s+([\w\-]+\.csv)", message, re.IGNORECASE)
602
+ if match:
603
+ filename = match.group(1)
604
+ command = "export"
605
+ params = {"filename": filename}
606
 
607
+ elif "log action" in message.lower():
608
+ # Example: "Log action Filtered data for specific fruits"
609
+ match = re.search(r"log\s+action\s+(.+)", message, re.IGNORECASE)
610
  if match:
611
+ action = match.group(1)
612
+ command = "log"
613
+ params = {"action": action}
614
 
615
+ return command, params
616
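Illustrative inputs and the tuples the parse_command() defined above would return:

print(parse_command("Filter apples, oranges in column Description"))
# ('filter', {'words': ['apples', 'oranges'], 'column': 'Description'})

print(parse_command("Sort Price ascending"))
# ('sort', {'column': 'Price', 'order': 'ascending'})

print(parse_command("Export to CSV as filtered_data.csv"))
# ('export', {'filename': 'filtered_data.csv'})

print(parse_command("What changed today?"))
# (None, {})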
 
617
  # Function to execute parsed commands
618
  def execute_command(command: str, params: dict) -> str:
 
620
  Executes the corresponding function based on the command and parameters.
621
  """
622
  if command == "filter":
623
+ words = params["words"]
624
  column = params["column"]
625
  return filter_data(column, words)
626
  elif command == "sort":
 
644
  """
645
  try:
646
  storage_location = DEFAULT_FILE_PATH
 
647
 
648
  connection = get_db_connection()
649
  if connection:
 
669
  return f"No records found with words {words} in column '{column}'."
670
 
671
  # Save the filtered data to a new CSV
672
+ timestamp = int(time.time())
673
+ filtered_csv = os.path.join(storage_location, f"filtered_data_{timestamp}.csv")
674
  filtered_df.to_csv(filtered_csv, index=False)
675
  logging.info(f"Data filtered on column '{column}' for words {words}.")
676
  return f"Data filtered and saved to {filtered_csv}."
 
698
  return f"No records found with words {words} in column '{column}'."
699
 
700
  # Save the filtered data to a new CSV
701
+ timestamp = int(time.time())
702
+ filtered_csv = latest_csv.replace(".csv", f"_filtered_{timestamp}.csv")
703
  filtered_df.to_csv(filtered_csv, index=False)
704
  logging.info(f"Data filtered on column '{column}' for words {words}.")
705
  return f"Data filtered and saved to {filtered_csv}."
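The kind of case-insensitive substring filter filter_data() applies can be sketched on a toy DataFrame; this is an illustration, as the exact matching logic sits in unchanged context not shown in this diff:

# Sketch: case-insensitive substring filter on one column.
import pandas as pd

df = pd.DataFrame({"Description": ["Fresh apples", "Ripe bananas", "Juicy oranges"]})
words = ["apples", "oranges"]

pattern = "|".join(words)
filtered_df = df[df["Description"].astype(str).str.contains(pattern, case=False, na=False)]
print(filtered_df)   # rows 0 ("Fresh apples") and 2 ("Juicy oranges") remain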
 
714
  """
715
  try:
716
  storage_location = DEFAULT_FILE_PATH
 
717
 
718
  connection = get_db_connection()
719
  if connection:
 
735
  sorted_df = df.sort_values(by=column, ascending=ascending)
736
 
737
  # Save the sorted data to a new CSV
738
+ timestamp = int(time.time())
739
+ sorted_csv = os.path.join(storage_location, f"sorted_data_{column}_{order.lower()}_{timestamp}.csv")
740
  sorted_df.to_csv(sorted_csv, index=False)
741
  logging.info(f"Data sorted on column '{column}' in {order} order.")
742
  return f"Data sorted and saved to {sorted_csv}."
 
762
  sorted_df = df.sort_values(by=column, ascending=ascending)
763
 
764
  # Save the sorted data to a new CSV
765
+ timestamp = int(time.time())
766
+ sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}_{timestamp}.csv")
767
  sorted_df.to_csv(sorted_csv, index=False)
768
  logging.info(f"Data sorted on column '{column}' in {order} order.")
769
  return f"Data sorted and saved to {sorted_csv}."
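Likewise, the sort-and-save step reduces to sort_values() plus the timestamped file name pattern introduced above; a small sketch:

# Sketch: sort a column and save with a timestamped file name.
import time
import pandas as pd

df = pd.DataFrame({"Price": [3.5, 1.2, 2.8]})
order = "ascending"

sorted_df = df.sort_values(by="Price", ascending=(order == "ascending"))
sorted_csv = f"sorted_data_Price_{order}_{int(time.time())}.csv"
sorted_df.to_csv(sorted_csv, index=False)
print(sorted_csv)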
 
1025
  label="RSS Feed Output", interactive=False, lines=20
1026
  )
1027
 
1028
+ # Historical Data View
1029
+ with gr.Row():
1030
+ historical_view_url = gr.Textbox(
1031
+ label="Select URL for Historical Data",
1032
+ placeholder="https://example.com",
1033
+ )
1034
+ historical_button = gr.Button("View Historical Data")
1035
+ historical_output = gr.Dataframe(
1036
+ headers=["ID", "URL", "Content Hash", "Change Detected"],
1037
+ label="Historical Data",
1038
+ interactive=False
1039
+ )
1040
+
1041
+ # Progress Indicator
1042
+ with gr.Row():
1043
+ progress = gr.Progress(label="Scraping Progress")
1044
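A hedged aside on gr.Progress: in recent Gradio releases the documented pattern injects gr.Progress() as a default argument of the event handler and reports progress from inside it, rather than instantiating it like a layout component. A minimal sketch of that pattern, separate from the wiring in this commit:

# Sketch: Gradio progress reporting via a handler default argument.
import time
import gradio as gr

def slow_task(n_steps, progress=gr.Progress()):
    for _ in progress.tqdm(range(int(n_steps)), desc="Scraping"):
        time.sleep(0.1)
    return f"Finished {int(n_steps)} steps"

with gr.Blocks() as demo:
    steps = gr.Slider(1, 50, value=10, step=1, label="Steps")
    run = gr.Button("Run")
    out = gr.Textbox(label="Status")
    run.click(fn=slow_task, inputs=steps, outputs=out)

# demo.launch()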
+
1045
  # Connect buttons to their respective functions
1046
  start_button.click(
1047
  fn=start_scraping,
 
1051
  scrape_interval,
1052
  content_type,
1053
  selector,
1054
+ progress,
1055
  ],
1056
  outputs=status_output,
1057
  )
 
1070
  outputs=rss_output,
1071
  )
1072
 
1073
+ historical_button.click(
1074
+ fn=display_historical_data,
1075
+ inputs=[storage_location, historical_view_url],
1076
+ outputs=historical_output,
1077
+ )
1078
+
1079
  # Connect message submission to the chat interface
1080
  def update_chat(message_input, history, system_msg, max_toks, temp, top_p_val):
1081
  if not message_input.strip():
 
1107
 
1108
  return demo
1109
 
1110
+ # Function to display historical data
1111
+ def display_historical_data(storage_location: str, url: str):
1112
+ """
1113
+ Retrieves and displays historical scraping data for a given URL.
1114
+ """
1115
+ try:
1116
+ connection = get_db_connection()
1117
+ if connection:
1118
+ try:
1119
+ cursor = connection.cursor(dictionary=True)
1120
+ query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
1121
+ cursor.execute(query, (url,))
1122
+ results = cursor.fetchall()
1123
+
1124
+ if not results:
1125
+ return pd.DataFrame()
1126
+
1127
+ df = pd.DataFrame(results)
1128
+ cursor.close()
1129
+ connection.close()
1130
+ return df
1131
+ except mysql.connector.Error as err:
1132
+ logging.error(f"Error fetching historical data from database: {err}")
1133
+ # Fallback to CSV
1134
+ else:
1135
+ logging.info("No database connection. Fetching historical data from CSV.")
1136
+
1137
+ # Fallback to CSV
1138
+ hostname = urlparse(url).hostname
1139
+ csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
1140
+ if os.path.exists(csv_path):
1141
+ df = pd.read_csv(csv_path)
1142
+ return df
1143
+ else:
1144
+ return pd.DataFrame()
1145
+ except Exception as e:
1146
+ logging.error(f"Error fetching historical data for {url}: {e}")
1147
+ return pd.DataFrame()
1148
+
1149
+ # Function to load the Mistral model
1150
+ def load_model():
1151
+ """
1152
+ Loads the Mistral model and tokenizer once and returns the pipeline.
1153
+ """
1154
+ model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
1155
+ try:
1156
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
1157
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
1158
+ pipe = pipeline(
1159
+ "text-generation",
1160
+ model=model,
1161
+ tokenizer=tokenizer,
1162
+ device=0 if torch.cuda.is_available() else -1,
1163
+ )
1164
+ logging.info("Mistral model loaded successfully.")
1165
+ return pipe
1166
+ except Exception as e:
1167
+ logging.error(f"Error loading Mistral model: {e}")
1168
+ return None
1169
+
1170
+ # Load the model once at the start
1171
+ chat_pipeline = load_model()
1172
+
1173
+ # Automated Testing using unittest
1174
+ class TestApp(unittest.TestCase):
1175
+ def test_parse_command_filter(self):
1176
+ command = "Filter apples, oranges in column Description"
1177
+ parsed_command = parse_command(command)
1178
+ self.assertEqual(parsed_command[0], "filter")
1179
+ self.assertListEqual(parsed_command[1]["words"], ["apples", "oranges"])
1180
+ self.assertEqual(parsed_command[1]["column"], "Description")
1181
+
1182
+ def test_parse_command_sort(self):
1183
+ command = "Sort Price ascending"
1184
+ parsed_command = parse_command(command)
1185
+ self.assertEqual(parsed_command[0], "sort")
1186
+ self.assertEqual(parsed_command[1]["column"], "Price")
1187
+ self.assertEqual(parsed_command[1]["order"], "ascending")
1188
+
1189
+ def test_parse_command_export(self):
1190
+ command = "Export to CSV as filtered_data.csv"
1191
+ parsed_command = parse_command(command)
1192
+ self.assertEqual(parsed_command[0], "export")
1193
+ self.assertEqual(parsed_command[1]["filename"], "filtered_data.csv")
1194
+
1195
+ def test_parse_command_log(self):
1196
+ command = "Log action Filtered data for specific fruits"
1197
+ parsed_command = parse_command(command)
1198
+ self.assertEqual(parsed_command[0], "log")
1199
+ self.assertEqual(parsed_command[1]["action"], "Filtered data for specific fruits")
1200
+
1201
+ def test_database_connection(self):
1202
+ connection = get_db_connection()
1203
+ # Connection may be None if not configured; adjust the test accordingly
1204
+ if connection:
1205
+ self.assertTrue(connection.is_connected())
1206
+ connection.close()
1207
+ else:
1208
+ self.assertIsNone(connection)
1209
 
1210
+ # Main execution
1211
  if __name__ == "__main__":
1212
+ # Initialize database
1213
+ initialize_database()
1214
+
1215
+ # Create and launch Gradio interface
1216
  demo = create_interface()
1217
+ demo.launch()
1218
+
1219
+ # Run automated tests
1220
+ unittest.main(argv=[''], exit=False)