acecalisto3 committed on
Commit
2bed3a1
1 Parent(s): bcc7de3

Delete app.py

Files changed (1)
  1. app.py +0 -1569
app.py DELETED
@@ -1,1569 +0,0 @@
- import datetime
- import os
- import csv
- import time
- import hashlib
- import logging
- from collections import defaultdict
- import mysql.connector
- import threading
- from urllib.parse import urlparse
- import gradio as gr
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.chrome.options import Options
- from webdriver_manager.chrome import ChromeDriverManager
- from huggingface_hub import InferenceClient, login
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
- import random
- import yaml
- import torch
- import pandas as pd
- import xml.etree.ElementTree as ET
- import re
- import spacy
- import unittest
- from dotenv import load_dotenv
- import nltk
-
- # Initialize NLTK resources (you may need to download these)
- nltk.download('punkt')
- nltk.download('averaged_perceptron_tagger')
- nltk.download('maxent_ne_chunker')
- nltk.download('words')
-
- # Load spaCy model
- nlp = spacy.load("en_core_web_sm")
-
- # Dictionary to store model loading functions
- model_loaders = {
-     "Falcon": lambda: load_model("tiiuae/falcon-7b"),
-     "Flan-T5": lambda: load_model("google/flan-t5-xl"),
-     "Flan-T5-Small": lambda: load_model("google/flan-t5-small")  # Add a smaller model
- }
-
- # Load environment variables from .env file
- load_dotenv()
-
- HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
- if not HUGGINGFACE_TOKEN:
-     raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")
-
- login(token=HUGGINGFACE_TOKEN, add_to_git_credential=True)
-
- # Configure logging
- logging.basicConfig(
-     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
- )
-
- # Define constants
- DEFAULT_FILE_PATH = "scraped_data"
- PURPOSE = (
-     "You monitor urls. You log what you observe. You seek any changes on them since your last observation. "
-     "Anything new gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
- )
-
- # Global variables for task management
- HISTORY = []
- CURRENT_TASK = None
- STOP_THREADS = False  # Flag to stop scraping threads
-
- # Database Pooling Configuration
- DB_POOL_NAME = "mypool"
- DB_POOL_SIZE = 5  # Adjust based on expected load
-
- try:
-     dbconfig = {
-         "host": os.getenv("DB_HOST"),
-         "user": os.getenv("DB_USER"),
-         "password": os.getenv("DB_PASSWORD"),
-         "database": os.getenv("DB_NAME"),
-     }
-     connection_pool = mysql.connector.pooling.MySQLConnectionPool(
-         pool_name=DB_POOL_NAME,
-         pool_size=DB_POOL_SIZE,
-         pool_reset_session=True,
-         **dbconfig
-     )
-     logging.info("Database connection pool created successfully.")
- except mysql.connector.Error as err:
-     logging.warning(f"Database connection pool creation failed: {err}")
-     connection_pool = None  # Will use CSV as fallback
-
- # Function to get a database connection from the pool
- def get_db_connection():
-     """
-     Retrieves a connection from the pool. Returns None if pool is not available.
-     """
-     if connection_pool:
-         try:
-             connection = connection_pool.get_connection()
-             if connection.is_connected():
-                 return connection
-         except mysql.connector.Error as err:
-             logging.error(f"Error getting connection from pool: {err}")
-     return None
-
- # Initialize Database: Create tables and indexes
- def initialize_database():
-     """
-     Initializes the database by creating necessary tables and indexes if they do not exist.
-     """
-     connection = get_db_connection()
-     if connection is None:
-         logging.info("Database initialization skipped. Using CSV storage.")
-         return
-
-     cursor = connection.cursor()
-     try:
-         # Create table for scraped data
-         create_scraped_data_table = """
-         CREATE TABLE IF NOT EXISTS scraped_data (
-             id INT AUTO_INCREMENT PRIMARY KEY,
-             url VARCHAR(255) NOT NULL,
-             content_hash VARCHAR(64) NOT NULL,
-             change_detected DATETIME NOT NULL
-         )
-         """
-         cursor.execute(create_scraped_data_table)
-         logging.info("Table 'scraped_data' is ready.")
-
-         # Create indexes for performance
-         create_index_url = "CREATE INDEX IF NOT EXISTS idx_url ON scraped_data(url)"
-         create_index_change = "CREATE INDEX IF NOT EXISTS idx_change_detected ON scraped_data(change_detected)"
-         cursor.execute(create_index_url)
-         cursor.execute(create_index_change)
-         logging.info("Indexes on 'url' and 'change_detected' columns created.")
-
-         # Create table for action logs
-         create_action_logs_table = """
-         CREATE TABLE IF NOT EXISTS action_logs (
-             id INT AUTO_INCREMENT PRIMARY KEY,
-             action VARCHAR(255) NOT NULL,
-             timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
-         )
-         """
-         cursor.execute(create_action_logs_table)
-         logging.info("Table 'action_logs' is ready.")
-
-     except mysql.connector.Error as err:
-         logging.error(f"Error initializing database: {err}")
-     finally:
-         cursor.close()
-         connection.close()
-         logging.info("Database initialization complete.")
-
- # Function to create WebDriver
- def create_driver(options: Options) -> webdriver.Chrome:
-     """
-     Initializes and returns a Selenium Chrome WebDriver instance.
-     """
-     try:
-         driver = webdriver.Chrome(
-             service=Service(ChromeDriverManager().install()), options=options
-         )
-         logging.info("ChromeDriver initialized successfully.")
-         return driver
-     except Exception as exception:
-         logging.error(f"Error initializing ChromeDriver: {exception}")
-         return None
-
- # Function to log changes to CSV
- def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
-     """
-     Logs the change to a CSV file in the storage_location.
-     """
-     try:
-         os.makedirs(storage_location, exist_ok=True)
-         csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
-         file_exists = os.path.isfile(csv_file_path)
-
-         with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
-             fieldnames = ["date", "time", "url", "content_hash", "change"]
-             writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-             if not file_exists:
-                 writer.writeheader()
-             writer.writerow(
-                 {
-                     "date": change_detected.split()[0],
-                     "time": change_detected.split()[1],
-                     "url": url,
-                     "content_hash": content_hash,
-                     "change": "Content changed",
-                 }
-             )
-         logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
-     except Exception as e:
-         logging.error(f"Error logging data to CSV: {e}")
-
- # Function to get initial observation
- def get_initial_observation(
-     driver: webdriver.Chrome, url: str, content_type: str, selector: str = None
- ) -> str:
-     """
-     Retrieves the initial content from the URL and returns its MD5 hash.
-     """
-     try:
-         driver.get(url)
-         WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
-         time.sleep(2)  # Additional wait for dynamic content
-
-         if content_type == "text":
-             initial_content = driver.page_source
-         elif content_type == "media":
-             if selector:
-                 try:
-                     elements = WebDriverWait(driver, 5).until(
-                         EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
-                     )
-                     initial_content = [element.get_attribute("src") for element in elements]
-                 except TimeoutException:
-                     logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
-                     initial_content = []
-             else:
-                 elements = driver.find_elements(By.TAG_NAME, "img")
-                 initial_content = [element.get_attribute("src") for element in elements]
-         else:
-             initial_content = driver.page_source
-
-         initial_hash = hashlib.md5(str(initial_content).encode("utf-8")).hexdigest()
-         logging.info(f"Initial hash for {url}: {initial_hash}")
-         return initial_hash
-     except Exception as exception:
-         logging.error(f"Error accessing {url}: {exception}")
-         return None
-
- # Function to monitor URLs for changes
- def monitor_urls(
-     storage_location: str,
-     urls: list,
-     scrape_interval: int,
-     content_type: str,
-     selector: str = None,
-     progress: gr.Progress = None
- ):
-     """
-     Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
-     """
-     global HISTORY, STOP_THREADS
-     previous_hashes = {url: "" for url in urls}
-
-     options = Options()
-     options.add_argument("--headless")
-     options.add_argument("--no-sandbox")
-     options.add_argument("--disable-dev-shm-usage")
-
-     driver = create_driver(options)
-     if driver is None:
-         logging.error("WebDriver could not be initialized. Exiting monitor.")
-         return
-
-     try:
-         while not STOP_THREADS:
-             for url in urls:
-                 if STOP_THREADS:
-                     break
-                 try:
-                     driver.get(url)
-                     WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
-                     time.sleep(2)  # Additional wait for dynamic content
-
-                     if content_type == "text":
-                         current_content = driver.page_source
-                     elif content_type == "media":
-                         if selector:
-                             try:
-                                 elements = WebDriverWait(driver, 5).until(
-                                     EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
-                                 )
-                                 current_content = [element.get_attribute("src") for element in elements]
-                             except TimeoutException:
-                                 logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
-                                 current_content = []
-                         else:
-                             elements = driver.find_elements(By.TAG_NAME, "img")
-                             current_content = [element.get_attribute("src") for element in elements]
-                     else:
-                         current_content = driver.page_source
-
-                     current_hash = hashlib.md5(str(current_content).encode("utf-8")).hexdigest()
-                     if current_hash != previous_hashes[url]:
-                         previous_hashes[url] = current_hash
-                         date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                         HISTORY.append(f"Change detected at {url} on {date_time_str}")
-
-                         # Attempt to log to database
-                         connection = get_db_connection()
-                         if connection:
-                             try:
-                                 cursor = connection.cursor()
-                                 insert_query = """
-                                 INSERT INTO scraped_data (url, content_hash, change_detected)
-                                 VALUES (%s, %s, %s)
-                                 """
-                                 cursor.execute(insert_query, (url, current_hash, date_time_str))
-                                 connection.commit()
-                                 logging.info(f"Change detected at {url} on {date_time_str} and logged to database.")
-                             except mysql.connector.Error as err:
-                                 logging.error(f"Error inserting data into database: {err}")
-                                 # Fallback to CSV
-                                 log_to_csv(storage_location, url, current_hash, date_time_str)
-                             finally:
-                                 cursor.close()
-                                 connection.close()
-                         else:
-                             # Fallback to CSV
-                             log_to_csv(storage_location, url, current_hash, date_time_str)
-
-                     # Update progress
-                     if progress:
-                         progress(1)
-                 except (
-                     NoSuchElementException,
-                     StaleElementReferenceException,
-                     TimeoutException,
-                     Exception,
-                 ) as e:
-                     logging.error(f"Error accessing {url}: {e}")
-                     if progress:
-                         progress(1)
-             time.sleep(scrape_interval * 60)  # Wait for the next scrape interval
-     finally:
-         driver.quit()
-         logging.info("ChromeDriver session ended.")
-
- # Function to start scraping
- def start_scraping(
-     storage_location: str,
-     urls: str,
-     scrape_interval: int,
-     content_type: str,
-     selector: str = None,
-     progress: gr.Progress = None
- ) -> str:
-     """
-     Starts the scraping process in a separate thread with progress indication.
-     """
-     global CURRENT_TASK, HISTORY, STOP_THREADS
-
-     if STOP_THREADS:
-         STOP_THREADS = False  # Reset the flag if previously stopped
-
-     url_list = [url.strip() for url in urls.split(",") if url.strip()]
-     CURRENT_TASK = f"Monitoring URLs: {', '.join(url_list)}"
-     HISTORY.append(f"Task started: {CURRENT_TASK}")
-     logging.info(f"Task started: {CURRENT_TASK}")
-
-     # Initialize database tables
-     initialize_database()
-
-     # Log initial observations
-     def log_initial_observations():
-         options = Options()
-         options.add_argument("--headless")
-         options.add_argument("--no-sandbox")
-         options.add_argument("--disable-dev-shm-usage")
-
-         driver = create_driver(options)
-         if driver is None:
-             return
-
-         for url in url_list:
-             if STOP_THREADS:
-                 break
-             try:
-                 initial_hash = get_initial_observation(driver, url, content_type, selector)
-                 if initial_hash:
-                     date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                     HISTORY.append(f"Initial observation at {url}: {initial_hash}")
-
-                     # Attempt to log to database
-                     connection = get_db_connection()
-                     if connection:
-                         try:
-                             cursor = connection.cursor()
-                             insert_query = """
-                             INSERT INTO scraped_data (url, content_hash, change_detected)
-                             VALUES (%s, %s, %s)
-                             """
-                             cursor.execute(insert_query, (url, initial_hash, date_time_str))
-                             connection.commit()
-                             logging.info(f"Initial observation logged for {url} in database.")
-                         except mysql.connector.Error as err:
-                             logging.error(f"Error inserting initial observation into database: {err}")
-                             # Fallback to CSV
-                             log_to_csv(storage_location, url, initial_hash, date_time_str)
-                         finally:
-                             cursor.close()
-                             connection.close()
-                     else:
-                         # Fallback to CSV
-                         log_to_csv(storage_location, url, initial_hash, date_time_str)
-             except Exception as e:
-                 HISTORY.append(f"Error accessing {url}: {e}")
-                 logging.error(f"Error accessing {url}: {e}")
-         driver.quit()
-
-     # Start logging initial observations
-     initial_thread = threading.Thread(target=log_initial_observations, daemon=True)
-     initial_thread.start()
-
-     # Start the monitoring thread with progress
-     monitor_thread = threading.Thread(
-         target=monitor_urls,
-         args=(storage_location, url_list, scrape_interval, content_type, selector, progress),
-         daemon=True,
-     )
-     monitor_thread.start()
-     logging.info("Started scraping thread.")
-     return f"Started scraping {', '.join(url_list)} every {scrape_interval} minutes."
-
723
- # Function to stop scraping
724
- def stop_scraping() -> str:
725
- """
726
- Stops all ongoing scraping threads.
727
- """
728
- global STOP_THREADS
729
- STOP_THREADS = True
730
- HISTORY.append("Scraping stopped by user.")
731
- logging.info("Scraping stop signal sent.")
732
- return "Scraping has been stopped."
733
-
734
- # Function to display CSV content from MySQL or CSV
735
- def display_csv(storage_location: str, url: str) -> str:
736
- """
737
- Fetches and returns the scraped data for a given URL from the MySQL database or CSV.
738
- """
739
- try:
740
- connection = get_db_connection()
741
- if connection:
742
- try:
743
- cursor = connection.cursor(dictionary=True)
744
- query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
745
- cursor.execute(query, (url,))
746
- results = cursor.fetchall()
747
-
748
- if not results:
749
- return "No data available for the selected URL."
750
-
751
- df = pd.DataFrame(results)
752
- cursor.close()
753
- connection.close()
754
- return df.to_string(index=False)
755
- except mysql.connector.Error as err:
756
- logging.error(f"Error fetching data from database: {err}")
757
- # Fallback to CSV
758
- else:
759
- logging.info("No database connection. Fetching data from CSV.")
760
-
761
- # Fallback to CSV
762
- hostname = urlparse(url).hostname
763
- csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
764
- if os.path.exists(csv_path):
765
- df = pd.read_csv(csv_path)
766
- return df.to_string(index=False)
767
- else:
768
- return "No data available."
769
-
770
- except Exception as e:
771
- logging.error(f"Error fetching data for {url}: {e}")
772
- return f"Error fetching data for {url}: {e}"
773
-
774
- # Function to generate RSS feed from MySQL or CSV data
775
- def generate_rss_feed(storage_location: str, url: str) -> str:
776
- """
777
- Generates an RSS feed for the latest changes detected on a given URL from the MySQL database or CSV.
778
- """
779
- try:
780
- connection = get_db_connection()
781
- rss_feed = ""
782
-
783
- if connection:
784
- try:
785
- cursor = connection.cursor(dictionary=True)
786
- query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC LIMIT 10"
787
- cursor.execute(query, (url,))
788
- results = cursor.fetchall()
789
-
790
- if not results:
791
- return "No changes detected to include in RSS feed."
792
-
793
- # Create the root RSS element
794
- rss = ET.Element("rss", version="2.0")
795
- channel = ET.SubElement(rss, "channel")
796
-
797
- # Add channel elements
798
- title = ET.SubElement(channel, "title")
799
- title.text = f"RSS Feed for {urlparse(url).hostname}"
800
-
801
- link = ET.SubElement(channel, "link")
802
- link.text = url
803
-
804
- description = ET.SubElement(channel, "description")
805
- description.text = "Recent changes detected on the website."
806
-
807
- # Add items to the feed
808
- for row in results:
809
- item = ET.SubElement(channel, "item")
810
-
811
- item_title = ET.SubElement(item, "title")
812
- item_title.text = f"Change detected at {row['url']}"
813
-
814
- item_link = ET.SubElement(item, "link")
815
- item_link.text = row["url"]
816
-
817
- item_description = ET.SubElement(item, "description")
818
- item_description.text = f"Content changed on {row['change_detected']}"
819
-
820
- pub_date = ET.SubElement(item, "pubDate")
821
- pub_date.text = datetime.datetime.strptime(
822
- str(row['change_detected']), "%Y-%m-%d %H:%M:%S"
823
- ).strftime("%a, %d %b %Y %H:%M:%S +0000")
824
-
825
- # Generate the XML string
826
- rss_feed = ET.tostring(rss, encoding="utf-8", method="xml").decode("utf-8")
827
- cursor.close()
828
- connection.close()
829
- return rss_feed
830
- except mysql.connector.Error as err:
831
- logging.error(f"Error fetching data from database: {err}")
832
- # Fallback to CSV
833
- else:
834
- logging.info("No database connection. Generating RSS feed from CSV.")
835
-
836
- # Fallback to CSV
837
- hostname = urlparse(url).hostname
838
- csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
839
- if os.path.exists(csv_path):
840
- df = pd.read_csv(csv_path).tail(10)
841
- if df.empty:
842
- return "No changes detected to include in RSS feed."
843
-
844
- # Create the root RSS element
845
- rss = ET.Element("rss", version="2.0")
846
- channel = ET.SubElement(rss, "channel")
847
-
848
- # Add channel elements
849
- title = ET.SubElement(channel, "title")
850
- title.text = f"RSS Feed for {hostname}"
851
-
852
- link = ET.SubElement(channel, "link")
853
- link.text = url
854
-
855
- description = ET.SubElement(channel, "description")
856
- description.text = "Recent changes detected on the website."
857
-
858
- # Add items to the feed
859
- for _, row in df.iterrows():
860
- item = ET.SubElement(channel, "item")
861
-
862
- item_title = ET.SubElement(item, "title")
863
- item_title.text = f"Change detected at {row['url']}"
864
-
865
- item_link = ET.SubElement(item, "link")
866
- item_link.text = row["url"]
867
-
868
- item_description = ET.SubElement(item, "description")
869
- item_description.text = f"Content changed on {row['date']} at {row['time']}"
870
-
871
- pub_date = ET.SubElement(item, "pubDate")
872
- pub_date.text = datetime.datetime.strptime(
873
- f"{row['date']} {row['time']}", "%Y-%m-%d %H:%M:%S"
874
- ).strftime("%a, %d %b %Y %H:%M:%S +0000")
875
-
876
- # Generate the XML string
877
- rss_feed = ET.tostring(rss, encoding="utf-8", method="xml").decode("utf-8")
878
- return rss_feed
879
- else:
880
- return "No data available."
881
-
882
- except Exception as e:
883
- logging.error(f"Error generating RSS feed for {url}: {e}")
884
- return f"Error generating RSS feed for {url}: {e}"
885
-
886
- # Function to parse user commands using spaCy
887
- def parse_command(message: str) -> tuple:
888
- """
889
- Parses the user message using spaCy to identify if it contains a command.
890
- Returns the command and its parameters if found, else (None, None).
891
- """
892
- doc = nlp(message.lower())
893
- command = None
894
- params = {}
895
-
896
- # Define command patterns
897
- if "filter" in message.lower():
898
- # Example: "Filter apples, oranges in column Description"
899
- match = re.search(r"filter\s+([\w\s,]+)\s+in\s+column\s+(\w+)", message, re.IGNORECASE)
900
- if match:
901
- words = [word.strip() for word in match.group(1).split(",")]
902
- column = match.group(2)
903
- command = "filter"
904
- params = {"words": words, "column": column}
905
-
906
- elif "sort" in message.lower():
907
- # Example: "Sort Price ascending"
908
- match = re.search(r"sort\s+(\w+)\s+(ascending|descending)", message, re.IGNORECASE)
909
- if match:
910
- column = match.group(1)
911
- order = match.group(2)
912
- command = "sort"
913
- params = {"column": column, "order": order}
914
-
915
- elif "export to csv as" in message.lower():
916
- # Example: "Export to CSV as filtered_data.csv"
917
- match = re.search(r"export\s+to\s+csv\s+as\s+([\w\-]+\.csv)", message, re.IGNORECASE)
918
- if match:
919
- filename = match.group(1)
920
- command = "export"
921
- params = {"filename": filename}
922
-
923
- elif "log action" in message.lower():
924
- # Example: "Log action Filtered data for specific fruits"
925
- match = re.search(r"log\s+action\s+(.+)", message, re.IGNORECASE)
926
- if match:
927
- action = match.group(1)
928
- command = "log"
929
- params = {"action": action}
930
-
931
- return command, params
932
-
933
- # Function to execute parsed commands
934
- def execute_command(command: str, params: dict) -> str:
935
- """
936
- Executes the corresponding function based on the command and parameters.
937
- """
938
- if command == "filter":
939
- words = params["words"]
940
- column = params["column"]
941
- return filter_data(column, words)
942
- elif command == "sort":
943
- column = params["column"]
944
- order = params["order"]
945
- return sort_data(column, order)
946
- elif command == "export":
947
- filename = params["filename"]
948
- return export_csv(filename)
949
- elif command == "log":
950
- action = params["action"]
951
- return log_action(action)
952
- else:
953
- return "Unknown command."
954
-
955
- # Data Manipulation Functions
956
- def filter_data(column: str, words: list) -> str:
957
- """
958
- Filters the scraped data to include only rows where the specified column contains the given words.
959
- Saves the filtered data to a new CSV file.
960
- """
961
- try:
962
- storage_location = DEFAULT_FILE_PATH
963
-
964
- connection = get_db_connection()
965
- if connection:
966
- try:
967
- cursor = connection.cursor(dictionary=True)
968
- # Fetch all data
969
- query = "SELECT * FROM scraped_data"
970
- cursor.execute(query)
971
- results = cursor.fetchall()
972
-
973
- if not results:
974
- return "No data available to filter."
975
-
976
- df = pd.DataFrame(results)
977
- # Create a regex pattern to match any of the words
978
- pattern = '|'.join(words)
979
- if column not in df.columns:
980
- return f"Column '{column}' does not exist in the data."
981
-
982
- filtered_df = df[df[column].astype(str).str.contains(pattern, case=False, na=False)]
983
-
984
- if filtered_df.empty:
985
- return f"No records found with words {words} in column '{column}'."
986
-
987
- # Save the filtered data to a new CSV
988
- timestamp = int(time.time())
989
- filtered_csv = os.path.join(storage_location, f"filtered_data_{timestamp}.csv")
990
- filtered_df.to_csv(filtered_csv, index=False)
991
- logging.info(f"Data filtered on column '{column}' for words {words}.")
992
- return f"Data filtered and saved to {filtered_csv}."
993
- except mysql.connector.Error as err:
994
- logging.error(f"Error fetching data from database: {err}")
995
- # Fallback to CSV
996
- else:
997
- logging.info("No database connection. Filtering data from CSV.")
998
-
999
- # Fallback to CSV
1000
- csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
1001
- if not csv_files:
1002
- return "No CSV files found to filter."
1003
-
1004
- # Assume the latest CSV is the target
1005
- latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
1006
- df = pd.read_csv(latest_csv)
1007
-
1008
- if column not in df.columns:
1009
- return f"Column '{column}' does not exist in the data."
1010
-
1011
- filtered_df = df[df[column].astype(str).str.contains('|'.join(words), case=False, na=False)]
1012
-
1013
- if filtered_df.empty:
1014
- return f"No records found with words {words} in column '{column}'."
1015
-
1016
- # Save the filtered data to a new CSV
1017
- timestamp = int(time.time())
1018
- filtered_csv = latest_csv.replace(".csv", f"_filtered_{timestamp}.csv")
1019
- filtered_df.to_csv(filtered_csv, index=False)
1020
- logging.info(f"Data filtered on column '{column}' for words {words}.")
1021
- return f"Data filtered and saved to {filtered_csv}."
1022
- except Exception as e:
1023
- logging.error(f"Error filtering data: {e}")
1024
- return f"Error filtering data: {e}"
1025
-
1026
- def sort_data(column: str, order: str) -> str:
1027
- """
1028
- Sorts the scraped data based on the specified column and order.
1029
- Saves the sorted data to a new CSV file.
1030
- """
1031
- try:
1032
- storage_location = DEFAULT_FILE_PATH
1033
-
1034
- connection = get_db_connection()
1035
- if connection:
1036
- try:
1037
- cursor = connection.cursor(dictionary=True)
1038
- # Fetch all data
1039
- query = "SELECT * FROM scraped_data"
1040
- cursor.execute(query)
1041
- results = cursor.fetchall()
1042
-
1043
- if not results:
1044
- return "No data available to sort."
1045
-
1046
- df = pd.DataFrame(results)
1047
- if column not in df.columns:
1048
- return f"Column '{column}' does not exist in the data."
1049
-
1050
- ascending = True if order.lower() == "ascending" else False
1051
- sorted_df = df.sort_values(by=column, ascending=ascending)
1052
-
1053
- # Save the sorted data to a new CSV
1054
- timestamp = int(time.time())
1055
- sorted_csv = os.path.join(storage_location, f"sorted_data_{column}_{order.lower()}_{timestamp}.csv")
1056
- sorted_df.to_csv(sorted_csv, index=False)
1057
- logging.info(f"Data sorted on column '{column}' in {order} order.")
1058
- return f"Data sorted and saved to {sorted_csv}."
1059
- except mysql.connector.Error as err:
1060
- logging.error(f"Error fetching data from database: {err}")
1061
- # Fallback to CSV
1062
- else:
1063
- logging.info("No database connection. Sorting data from CSV.")
1064
-
1065
- # Fallback to CSV
1066
- csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
1067
- if not csv_files:
1068
- return "No CSV files found to sort."
1069
-
1070
- # Assume the latest CSV is the target
1071
- latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
1072
- df = pd.read_csv(latest_csv)
1073
-
1074
- if column not in df.columns:
1075
- return f"Column '{column}' does not exist in the data."
1076
-
1077
- ascending = True if order.lower() == "ascending" else False
1078
- sorted_df = df.sort_values(by=column, ascending=ascending)
1079
-
1080
- # Save the sorted data to a new CSV
1081
- timestamp = int(time.time())
1082
- sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}_{timestamp}.csv")
1083
- sorted_df.to_csv(sorted_csv, index=False)
1084
- logging.info(f"Data sorted on column '{column}' in {order} order.")
1085
- return f"Data sorted and saved to {sorted_csv}."
1086
- except Exception as e:
1087
- logging.error(f"Error sorting data: {e}")
1088
- return f"Error sorting data: {e}"
1089
-
1090
- def export_csv(filename: str) -> str:
1091
- """
1092
- Exports the latest scraped data to a specified CSV filename.
1093
- """
1094
- try:
1095
- storage_location = DEFAULT_FILE_PATH
1096
-
1097
- connection = get_db_connection()
1098
- if connection:
1099
- try:
1100
- cursor = connection.cursor(dictionary=True)
1101
- # Fetch all data
1102
- query = "SELECT * FROM scraped_data"
1103
- cursor.execute(query)
1104
- results = cursor.fetchall()
1105
-
1106
- if not results:
1107
- return "No data available to export."
1108
-
1109
- df = pd.DataFrame(results)
1110
- export_path = os.path.join(storage_location, filename)
1111
- df.to_csv(export_path, index=False)
1112
- logging.info(f"Data exported to {export_path}.")
1113
- return f"Data exported to {export_path}."
1114
- except mysql.connector.Error as err:
1115
- logging.error(f"Error exporting data from database: {err}")
1116
- # Fallback to CSV
1117
- else:
1118
- logging.info("No database connection. Exporting data from CSV.")
1119
-
1120
- # Fallback to CSV
1121
- csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
1122
- if not csv_files:
1123
- return "No CSV files found to export."
1124
-
1125
- # Assume the latest CSV is the target
1126
- latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
1127
- df = pd.read_csv(latest_csv)
1128
- export_path = os.path.join(storage_location, filename)
1129
- df.to_csv(export_path, index=False)
1130
- logging.info(f"Data exported to {export_path}.")
1131
- return f"Data exported to {export_path}."
1132
- except Exception as e:
1133
- logging.error(f"Error exporting CSV: {e}")
1134
- return f"Error exporting CSV: {e}"
1135
-
1136
- def log_action(action: str) -> str:
1137
- """
1138
- Logs a custom action message to the MySQL database or CSV.
1139
- """
1140
- try:
1141
- connection = get_db_connection()
1142
- if connection:
1143
- try:
1144
- cursor = connection.cursor()
1145
- insert_query = """
1146
- INSERT INTO action_logs (action)
1147
- VALUES (%s)
1148
- """
1149
- cursor.execute(insert_query, (action,))
1150
- connection.commit()
1151
- logging.info(f"Action logged in database: {action}")
1152
- cursor.close()
1153
- connection.close()
1154
- return f"Action logged: {action}"
1155
- except mysql.connector.Error as err:
1156
- logging.error(f"Error logging action to database: {err}")
1157
- # Fallback to CSV
1158
- else:
1159
- logging.info("No database connection. Logging action to CSV.")
1160
-
1161
- # Fallback to CSV
1162
- storage_location = DEFAULT_FILE_PATH
1163
- try:
1164
- os.makedirs(storage_location, exist_ok=True)
1165
- csv_file_path = os.path.join(storage_location, "action_logs.csv")
1166
- file_exists = os.path.isfile(csv_file_path)
1167
-
1168
- with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
1169
- fieldnames = ["timestamp", "action"]
1170
- writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
1171
- if not file_exists:
1172
- writer.writeheader()
1173
- writer.writerow(
1174
- {
1175
- "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
1176
- "action": action,
1177
- }
1178
- )
1179
- logging.info(f"Action logged to CSV: {action}")
1180
- return f"Action logged: {action}"
1181
- except Exception as e:
1182
- logging.error(f"Error logging action to CSV: {e}")
1183
- return f"Error logging action: {e}"
1184
- except Exception as e:
1185
- logging.error(f"Error logging action: {e}")
1186
- return f"Error logging action: {e}"
1187
-
1188
- # Function to get the latest CSV file based on modification time
1189
- def get_latest_csv() -> str:
1190
- """
1191
- Retrieves the latest CSV file from the storage directory based on modification time.
1192
- """
1193
- try:
1194
- storage_location = "/home/users/app/scraped_data"
1195
- csv_files = [f for f in os.listdir(storage_location) if f.endswith(".csv")]
1196
- if not csv_files:
1197
- return None
1198
-
1199
- latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
1200
- return latest_csv
1201
- except Exception as e:
1202
- logging.error(f"Error retrieving latest CSV: {e}")
1203
- return None
1204
-
1205
- def respond(
1206
- message: str,
1207
- history: list,
1208
- system_message: str,
1209
- max_tokens: int,
1210
- temperature: float,
1211
- top_p: float,
1212
- ) -> str:
1213
- """
1214
- Generates a response using OpenLlamaForCausalLM.
1215
- """
1216
- try:
1217
- # Check if the message contains a command
1218
- command, params = parse_command(message)
1219
- if command:
1220
- # Execute the corresponding function
1221
- response = execute_command(command, params)
1222
- else:
1223
- # Generate a regular response using OpenLlama
1224
- prompt = (
1225
- f"System: {system_message}\n"
1226
- f"History: {history}\n"
1227
- f"User: {message}\n"
1228
- f"Assistant:"
1229
- )
1230
- response = openllama_pipeline(
1231
- prompt,
1232
- max_length=max_tokens,
1233
- temperature=temperature,
1234
- top_p=top_p,
1235
- )[0]["generated_text"]
1236
-
1237
-
1238
- # Extract the assistant's reply
1239
- response = response.split("Assistant:")[-1].strip()
1240
- return response
1241
- except Exception as e:
1242
- logging.error(f"Error generating response: {e}")
1243
- return "Error generating response."
1244
-
1245
- # Define the Gradio interface
1246
- def create_interface() -> gr.Blocks:
1247
- """
1248
- Defines and returns the Gradio interface for the application.
1249
- """
1250
- with gr.Blocks() as demo:
1251
- gr.Markdown("# All-in-One Scraper, Database, and RSS Feeder")
1252
-
1253
- with gr.Row():
1254
- with gr.Column():
1255
- # Scraping Controls
1256
- storage_location = gr.Textbox(
1257
- value=DEFAULT_FILE_PATH, label="Storage Location"
1258
- )
1259
- urls = gr.Textbox(
1260
- label="URLs (comma separated)",
1261
- placeholder="https://example.com, https://anotherexample.com",
1262
- )
1263
- scrape_interval = gr.Slider(
1264
- minimum=1,
1265
- maximum=60,
1266
- value=5,
1267
- step=1,
1268
- label="Scrape Interval (minutes)",
1269
- )
1270
- content_type = gr.Radio(
1271
- choices=["text", "media", "both"],
1272
- value="text",
1273
- label="Content Type",
1274
- )
1275
- selector = gr.Textbox(
1276
- label="CSS Selector for Media (Optional)",
1277
- placeholder="e.g., img.main-image",
1278
- )
1279
- start_button = gr.Button("Start Scraping")
1280
- stop_button = gr.Button("Stop Scraping")
1281
- status_output = gr.Textbox(
1282
- label="Status Output", interactive=False, lines=2
1283
- )
1284
-
1285
- with gr.Column():
1286
- # Chat Interface
1287
- chat_history = gr.Chatbot(label="Chat History", type='messages')
1288
- with gr.Row():
1289
- message = gr.Textbox(label="Message", placeholder="Type your message here...")
1290
- system_message = gr.Textbox(
1291
- value="You are a helpful assistant.", label="System message"
1292
- )
1293
- max_tokens = gr.Slider(
1294
- minimum=1,
1295
- maximum=2048,
1296
- value=512,
1297
- step=1,
1298
- label="Max new tokens",
1299
- )
1300
- temperature = gr.Slider(
1301
- minimum=0.1,
1302
- maximum=4.0,
1303
- value=0.7,
1304
- step=0.1,
1305
- label="Temperature",
1306
- )
1307
- top_p = gr.Slider(
1308
- minimum=0.1,
1309
- maximum=1.0,
1310
- value=0.95,
1311
- step=0.05,
1312
- label="Top-p (nucleus sampling)",
1313
- )
1314
- response_box = gr.Textbox(label="Response", interactive=False, lines=2)
1315
-
1316
- with gr.Row():
1317
- with gr.Column():
1318
- # CSV Display Controls
1319
- selected_url_csv = gr.Textbox(
1320
- label="Select URL for CSV Content",
1321
- placeholder="https://example.com",
1322
- )
1323
- csv_button = gr.Button("Display CSV Content")
1324
- csv_content_output = gr.Textbox(
1325
- label="CSV Content Output", interactive=False, lines=10
1326
- )
1327
-
1328
- with gr.Column():
1329
- # RSS Feed Generation Controls
1330
- selected_url_rss = gr.Textbox(
1331
- label="Select URL for RSS Feed",
1332
- placeholder="https://example.com",
1333
- )
1334
- rss_button = gr.Button("Generate RSS Feed")
1335
- rss_output = gr.Textbox(
1336
- label="RSS Feed Output", interactive=False, lines=20
1337
- )
1338
-
1339
- # Historical Data View
1340
- with gr.Row():
1341
- with gr.Column():
1342
- historical_view_url = gr.Textbox(
1343
- label="Select URL for Historical Data",
1344
- placeholder="https://example.com",
1345
- )
1346
- historical_button = gr.Button("View Historical Data")
1347
- historical_output = gr.Dataframe(
1348
- headers=["ID", "URL", "Content Hash", "Change Detected"],
1349
- label="Historical Data",
1350
- interactive=False
1351
- )
1352
-
1353
- # Connect buttons to their respective functions
1354
- start_button.click(
1355
- fn=start_scraping,
1356
- inputs=[
1357
- storage_location,
1358
- urls,
1359
- scrape_interval,
1360
- content_type,
1361
- selector,
1362
- ],
1363
- outputs=status_output,
1364
- )
1365
-
1366
- stop_button.click(fn=stop_scraping, outputs=status_output)
1367
-
1368
- csv_button.click(
1369
- fn=display_csv,
1370
- inputs=[storage_location, selected_url_csv],
1371
- outputs=csv_content_output,
1372
- )
1373
-
1374
- rss_button.click(
1375
- fn=generate_rss_feed,
1376
- inputs=[storage_location, selected_url_rss],
1377
- outputs=rss_output,
1378
- )
1379
-
1380
- historical_button.click(
1381
- fn=display_historical_data,
1382
- inputs=[storage_location, historical_view_url],
1383
- outputs=historical_output,
1384
- )
1385
-
1386
- # Connect message submission to the chat interface
1387
- def update_chat(message_input, history, system_msg, max_toks, temp, top_p_val):
1388
- if not message_input.strip():
1389
- return history, "Please enter a message."
1390
-
1391
- response = respond(
1392
- message_input,
1393
- history,
1394
- system_msg,
1395
- max_toks,
1396
- temp,
1397
- top_p_val,
1398
- )
1399
- history.append({"role": "user", "content": message_input})
- history.append({"role": "assistant", "content": response})
1400
- return history, response
1401
-
1402
- message.submit(
1403
- update_chat,
1404
- inputs=[
1405
- message,
1406
- chat_history,
1407
- system_message,
1408
- max_tokens,
1409
- temperature,
1410
- top_p,
1411
- ],
1412
- outputs=[chat_history, response_box],
1413
- )
1414
-
1415
- return demo
1416
- # Function to display historical data
1417
- def display_historical_data(storage_location: str, url: str):
1418
- """
1419
- Retrieves and displays historical scraping data for a given URL.
1420
- """
1421
- try:
1422
- connection = get_db_connection()
1423
- if connection:
1424
- try:
1425
- cursor = connection.cursor(dictionary=True)
1426
- query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
1427
- cursor.execute(query, (url,))
1428
- results = cursor.fetchall()
1429
-
1430
- if not results:
1431
- return pd.DataFrame()
1432
-
1433
- df = pd.DataFrame(results)
1434
- cursor.close()
1435
- connection.close()
1436
- return df
1437
- except mysql.connector.Error as err:
1438
- logging.error(f"Error fetching historical data from database: {err}")
1439
- # Fallback to CSV
1440
- else:
1441
- logging.info("No database connection. Fetching historical data from CSV.")
1442
-
1443
- # Fallback to CSV
1444
- hostname = urlparse(url).hostname
1445
- csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
1446
- if os.path.exists(csv_path):
1447
- df = pd.read_csv(csv_path)
1448
- return df
1449
- else:
1450
- return pd.DataFrame()
1451
- except Exception as e:
1452
- logging.error(f"Error fetching historical data for {url}: {e}")
1453
- return pd.DataFrame()
1454
-
1455
- def load_model():
1456
- """
1457
- Loads the openLlama model and tokenizer once and returns the pipeline.
1458
- """
1459
- try:
1460
- model_name = "openlm-research/open_llama_3b_v2"
1461
- tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=False)
1462
- model = AutoModelForCausalLM.from_pretrained(model_name)
1463
-
1464
- # This should be inside the try block
1465
- max_supported_length = 2048
1466
-
1467
- openllama_pipeline = pipeline(
1468
- "text-generation",
1469
- model=model,
1470
- tokenizer=tokenizer,
1471
- truncation=True,
1472
- max_length=max_supported_length,
1473
- temperature=0.7,
1474
- top_p=0.95,
1475
- device=0 if torch.cuda.is_available() else -1,
1476
- )
1477
- logging.info("Model loaded successfully.")
1478
- return openllama_pipeline # Return the pipeline
1479
- except Exception as e:
1480
- logging.error(f"Error loading OpenLLaMA model: {e}")
1481
- return None
1482
-
1483
- def load_model(model_name: str = "openlm-research/open_llama_3b_v2"):
1484
- """
1485
- Loads the specified model and tokenizer.
1486
- """
1487
- try:
1488
- tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=False)
1489
- model = AutoModelForCausalLM.from_pretrained(model_name)
1490
- # This should be inside the try block
1491
- max_supported_length = 2048 # Get this from the model config
1492
- openllama_pipeline = pipeline(
1493
- "text-generation",
1494
- model=model,
1495
- tokenizer=tokenizer,
1496
- truncation=True,
1497
- max_length=max_supported_length,
1498
- temperature=0.7,
1499
- top_p=0.95,
1500
- device=0 if torch.cuda.is_available() else -1,
1501
- )
1502
- logging.info(f"{model_name} loaded successfully.")
1503
- return openllama_pipeline
1504
- except Exception as e:
1505
- logging.error(f"Error loading {model_name} model: {e}")
1506
- return None
1507
-
1508
- # Automated Testing using unittest
1509
- class TestApp(unittest.TestCase):
1510
- def test_parse_command_filter(self):
1511
- command = "Filter apples, oranges in column Description"
1512
- parsed_command = parse_command(command)
1513
- self.assertEqual(parsed_command[0], "filter")
1514
- self.assertListEqual(parsed_command[1]["words"], ["apples", "oranges"])
1515
- self.assertEqual(parsed_command[1]["column"], "Description")
1516
-
1517
- def test_parse_command_sort(self):
1518
- command = "Sort Price ascending"
1519
- parsed_command = parse_command(command)
1520
- self.assertEqual(parsed_command[0], "sort")
1521
- self.assertEqual(parsed_command[1]["column"], "Price")
1522
- self.assertEqual(parsed_command[1]["order"], "ascending")
1523
-
1524
- def test_parse_command_export(self):
1525
- command = "Export to CSV as filtered_data.csv"
1526
- parsed_command = parse_command(command)
1527
- self.assertEqual(parsed_command[0], "export")
1528
- self.assertEqual(parsed_command[1]["filename"], "filtered_data.csv")
1529
-
1530
- def test_parse_command_log(self):
1531
- command = "Log action Filtered data for specific fruits"
1532
- parsed_command = parse_command(command)
1533
- self.assertEqual(parsed_command[0], "log")
1534
- self.assertEqual(parsed_command[1]["action"], "Filtered data for specific fruits")
1535
-
1536
- def test_database_connection(self):
1537
- connection = get_db_connection()
1538
- # Connection may be None if not configured; adjust the test accordingly
1539
- if connection:
1540
- self.assertTrue(connection.is_connected())
1541
- connection.close()
1542
- else:
1543
- self.assertIsNone(connection)
1544
-
1545
- def main():
1546
- # Initialize and run the application
1547
- logging.info("Starting the application...")
1548
- model = load_model()
1549
- if model:
1550
- logging.info("Application started successfully.")
1551
- print("Main function executed")
1552
- print("Creating interface...")
1553
- demo = create_interface()
1554
- print("Launching interface...")
1555
- demo.launch(server_name="0.0.0.0", server_port=7860)
1556
- else:
1557
- logging.error("Failed to start the application.")
1558
-
1559
- # Main execution
1560
- if __name__ == "__main__":
1561
- # Initialize database
1562
- initialize_database()
1563
-
1564
- # Create and launch Gradio interface
1565
- demo = create_interface()
1566
- demo.launch()
1567
-
1568
- # Run automated tests
1569
- unittest.main(argv=[''], exit=False)