Space: acecalisto3 (status: runtime error)
Commit bf70dc8, parent a4c9236: "Update app.py"

app.py CHANGED
app.py (before the change; removed lines marked with -):

@@ -1,6 +1,9 @@
import time
import hashlib
-import logging
import datetime
import gradio as gr
import csv
@@ -18,7 +21,6 @@ from transformers import pipeline
import feedparser
from bs4 import BeautifulSoup
import threading
-import os

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -30,51 +32,65 @@ HISTORY = []
CURRENT_TASK = None
STOP_THREADS = False

-# Define
-
-
-

    try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

-
    global CURRENT_TASK, HISTORY, STOP_THREADS
-
    CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
    HISTORY.append(f"Task started: {CURRENT_TASK}")
-
    for url in urls:
        # Create a folder for the URL
        hostname = urlparse(url).hostname
        folder_path = os.path.join(storage_location, hostname)
        os.makedirs(folder_path, exist_ok=True)
-
        # Log the initial observation
        try:
            with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
@@ -92,63 +108,110 @@ def start_scraping(storage_location, urls, scrape_interval, content_type):
                file.write(f"Initial observation at {url}: {initial_hash}")
        except (NoSuchElementException, Exception) as e:
            HISTORY.append(f"Error accessing {url}: {e}")
-
        # Start a new thread for monitoring URLs
-        threading.Thread(target=monitor_urls, args=(storage_location, [url], scrape_interval, content_type, [STOP_THREADS])).start()
-
    return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."

# Define a function to stop scraping
def stop_scraping():
    global STOP_THREADS
    STOP_THREADS = True
    return "Scraping stopped."

-# Define a function to
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

def handle_message(message, chat_history, system_message, max_tokens, temperature, top_p):
-    # Process the message and update the chat history
    chat_history.append((message, system_message))
    response = f"Received message: {message}"
    return chat_history, response

-def generate_rss_feed(selected_url):
-    # Generate the RSS feed for the selected URL
-    # For this example, we'll just return a placeholder RSS feed
-    rss_feed = """
-    <?xml version="1.0" encoding="UTF-8"?>
-    <rss version="2.0">
-    <channel>
-    <title>Example RSS Feed</title>
-    <link>https://example.com</link>
-    <description>This is an example RSS feed.</description>
-    <item>
-    <title>Example Item</title>
-    <link>https://example.com/item</link>
-    <description>This is an example item.</description>
-    </item>
-    </channel>
-    </rss>
-    """
-    return rss_feed
-
# Define the Gradio interface
def create_interface():
    with gr.Blocks() as demo:
@@ -166,17 +229,17 @@ def create_interface():
                start_button = gr.Button("Start Scraping")
                stop_button = gr.Button("Stop Scraping")
                csv_output = gr.Textbox(label="CSV Output", interactive=False)
-
-                model_name_input = gr.Textbox(value="default_model", label="Model Name")
                gpu_layers_input = gr.Slider(minimum=0, maximum=8, value=2, step=1, label="GPU Layers")
-
            with gr.Column():
                chat_history = gr.Chatbot(label="Chat History")
                response_box = gr.Textbox(label="Response")

        # Connect buttons to their respective functions
        start_button.click(
-            fn=start_scraping
            inputs=[storage_location, urls, scrape_interval, content_type],
            outputs=csv_output
        )
@@ -190,7 +253,6 @@ def create_interface():
        selected_url = gr.Textbox(label="Select URL for CSV Content")
        csv_button = gr.Button("Display CSV Content")
        csv_output = gr.Textbox(label="CSV Content Output", interactive=False)
-
        csv_button.click(display_csv, inputs=[selected_url], outputs=csv_output)

        # Add a button to display the RSS feed for a selected URL
@@ -198,11 +260,17 @@ def create_interface():
        selected_url = gr.Textbox(label="Select URL for RSS Feed")
        rss_button = gr.Button("Generate RSS Feed")
        rss_output = gr.Textbox(label="RSS Feed Output", interactive=False)
-
-

    return demo

-
-
-
app.py (after the change; added lines marked with +):

+import mysql.connector
+from mysql.connector import errorcode
+import os
+import logging
import time
import hashlib
import datetime
import gradio as gr
import csv
...
import feedparser
from bs4 import BeautifulSoup
import threading

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
...
CURRENT_TASK = None
STOP_THREADS = False
+# Define database configuration
+db_config = {
+    'user': os.getenv('DB_USER'),
+    'password': os.getenv('DB_PASSWORD'),
+    'host': os.getenv('DB_HOST'),
+    'raise_on_warnings': True
+}

+# Define a function to initialize the database
+def initialize_database(config):
    try:
+        cnx = mysql.connector.connect(**config)
+        cursor = cnx.cursor()
+
+        # Create database if it doesn't exist
+        cursor.execute("CREATE DATABASE IF NOT EXISTS scraper_db")
+        cnx.database = 'scraper_db'
+
+        # Create tables
+        TABLES = {}
+        TABLES['scraped_data'] = (
+            "CREATE TABLE IF NOT EXISTS scraped_data ("
+            " id INT AUTO_INCREMENT PRIMARY KEY,"
+            " url VARCHAR(255) NOT NULL,"
+            " content_hash VARCHAR(64) NOT NULL,"
+            " change_detected DATETIME NOT NULL"
+            ") ENGINE=InnoDB"
+        )
+
+        for table_name in TABLES:
+            table_description = TABLES[table_name]
+            try:
+                cursor.execute(table_description)
+                logging.info(f"Table `{table_name}` created successfully.")
+            except mysql.connector.Error as err:
+                if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
+                    logging.warning(f"Table `{table_name}` already exists.")
+                else:
+                    logging.error(err.msg)
+
+        cursor.close()
+        cnx.close()
+        logging.info("Database initialization complete.")
+    except mysql.connector.Error as err:
+        logging.error(f"Database initialization failed: {err}")
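Note on the configuration above: db_config is read from the DB_USER, DB_PASSWORD and DB_HOST environment variables, and initialize_database(db_config) is invoked at module import (see the end of the file), so these variables have to be available before the Space starts. A minimal sketch of supplying them for a local run; the values here are placeholders, not part of the commit:

import os

# Placeholder credentials for local testing only; on Hugging Face Spaces these
# would normally be set as repository secrets rather than hard-coded.
os.environ.setdefault("DB_USER", "scraper")
os.environ.setdefault("DB_PASSWORD", "change-me")
os.environ.setdefault("DB_HOST", "127.0.0.1")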
+# Define a function to start scraping
+def start_scraping(storage_location, urls, scrape_interval, content_type, db_config):
    global CURRENT_TASK, HISTORY, STOP_THREADS
+
    CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
    HISTORY.append(f"Task started: {CURRENT_TASK}")
+
    for url in urls:
        # Create a folder for the URL
        hostname = urlparse(url).hostname
        folder_path = os.path.join(storage_location, hostname)
        os.makedirs(folder_path, exist_ok=True)
+
        # Log the initial observation
        try:
            with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
...
                file.write(f"Initial observation at {url}: {initial_hash}")
        except (NoSuchElementException, Exception) as e:
            HISTORY.append(f"Error accessing {url}: {e}")
+
        # Start a new thread for monitoring URLs
+        threading.Thread(target=monitor_urls, args=(storage_location, [url], scrape_interval, content_type, [STOP_THREADS], db_config)).start()
+
    return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
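One detail worth flagging in the driver setup: ChromeDriverManager is referenced as webdriver.ChromeDriverManager, but Selenium's webdriver module does not export that class; it normally comes from the separate webdriver_manager package, so this attribute lookup would fail at runtime. A hedged sketch of the conventional setup, assuming webdriver_manager is installed (on Selenium 4.6+ a plain webdriver.Chrome(options=options) also resolves the driver automatically):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--headless")  # headless Chrome is typical on a hosted Space

# Download/locate a matching chromedriver via webdriver_manager, then start Chrome.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.quit()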
+# Define a function to monitor URLs for changes
+def monitor_urls(storage_location, urls, scrape_interval, content_type, stop_scraping_flag, db_config):
+    global HISTORY
+    previous_hashes = {url: "" for url in urls}
+
+    try:
+        cnx = mysql.connector.connect(**db_config)
+        cursor = cnx.cursor()
+
+        with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
+            while not stop_scraping_flag[0]:
+                for url in urls:
+                    try:
+                        driver.get(url)
+                        time.sleep(2)  # Wait for the page to load
+                        if content_type == "text":
+                            current_content = driver.page_source
+                        elif content_type == "media":
+                            current_content = driver.find_elements(By.TAG_NAME, "img")
+                        else:
+                            current_content = driver.page_source
+                        current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
+
+                        if current_hash != previous_hashes[url]:
+                            previous_hashes[url] = current_hash
+                            date_time = datetime.datetime.now()
+                            HISTORY.append(f"Change detected at {url} on {date_time}")
+
+                            # Insert into MySQL
+                            add_change = ("INSERT INTO scraped_data "
+                                          "(url, content_hash, change_detected) "
+                                          "VALUES (%s, %s, %s)")
+                            data_change = (url, current_hash, date_time)
+                            cursor.execute(add_change, data_change)
+                            cnx.commit()
+
+                            logging.info(f"Change detected and logged for {url} at {date_time}")
+                    except (NoSuchElementException, Exception) as e:
+                        logging.error(f"Error accessing {url}: {e}")
+                time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
+    except Exception as e:
+        logging.error(f"Error in monitor_urls: {e}")
+    finally:
+        cursor.close()
+        cnx.close()
+

# Define a function to stop scraping
def stop_scraping():
    global STOP_THREADS
    STOP_THREADS = True
    return "Scraping stopped."
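A behavioural note on stopping: start_scraping passes [STOP_THREADS] to each monitor thread, i.e. a fresh one-element list holding the value of the flag at call time, so flipping the global in stop_scraping() is never seen by threads already looping on stop_scraping_flag[0]. A minimal sketch of a shared stop signal using threading.Event; the names are illustrative, not from the commit:

import threading
import time

stop_event = threading.Event()  # a single shared object, visible to every worker

def worker(stop_event, interval=0.1):
    # Loop until the shared event is set, then exit cleanly.
    while not stop_event.is_set():
        time.sleep(interval)

t = threading.Thread(target=worker, args=(stop_event,))
t.start()
stop_event.set()   # the equivalent of stop_scraping(): every worker observes it
t.join()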
+# Define a function to generate RSS feed
+def generate_rss_feed(selected_url, db_config):
+    try:
+        cnx = mysql.connector.connect(**db_config)
+        cursor = cnx.cursor(dictionary=True)
+
+        query = ("SELECT content_hash, change_detected FROM scraped_data "
+                 "WHERE url = %s ORDER BY change_detected DESC LIMIT 10")
+        cursor.execute(query, (selected_url,))
+
+        items = cursor.fetchall()
+
+        rss_items = ""
+        for item in items:
+            rss_items += f"""
+            <item>
+                <title>Change Detected</title>
+                <link>{selected_url}</link>
+                <description>Change detected on {item['change_detected'].strftime('%Y-%m-%d %H:%M:%S')}</description>
+                <pubDate>{item['change_detected'].strftime('%a, %d %b %Y %H:%M:%S +0000')}</pubDate>
+            </item>
+            """
+
+        rss_feed = f"""<?xml version="1.0" encoding="UTF-8"?>
+        <rss version="2.0">
+        <channel>
+            <title>RSS Feed for {selected_url}</title>
+            <link>{selected_url}</link>
+            <description>Latest changes detected on {selected_url}.</description>
+            {rss_items}
+        </channel>
+        </rss>"""
+
+        cursor.close()
+        cnx.close()
+        return rss_feed
+    except mysql.connector.Error as err:
+        logging.error(f"Error generating RSS feed: {err}")
+        return "Failed to generate RSS feed."
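Because selected_url and the timestamps are interpolated directly into the feed, a URL containing &, < or > would make the XML invalid. A small sketch of escaping interpolated values with the standard library; applying it is a suggestion, not something this commit does:

from xml.sax.saxutils import escape

def xml_text(value):
    # Escape &, < and > so interpolated URLs and text stay well-formed XML.
    return escape(str(value))

print(xml_text("https://example.com/?a=1&b=2"))  # https://example.com/?a=1&amp;b=2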
+# Define a function to handle messages
def handle_message(message, chat_history, system_message, max_tokens, temperature, top_p):
    chat_history.append((message, system_message))
    response = f"Received message: {message}"
    return chat_history, response

# Define the Gradio interface
def create_interface():
    with gr.Blocks() as demo:
...
                start_button = gr.Button("Start Scraping")
                stop_button = gr.Button("Stop Scraping")
                csv_output = gr.Textbox(label="CSV Output", interactive=False)
+                model_name_input = gr.Textbox(value="default_model", label="Model Name")
                gpu_layers_input = gr.Slider(minimum=0, maximum=8, value=2, step=1, label="GPU Layers")
            with gr.Column():
                chat_history = gr.Chatbot(label="Chat History")
                response_box = gr.Textbox(label="Response")

        # Connect buttons to their respective functions
        start_button.click(
+            fn=lambda storage, urls, interval, ctype: start_scraping(
+                storage, urls.split(", "), interval, ctype, db_config
+            ),
            inputs=[storage_location, urls, scrape_interval, content_type],
            outputs=csv_output
        )
...
        selected_url = gr.Textbox(label="Select URL for CSV Content")
        csv_button = gr.Button("Display CSV Content")
        csv_output = gr.Textbox(label="CSV Content Output", interactive=False)
        csv_button.click(display_csv, inputs=[selected_url], outputs=csv_output)

        # Add a button to display the RSS feed for a selected URL
...
        selected_url = gr.Textbox(label="Select URL for RSS Feed")
        rss_button = gr.Button("Generate RSS Feed")
        rss_output = gr.Textbox(label="RSS Feed Output", interactive=False)
+        rss_button.click(
+            generate_rss_feed,
+            inputs=[selected_url, gr.State(db_config)],
+            outputs=rss_output
+        )
|
269 |
return demo
|
270 |
|
271 |
+
# Initialize the database
|
272 |
+
initialize_database(db_config)
|
273 |
+
|
274 |
+
# Launch the Gradio interface
|
275 |
+
demo = create_interface()
|
276 |
+
demo.launch()
|