Space: acecalisto3 (status: runtime error)
Commit bf70dc8, parent a4c9236: "Update app.py"

app.py CHANGED
app.py (before the change; removed lines marked with -):

@@ -1,6 +1,9 @@
import time
import hashlib
-import logging
import datetime
import gradio as gr
import csv
@@ -18,7 +21,6 @@ from transformers import pipeline
import feedparser
from bs4 import BeautifulSoup
import threading
-import os

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -30,51 +32,65 @@ HISTORY = []
CURRENT_TASK = None
STOP_THREADS = False

-# Define
-
-
-

    try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

-
    global CURRENT_TASK, HISTORY, STOP_THREADS
-
    CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
    HISTORY.append(f"Task started: {CURRENT_TASK}")
-
    for url in urls:
        # Create a folder for the URL
        hostname = urlparse(url).hostname
        folder_path = os.path.join(storage_location, hostname)
        os.makedirs(folder_path, exist_ok=True)
-
        # Log the initial observation
        try:
            with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
@@ -92,63 +108,110 @@ def start_scraping(storage_location, urls, scrape_interval, content_type):
                file.write(f"Initial observation at {url}: {initial_hash}")
        except (NoSuchElementException, Exception) as e:
            HISTORY.append(f"Error accessing {url}: {e}")
-
        # Start a new thread for monitoring URLs
-        threading.Thread(target=monitor_urls, args=(storage_location, [url], scrape_interval, content_type, [STOP_THREADS])).start()
-
    return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."

# Define a function to stop scraping
def stop_scraping():
    global STOP_THREADS
    STOP_THREADS = True
    return "Scraping stopped."

-# Define a function to
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

def handle_message(message, chat_history, system_message, max_tokens, temperature, top_p):
-    # Process the message and update the chat history
    chat_history.append((message, system_message))
    response = f"Received message: {message}"
    return chat_history, response

-def generate_rss_feed(selected_url):
-    # Generate the RSS feed for the selected URL
-    # For this example, we'll just return a placeholder RSS feed
-    rss_feed = """
-    <?xml version="1.0" encoding="UTF-8"?>
-    <rss version="2.0">
-    <channel>
-    <title>Example RSS Feed</title>
-    <link>https://example.com</link>
-    <description>This is an example RSS feed.</description>
-    <item>
-    <title>Example Item</title>
-    <link>https://example.com/item</link>
-    <description>This is an example item.</description>
-    </item>
-    </channel>
-    </rss>
-    """
-    return rss_feed
-
# Define the Gradio interface
def create_interface():
    with gr.Blocks() as demo:
@@ -166,17 +229,17 @@ def create_interface():
                start_button = gr.Button("Start Scraping")
                stop_button = gr.Button("Stop Scraping")
                csv_output = gr.Textbox(label="CSV Output", interactive=False)
-
-                model_name_input = gr.Textbox(value="default_model", label="Model Name")
                gpu_layers_input = gr.Slider(minimum=0, maximum=8, value=2, step=1, label="GPU Layers")
-
            with gr.Column():
                chat_history = gr.Chatbot(label="Chat History")
                response_box = gr.Textbox(label="Response")

        # Connect buttons to their respective functions
        start_button.click(
-            fn=start_scraping
            inputs=[storage_location, urls, scrape_interval, content_type],
            outputs=csv_output
        )
@@ -190,7 +253,6 @@ def create_interface():
        selected_url = gr.Textbox(label="Select URL for CSV Content")
        csv_button = gr.Button("Display CSV Content")
        csv_output = gr.Textbox(label="CSV Content Output", interactive=False)
-
        csv_button.click(display_csv, inputs=[selected_url], outputs=csv_output)

        # Add a button to display the RSS feed for a selected URL
@@ -198,11 +260,17 @@ def create_interface():
        selected_url = gr.Textbox(label="Select URL for RSS Feed")
        rss_button = gr.Button("Generate RSS Feed")
        rss_output = gr.Textbox(label="RSS Feed Output", interactive=False)
-
-

    return demo

-
-
-
app.py (after the change; added lines marked with +):

+import mysql.connector
+from mysql.connector import errorcode
+import os
+import logging
import time
import hashlib
import datetime
import gradio as gr
import csv
...
import feedparser
from bs4 import BeautifulSoup
import threading

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
...
CURRENT_TASK = None
STOP_THREADS = False
+# Define database configuration
+db_config = {
+    'user': os.getenv('DB_USER'),
+    'password': os.getenv('DB_PASSWORD'),
+    'host': os.getenv('DB_HOST'),
+    'raise_on_warnings': True
+}

+# Define a function to initialize the database
+def initialize_database(config):
    try:
+        cnx = mysql.connector.connect(**config)
+        cursor = cnx.cursor()
+
+        # Create database if it doesn't exist
+        cursor.execute("CREATE DATABASE IF NOT EXISTS scraper_db")
+        cnx.database = 'scraper_db'
+
+        # Create tables
+        TABLES = {}
+        TABLES['scraped_data'] = (
+            "CREATE TABLE IF NOT EXISTS scraped_data ("
+            " id INT AUTO_INCREMENT PRIMARY KEY,"
+            " url VARCHAR(255) NOT NULL,"
+            " content_hash VARCHAR(64) NOT NULL,"
+            " change_detected DATETIME NOT NULL"
+            ") ENGINE=InnoDB"
+        )
+
+        for table_name in TABLES:
+            table_description = TABLES[table_name]
+            try:
+                cursor.execute(table_description)
+                logging.info(f"Table `{table_name}` created successfully.")
+            except mysql.connector.Error as err:
+                if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
+                    logging.warning(f"Table `{table_name}` already exists.")
+                else:
+                    logging.error(err.msg)
+
+        cursor.close()
+        cnx.close()
+        logging.info("Database initialization complete.")
+    except mysql.connector.Error as err:
+        logging.error(f"Database initialization failed: {err}")
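Note on the configuration above: db_config is read from the DB_USER, DB_PASSWORD and DB_HOST environment variables, and initialize_database(db_config) is invoked at module import (see the end of the file), so these variables have to be available before the Space starts. A minimal sketch of supplying them for a local run; the values here are placeholders, not part of the commit:

import os

# Placeholder credentials for local testing only; on Hugging Face Spaces these
# would normally be set as repository secrets rather than hard-coded.
os.environ.setdefault("DB_USER", "scraper")
os.environ.setdefault("DB_PASSWORD", "change-me")
os.environ.setdefault("DB_HOST", "127.0.0.1")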
+# Define a function to start scraping
+def start_scraping(storage_location, urls, scrape_interval, content_type, db_config):
    global CURRENT_TASK, HISTORY, STOP_THREADS
+
    CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
    HISTORY.append(f"Task started: {CURRENT_TASK}")
+
    for url in urls:
        # Create a folder for the URL
        hostname = urlparse(url).hostname
        folder_path = os.path.join(storage_location, hostname)
        os.makedirs(folder_path, exist_ok=True)
+
        # Log the initial observation
        try:
            with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
...
                file.write(f"Initial observation at {url}: {initial_hash}")
        except (NoSuchElementException, Exception) as e:
            HISTORY.append(f"Error accessing {url}: {e}")
+
        # Start a new thread for monitoring URLs
+        threading.Thread(target=monitor_urls, args=(storage_location, [url], scrape_interval, content_type, [STOP_THREADS], db_config)).start()
+
    return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
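One detail worth flagging in the driver setup: ChromeDriverManager is referenced as webdriver.ChromeDriverManager, but Selenium's webdriver module does not export that class; it normally comes from the separate webdriver_manager package, so this attribute lookup would fail at runtime. A hedged sketch of the conventional setup, assuming webdriver_manager is installed (on Selenium 4.6+ a plain webdriver.Chrome(options=options) also resolves the driver automatically):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--headless")  # headless Chrome is typical on a hosted Space

# Download/locate a matching chromedriver via webdriver_manager, then start Chrome.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.quit()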
+# Define a function to monitor URLs for changes
+def monitor_urls(storage_location, urls, scrape_interval, content_type, stop_scraping_flag, db_config):
+    global HISTORY
+    previous_hashes = {url: "" for url in urls}
+
+    try:
+        cnx = mysql.connector.connect(**db_config)
+        cursor = cnx.cursor()
+
+        with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
+            while not stop_scraping_flag[0]:
+                for url in urls:
+                    try:
+                        driver.get(url)
+                        time.sleep(2)  # Wait for the page to load
+                        if content_type == "text":
+                            current_content = driver.page_source
+                        elif content_type == "media":
+                            current_content = driver.find_elements(By.TAG_NAME, "img")
+                        else:
+                            current_content = driver.page_source
+                        current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
+
+                        if current_hash != previous_hashes[url]:
+                            previous_hashes[url] = current_hash
+                            date_time = datetime.datetime.now()
+                            HISTORY.append(f"Change detected at {url} on {date_time}")
+
+                            # Insert into MySQL
+                            add_change = ("INSERT INTO scraped_data "
+                                          "(url, content_hash, change_detected) "
+                                          "VALUES (%s, %s, %s)")
+                            data_change = (url, current_hash, date_time)
+                            cursor.execute(add_change, data_change)
+                            cnx.commit()
+
+                            logging.info(f"Change detected and logged for {url} at {date_time}")
+                    except (NoSuchElementException, Exception) as e:
+                        logging.error(f"Error accessing {url}: {e}")
+                time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
+    except Exception as e:
+        logging.error(f"Error in monitor_urls: {e}")
+    finally:
+        cursor.close()
+        cnx.close()
+

# Define a function to stop scraping
def stop_scraping():
    global STOP_THREADS
    STOP_THREADS = True
    return "Scraping stopped."
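A behavioural note on stopping: start_scraping passes [STOP_THREADS] to each monitor thread, i.e. a fresh one-element list holding the value of the flag at call time, so flipping the global in stop_scraping() is never seen by threads already looping on stop_scraping_flag[0]. A minimal sketch of a shared stop signal using threading.Event; the names are illustrative, not from the commit:

import threading
import time

stop_event = threading.Event()  # a single shared object, visible to every worker

def worker(stop_event, interval=0.1):
    # Loop until the shared event is set, then exit cleanly.
    while not stop_event.is_set():
        time.sleep(interval)

t = threading.Thread(target=worker, args=(stop_event,))
t.start()
stop_event.set()   # the equivalent of stop_scraping(): every worker observes it
t.join()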
+# Define a function to generate RSS feed
+def generate_rss_feed(selected_url, db_config):
+    try:
+        cnx = mysql.connector.connect(**db_config)
+        cursor = cnx.cursor(dictionary=True)
+
+        query = ("SELECT content_hash, change_detected FROM scraped_data "
+                 "WHERE url = %s ORDER BY change_detected DESC LIMIT 10")
+        cursor.execute(query, (selected_url,))
+
+        items = cursor.fetchall()
+
+        rss_items = ""
+        for item in items:
+            rss_items += f"""
+            <item>
+                <title>Change Detected</title>
+                <link>{selected_url}</link>
+                <description>Change detected on {item['change_detected'].strftime('%Y-%m-%d %H:%M:%S')}</description>
+                <pubDate>{item['change_detected'].strftime('%a, %d %b %Y %H:%M:%S +0000')}</pubDate>
+            </item>
+            """
+
+        rss_feed = f"""<?xml version="1.0" encoding="UTF-8"?>
+        <rss version="2.0">
+        <channel>
+            <title>RSS Feed for {selected_url}</title>
+            <link>{selected_url}</link>
+            <description>Latest changes detected on {selected_url}.</description>
+            {rss_items}
+        </channel>
+        </rss>"""
+
+        cursor.close()
+        cnx.close()
+        return rss_feed
+    except mysql.connector.Error as err:
+        logging.error(f"Error generating RSS feed: {err}")
+        return "Failed to generate RSS feed."
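Because selected_url and the timestamps are interpolated directly into the feed, a URL containing &, < or > would make the XML invalid. A small sketch of escaping interpolated values with the standard library; applying it is a suggestion, not something this commit does:

from xml.sax.saxutils import escape

def xml_text(value):
    # Escape &, < and > so interpolated URLs and text stay well-formed XML.
    return escape(str(value))

print(xml_text("https://example.com/?a=1&b=2"))  # https://example.com/?a=1&amp;b=2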
+# Define a function to handle messages
def handle_message(message, chat_history, system_message, max_tokens, temperature, top_p):
    chat_history.append((message, system_message))
    response = f"Received message: {message}"
    return chat_history, response

# Define the Gradio interface
def create_interface():
    with gr.Blocks() as demo:
...
                start_button = gr.Button("Start Scraping")
                stop_button = gr.Button("Stop Scraping")
                csv_output = gr.Textbox(label="CSV Output", interactive=False)
+                model_name_input = gr.Textbox(value="default_model", label="Model Name")
                gpu_layers_input = gr.Slider(minimum=0, maximum=8, value=2, step=1, label="GPU Layers")
            with gr.Column():
                chat_history = gr.Chatbot(label="Chat History")
                response_box = gr.Textbox(label="Response")

        # Connect buttons to their respective functions
        start_button.click(
+            fn=lambda storage, urls, interval, ctype: start_scraping(
+                storage, urls.split(", "), interval, ctype, db_config
+            ),
            inputs=[storage_location, urls, scrape_interval, content_type],
            outputs=csv_output
        )
...
        selected_url = gr.Textbox(label="Select URL for CSV Content")
        csv_button = gr.Button("Display CSV Content")
        csv_output = gr.Textbox(label="CSV Content Output", interactive=False)
        csv_button.click(display_csv, inputs=[selected_url], outputs=csv_output)

        # Add a button to display the RSS feed for a selected URL
...
        selected_url = gr.Textbox(label="Select URL for RSS Feed")
        rss_button = gr.Button("Generate RSS Feed")
        rss_output = gr.Textbox(label="RSS Feed Output", interactive=False)
+        rss_button.click(
+            generate_rss_feed,
+            inputs=[selected_url, gr.State(db_config)],
+            outputs=rss_output
+        )
|
269 |
return demo
|
270 |
|
271 |
+
# Initialize the database
|
272 |
+
initialize_database(db_config)
|
273 |
+
|
274 |
+
# Launch the Gradio interface
|
275 |
+
demo = create_interface()
|
276 |
+
demo.launch()
|