acecalisto3 commited on
Commit
34bc10c
1 Parent(s): 6bf8a84

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +269 -102
app.py CHANGED
@@ -4,99 +4,151 @@ import csv
4
  import time
5
  import hashlib
6
  import logging
 
 
7
  import gradio as gr
8
  from selenium import webdriver
9
  from selenium.webdriver.chrome.service import Service
10
  from selenium.webdriver.chrome.options import Options
 
11
  from webdriver_manager.chrome import ChromeDriverManager
12
  from huggingface_hub import InferenceClient
13
 
14
  # Configure logging
15
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 
 
 
 
 
 
16
 
17
  # Define constants
18
  PREFIX = "Task started at {date_time_str}. Purpose: {purpose}"
19
  TASK_PROMPT = "Current task: {task}. History:\n{history}"
20
 
21
- # Define current date/time
22
- date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
23
-
24
  # Define purpose
25
  purpose = """
26
- You go to Culvers sites, you continuously seek changes on them since your last observation.
27
- Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data.
28
  """
29
 
30
- # Define history
31
  history = []
32
-
33
- # Define current task
34
  current_task = None
 
 
35
 
36
- # Default file path
37
- default_file_path = "user/app/scraped_data/culver/culvers_changes.csv"
38
 
39
  # Ensure the directory exists
40
- os.makedirs(os.path.dirname(default_file_path), exist_ok=True)
41
 
42
- # Function to monitor URLs for changes
43
- def monitor_urls(storage_location, url1, url2, scrape_interval, content_type):
 
 
 
44
  global history
45
- urls = [url1, url2]
46
- previous_hashes = ["", ""]
47
-
48
- # Ensure the directory exists
49
- os.makedirs(os.path.dirname(storage_location), exist_ok=True)
50
-
51
- with open(storage_location, "w", newline='') as csvfile:
52
- csv_toolkit = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
53
- csv_toolkit.writeheader()
54
-
55
- options = Options()
56
- options.headless = True
57
- options.add_argument("--disable-gpu")
58
- options.add_argument("--no-sandbox")
59
- options.add_argument("--disable-dev-shm-usage")
60
-
61
- with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) as driver:
62
- try:
63
- while True:
64
- for i, url in enumerate(urls):
65
- try:
66
- driver.get(url)
67
- time.sleep(2) # Wait for the page to load
68
- if content_type == "text":
69
- current_content = driver.page_source
70
- elif content_type == "media":
71
- current_content = driver.find_elements_by_tag_name("img")
72
- else:
73
- current_content = driver.page_source
74
-
75
- current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
76
-
77
- if current_hash != previous_hashes[i]:
78
- previous_hashes[i] = current_hash
79
- date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
80
- history.append(f"Change detected at {url} on {date_time_str}")
81
- csv_toolkit.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
82
- logging.info(f"Change detected at {url} on {date_time_str}")
83
- except Exception as e:
84
- logging.error(f"Error accessing {url}: {e}")
85
 
86
- time.sleep(scrape_interval * 60) # Check every scrape_interval minutes
87
- except KeyboardInterrupt:
88
- logging.info("Monitoring stopped by user.")
89
- finally:
90
- driver.quit()
91
-
92
- # Define main function to handle user input
93
- def handle_input(storage_location, url1, url2, scrape_interval, content_type):
94
- global current_task, history
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
 
 
 
 
 
 
 
96
  current_task = f"Monitoring URLs: {url1}, {url2}"
97
  history.append(f"Task started: {current_task}")
98
- monitor_urls(storage_location, url1, url2, scrape_interval, content_type)
99
- return TASK_PROMPT.format(task=current_task, history="\n".join(history))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  # Define the chat response function
102
  client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
@@ -110,52 +162,167 @@ def respond(
110
  top_p,
111
  ):
112
  messages = [{"role": "system", "content": system_message}]
113
-
114
- for val in history:
115
- if val[0]:
116
- messages.append({"role": "user", "content": val[0]})
117
- if val[1]:
118
- messages.append({"role": "assistant", "content": val[1]})
119
-
120
  messages.append({"role": "user", "content": message})
121
-
122
  response = ""
123
-
124
- for message in client.chat_completion(
125
- messages,
126
- max_tokens=max_tokens,
127
- stream=True,
128
- temperature=temperature,
129
- top_p=top_p,
130
- ):
131
- token = message.choices[0].delta.content
132
-
133
- response += token
134
- yield response
 
 
 
135
 
136
  # Create Gradio interface
137
- demo = gr.ChatInterface(
138
- respond,
139
- additional_inputs=[
140
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
141
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
142
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
143
- gr.Slider(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  minimum=0.1,
145
  maximum=1.0,
146
  value=0.95,
147
  step=0.05,
148
- label="Top-p (nucleus sampling)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  ),
150
- gr.Textbox(value=default_file_path, label="Storage Location"),
151
- gr.Textbox(value="https://www.culver.k12.in.us/", label="URL 1"),
152
- gr.Textbox(value="https://www.facebook.com/CulverCommunitySchools", label="URL 2"),
153
- gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)"),
154
- gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type"),
155
- ],
156
- title="Culvers Site Monitor and Chatbot",
157
- description="Monitor changes on Culvers' websites and log them into a CSV file. Also, chat with a friendly chatbot."
158
- )
 
 
 
159
 
160
  if __name__ == "__main__":
161
  demo.launch()
 
4
  import time
5
  import hashlib
6
  import logging
7
+ import threading
8
+ from pathlib import Path
9
  import gradio as gr
10
  from selenium import webdriver
11
  from selenium.webdriver.chrome.service import Service
12
  from selenium.webdriver.chrome.options import Options
13
+ from selenium.webdriver.common.by import By
14
  from webdriver_manager.chrome import ChromeDriverManager
15
  from huggingface_hub import InferenceClient
16
 
17
  # Configure logging
18
+ logging.basicConfig(
19
+ level=logging.INFO,
20
+ format='%(asctime)s - %(levelname)s - %(message)s',
21
+ handlers=[
22
+ logging.FileHandler("monitoring.log"),
23
+ logging.StreamHandler()
24
+ ]
25
+ )
26
 
27
  # Define constants
28
  PREFIX = "Task started at {date_time_str}. Purpose: {purpose}"
29
  TASK_PROMPT = "Current task: {task}. History:\n{history}"
30
 
 
 
 
31
  # Define purpose
32
  purpose = """
33
+ You monitor Culvers sites continuously, seeking changes since your last observation.
34
+ Any new changes are logged and dumped into a CSV, stored in your log folder at user/app/scraped_data.
35
  """
36
 
37
+ # Initialize history and task variables
38
  history = []
 
 
39
  current_task = None
40
+ monitoring_thread = None
41
+ stop_event = threading.Event()
42
 
43
+ # Default file path using pathlib for cross-platform compatibility
44
+ default_file_path = Path("user/app/scraped_data/culver/culvers_changes.csv")
45
 
46
  # Ensure the directory exists
47
+ default_file_path.parent.mkdir(parents=True, exist_ok=True)
48
 
49
def monitor_urls(storage_location, urls, scrape_interval, content_type, stop_event):
    """
    Poll *urls* for content changes and append each detection to a CSV log.

    Designed to run in a background thread; loops until *stop_event* is set.

    Args:
        storage_location: Path (str or Path) of the CSV file to log changes to.
        urls: List of URLs to watch.
        scrape_interval: Minutes to wait between polling rounds.
        content_type: "text", "media", or "both" — which page content to hash.
            Any other value falls back to the full page source.
        stop_event: threading.Event used to request a prompt shutdown.
    """
    global history
    previous_hashes = [""] * len(urls)
    storage_path = Path(storage_location)

    # Write the CSV header only when creating a brand-new log file, so an
    # existing log is appended to rather than clobbered.
    if not storage_path.exists():
        with storage_path.open("w", newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
            writer.writeheader()

    options = Options()
    # Selenium 4 removed the Options.headless property; the flag must be
    # passed as a command-line argument instead.
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        while not stop_event.is_set():
            for i, url in enumerate(urls):
                try:
                    driver.get(url)
                    time.sleep(2)  # crude wait for the page to render
                    if content_type == "media":
                        current_content = _collect_image_sources(driver)
                    elif content_type == "both":
                        current_content = driver.page_source + _collect_image_sources(driver)
                    else:
                        # "text" and any unrecognized value hash the page source.
                        current_content = driver.page_source

                    current_hash = hashlib.md5(current_content.encode('utf-8')).hexdigest()

                    if current_hash != previous_hashes[i]:
                        previous_hashes[i] = current_hash
                        date_time = datetime.datetime.now()
                        date_time_str = date_time.strftime("%Y-%m-%d %H:%M:%S")
                        history_entry = f"Change detected at {url} on {date_time_str}"
                        history.append(history_entry)
                        with storage_path.open("a", newline='', encoding='utf-8') as csvfile:
                            writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
                            writer.writerow({
                                "date": date_time.strftime("%Y-%m-%d"),
                                "time": date_time.strftime("%H:%M:%S"),
                                "url": url,
                                "change": "Content changed"
                            })
                        logging.info(history_entry)
                except Exception as e:
                    logging.error(f"Error accessing {url}: {e}")
            # Sleep in one-second slices so stop_event can interrupt promptly
            # instead of blocking for the whole interval.
            for _ in range(scrape_interval * 60):
                if stop_event.is_set():
                    break
                time.sleep(1)
    except Exception as e:
        logging.error(f"Unexpected error in monitoring thread: {e}")
    finally:
        driver.quit()
        logging.info("Monitoring thread has been stopped.")


def _collect_image_sources(driver):
    """Concatenate the src attributes of all <img> tags on the current page.

    get_attribute() returns None for images without a src; joining None into
    a string raises TypeError, so missing values are skipped.
    """
    images = driver.find_elements(By.TAG_NAME, "img")
    return ''.join(img.get_attribute('src') or '' for img in images)
118
 
119
def start_monitoring(storage_location, url1, url2, scrape_interval, content_type):
    """
    Spawn the background thread that watches the two URLs for changes.

    Returns:
        A (status message, history list) pair for the Gradio UI.
    """
    global current_task, monitoring_thread, stop_event, history

    # Refuse to spawn a second monitor while one is still running.
    if monitoring_thread and monitoring_thread.is_alive():
        return "Monitoring is already running.", history

    history = []
    current_task = f"Monitoring URLs: {url1}, {url2}"
    history.append(f"Task started: {current_task}")
    logging.info(current_task)

    stop_event.clear()
    worker = threading.Thread(
        target=monitor_urls,
        args=(storage_location, [url1, url2], scrape_interval, content_type, stop_event),
        daemon=True
    )
    monitoring_thread = worker
    worker.start()
    return "Monitoring started.", history
139
+
140
def stop_monitoring():
    """
    Signal the monitoring thread to stop and wait for it to exit.

    Returns:
        A (status message, history list) pair for the Gradio UI.
    """
    global current_task, monitoring_thread, stop_event, history

    # Guard clause: nothing to do when no monitor thread is alive.
    if not (monitoring_thread and monitoring_thread.is_alive()):
        return "No monitoring task is currently running.", history

    stop_event.set()
    monitoring_thread.join()
    history.append("Monitoring stopped by user.")
    logging.info("Monitoring stopped by user.")
    current_task = None
    return "Monitoring stopped.", history
152
 
153
  # Define the chat response function
154
  client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
162
  top_p,
163
  ):
164
  messages = [{"role": "system", "content": system_message}]
165
+
166
+ for user_msg, assistant_msg in history:
167
+ if user_msg:
168
+ messages.append({"role": "user", "content": user_msg})
169
+ if assistant_msg:
170
+ messages.append({"role": "assistant", "content": assistant_msg})
171
+
172
  messages.append({"role": "user", "content": message})
173
+
174
  response = ""
175
+
176
+ try:
177
+ for msg in client.chat_completion(
178
+ messages,
179
+ max_tokens=max_tokens,
180
+ stream=True,
181
+ temperature=temperature,
182
+ top_p=top_p,
183
+ ):
184
+ token = msg.choices[0].delta.get("content", "")
185
+ response += token
186
+ yield response
187
+ except Exception as e:
188
+ logging.error(f"Error in chatbot response: {e}")
189
+ yield "An error occurred while generating the response."
190
 
191
# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Culvers Site Monitor and Chatbot")
    gr.Markdown(
        "Monitor changes on Culvers' websites and log them into a CSV file. "
        "Also, chat with a friendly chatbot."
    )

    with gr.Tab("Monitor"):
        with gr.Row():
            storage_location = gr.Textbox(
                value=str(default_file_path),
                label="Storage Location",
                placeholder="Path to CSV file where changes will be logged"
            )
        with gr.Row():
            url1 = gr.Textbox(
                value="https://www.culver.k12.in.us/",
                label="URL 1",
                placeholder="First URL to monitor"
            )
            url2 = gr.Textbox(
                value="https://www.facebook.com/CulverCommunitySchools",
                label="URL 2",
                placeholder="Second URL to monitor"
            )
        with gr.Row():
            scrape_interval = gr.Slider(
                minimum=1,
                maximum=60,
                value=5,
                step=1,
                label="Scrape Interval (minutes)"
            )
            content_type = gr.Radio(
                choices=["text", "media", "both"],
                value="text",
                label="Content Type"
            )
        with gr.Row():
            start_button = gr.Button("Start Monitoring")
            stop_button = gr.Button("Stop Monitoring")
        with gr.Row():
            monitoring_status = gr.Textbox(
                value="No active monitoring.",
                label="Monitoring Status",
                interactive=False
            )
        with gr.Row():
            monitoring_history = gr.Textbox(
                value="",
                label="Monitoring History",
                lines=10,
                interactive=False
            )

    with gr.Tab("Chatbot"):
        chatbot = gr.Chatbot(label="Chat with the Assistant")
        with gr.Row():
            system_message = gr.Textbox(
                value="You are a friendly Chatbot.",
                label="System Message",
                visible=False
            )
        with gr.Row():
            user_input = gr.Textbox(
                label="You:",
                placeholder="Type your message here..."
            )
            submit_button = gr.Button("Send")
        # Generation parameters
        max_tokens = gr.Slider(
            minimum=1,
            maximum=2048,
            value=512,
            step=1,
            label="Max new tokens"
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature"
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)"
        )

    # The monitor controls return (status string, history LIST); a Textbox
    # needs a string, so join the history before handing it to the UI.
    def _start(storage, u1, u2, interval, ctype):
        status, hist = start_monitoring(storage, u1, u2, interval, ctype)
        return status, "\n".join(hist)

    def _stop():
        status, hist = stop_monitoring()
        return status, "\n".join(hist)

    start_button.click(
        fn=_start,
        inputs=[storage_location, url1, url2, scrape_interval, content_type],
        outputs=[monitoring_status, monitoring_history],
        queue=False
    )

    stop_button.click(
        fn=_stop,
        inputs=None,
        outputs=[monitoring_status, monitoring_history],
        queue=False
    )

    # Stream the assistant's reply token-by-token into the chat window.
    # (The previous wiring returned a raw generator object as a component
    # value with duplicated outputs, which Gradio cannot render.)
    def _chat(message, chat_history, sys_msg, max_t, temp, tp):
        chat_history = chat_history + [(message, "")]
        for partial in respond(message, chat_history[:-1], sys_msg, max_t, temp, tp):
            chat_history[-1] = (message, partial)
            # Second output clears the input box once the reply starts.
            yield chat_history, ""

    user_input.submit(
        fn=_chat,
        inputs=[user_input, chatbot, system_message, max_tokens, temperature, top_p],
        outputs=[chatbot, user_input]
    )

    submit_button.click(
        fn=_chat,
        inputs=[user_input, chatbot, system_message, max_tokens, temperature, top_p],
        outputs=[chatbot, user_input]
    )

if __name__ == "__main__":
    demo.launch()