acecalisto3 committed on
Commit
2bed3a1
1 Parent(s): bcc7de3

Delete app.py

Files changed (1)
  1. app.py +0 -1569
app.py DELETED
@@ -1,1569 +0,0 @@
- import datetime
- import os
- import csv
- import time
- import hashlib
- import logging
- from collections import defaultdict
- import mysql.connector
- import threading
- from urllib.parse import urlparse
- import gradio as gr
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.chrome.options import Options
- from webdriver_manager.chrome import ChromeDriverManager
- from huggingface_hub import InferenceClient, login
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
- import random
- import yaml
- import torch
- import pandas as pd
- import xml.etree.ElementTree as ET
- import re
- import spacy
- import unittest
- from dotenv import load_dotenv
- import nltk
-
- # Initialize NLTK resources (you may need to download these)
- nltk.download('punkt')
- nltk.download('averaged_perceptron_tagger')
- nltk.download('maxent_ne_chunker')
- nltk.download('words')
-
- # Load spaCy model
- nlp = spacy.load("en_core_web_sm")
-
- # Dictionary to store model loading functions
- model_loaders = {
-     "Falcon": lambda: load_model("tiiuae/falcon-7b"),
-     "Flan-T5": lambda: load_model("google/flan-t5-xl"),
-     "Flan-T5-Small": lambda: load_model("google/flan-t5-small")  # Add a smaller model
- }
-
- # Load environment variables from .env file
- load_dotenv()
-
- HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
- if not HUGGINGFACE_TOKEN:
-     raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")
-
- login(token=HUGGINGFACE_TOKEN, add_to_git_credential=True)
-
- # Configure logging
- logging.basicConfig(
-     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
- )
-
- # Define constants
- DEFAULT_FILE_PATH = "scraped_data"
- PURPOSE = (
-     "You monitor urls. You log what you observe. You seek any changes on them since your last observation. "
-     "Anything new gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
- )
-
- # Global variables for task management
- HISTORY = []
- CURRENT_TASK = None
- STOP_THREADS = False  # Flag to stop scraping threads
-
- # Database Pooling Configuration
- DB_POOL_NAME = "mypool"
- DB_POOL_SIZE = 5  # Adjust based on expected load
-
- try:
-     dbconfig = {
-         "host": os.getenv("DB_HOST"),
-         "user": os.getenv("DB_USER"),
-         "password": os.getenv("DB_PASSWORD"),
-         "database": os.getenv("DB_NAME"),
-     }
-     connection_pool = mysql.connector.pooling.MySQLConnectionPool(
-         pool_name=DB_POOL_NAME,
-         pool_size=DB_POOL_SIZE,
-         pool_reset_session=True,
-         **dbconfig
-     )
-     logging.info("Database connection pool created successfully.")
- except mysql.connector.Error as err:
-     logging.warning(f"Database connection pool creation failed: {err}")
-     connection_pool = None  # Will use CSV as fallback
-
- # Function to get a database connection from the pool
- def get_db_connection():
-     """
-     Retrieves a connection from the pool. Returns None if pool is not available.
-     """
-     if connection_pool:
-         try:
-             connection = connection_pool.get_connection()
-             if connection.is_connected():
-                 return connection
-         except mysql.connector.Error as err:
-             logging.error(f"Error getting connection from pool: {err}")
-     return None
-
- # Initialize Database: Create tables and indexes
- def initialize_database():
-     """
-     Initializes the database by creating necessary tables and indexes if they do not exist.
-     """
-     connection = get_db_connection()
-     if connection is None:
-         logging.info("Database initialization skipped. Using CSV storage.")
-         return
-
-     cursor = connection.cursor()
-     try:
-         # Create table for scraped data
-         create_scraped_data_table = """
-         CREATE TABLE IF NOT EXISTS scraped_data (
-             id INT AUTO_INCREMENT PRIMARY KEY,
-             url VARCHAR(255) NOT NULL,
-             content_hash VARCHAR(64) NOT NULL,
-             change_detected DATETIME NOT NULL
-         )
-         """
-         cursor.execute(create_scraped_data_table)
-         logging.info("Table 'scraped_data' is ready.")
-
-         # Create indexes for performance
-         create_index_url = "CREATE INDEX IF NOT EXISTS idx_url ON scraped_data(url)"
-         create_index_change = "CREATE INDEX IF NOT EXISTS idx_change_detected ON scraped_data(change_detected)"
-         cursor.execute(create_index_url)
-         cursor.execute(create_index_change)
-         logging.info("Indexes on 'url' and 'change_detected' columns created.")
-
-         # Create table for action logs
-         create_action_logs_table = """
-         CREATE TABLE IF NOT EXISTS action_logs (
-             id INT AUTO_INCREMENT PRIMARY KEY,
-             action VARCHAR(255) NOT NULL,
-             timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
-         )
-         """
-         cursor.execute(create_action_logs_table)
-         logging.info("Table 'action_logs' is ready.")
-
-     except mysql.connector.Error as err:
-         logging.error(f"Error initializing database: {err}")
-     finally:
-         cursor.close()
-         connection.close()
-         logging.info("Database initialization complete.")
-
- # Function to create WebDriver
- def create_driver(options: Options) -> webdriver.Chrome:
-     """
-     Initializes and returns a Selenium Chrome WebDriver instance.
-     """
-     try:
-         driver = webdriver.Chrome(
-             service=Service(ChromeDriverManager().install()), options=options
-         )
-         logging.info("ChromeDriver initialized successfully.")
-         return driver
-     except Exception as exception:
-         logging.error(f"Error initializing ChromeDriver: {exception}")
-         return None
-
- # Function to log changes to CSV
- def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
-     """
-     Logs the change to a CSV file in the storage_location.
-     """
-     try:
-         os.makedirs(storage_location, exist_ok=True)
-         csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
-         file_exists = os.path.isfile(csv_file_path)
-
-         with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
-             fieldnames = ["date", "time", "url", "content_hash", "change"]
-             writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-             if not file_exists:
-                 writer.writeheader()
-             writer.writerow(
-                 {
-                     "date": change_detected.split()[0],
-                     "time": change_detected.split()[1],
-                     "url": url,
-                     "content_hash": content_hash,
-                     "change": "Content changed",
-                 }
-             )
-         logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
-     except Exception as e:
-         logging.error(f"Error logging data to CSV: {e}")
-
- # Function to get initial observation
- def get_initial_observation(
-     driver: webdriver.Chrome, url: str, content_type: str, selector: str = None
- ) -> str:
-     """
-     Retrieves the initial content from the URL and returns its MD5 hash.
-     """
-     try:
-         driver.get(url)
-         WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
-         time.sleep(2)  # Additional wait for dynamic content
-
-         if content_type == "text":
-             initial_content = driver.page_source
-         elif content_type == "media":
-             if selector:
-                 try:
-                     elements = WebDriverWait(driver, 5).until(
-                         EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
-                     )
-                     initial_content = [element.get_attribute("src") for element in elements]
-                 except TimeoutException:
-                     logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
-                     initial_content = []
-             else:
-                 elements = driver.find_elements(By.TAG_NAME, "img")
-                 initial_content = [element.get_attribute("src") for element in elements]
-         else:
-             initial_content = driver.page_source
-
-         initial_hash = hashlib.md5(str(initial_content).encode("utf-8")).hexdigest()
-         logging.info(f"Initial hash for {url}: {initial_hash}")
-         return initial_hash
-     except Exception as exception:
-         logging.error(f"Error accessing {url}: {exception}")
-         return None
-
- # Function to monitor URLs for changes
- def monitor_urls(
-     storage_location: str,
-     urls: list,
-     scrape_interval: int,
-     content_type: str,
-     selector: str = None,
-     progress: gr.Progress = None
- ):
-     """
-     Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
-     """
-     global HISTORY, STOP_THREADS
-     previous_hashes = {url: "" for url in urls}
-
-     options = Options()
-     options.add_argument("--headless")
-     options.add_argument("--no-sandbox")
-     options.add_argument("--disable-dev-shm-usage")
-
-     driver = create_driver(options)
-     if driver is None:
-         logging.error("WebDriver could not be initialized. Exiting monitor.")
-         return
-
-     try:
-         while not STOP_THREADS:
-             for url in urls:
-                 if STOP_THREADS:
-                     break
-                 try:
-                     driver.get(url)
-                     WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
-                     time.sleep(2)  # Additional wait for dynamic content
-
-                     if content_type == "text":
-                         current_content = driver.page_source
-                     elif content_type == "media":
-                         if selector:
-                             try:
-                                 elements = WebDriverWait(driver, 5).until(
-                                     EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
-                                 )
-                                 current_content = [element.get_attribute("src") for element in elements]
-                             except TimeoutException:
-                                 logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
-                                 current_content = []
-                         else:
-                             elements = driver.find_elements(By.TAG_NAME, "img")
-                             current_content = [element.get_attribute("src") for element in elements]
-                     else:
-                         current_content = driver.page_source
-
-                     current_hash = hashlib.md5(str(current_content).encode("utf-8")).hexdigest()
-                     if current_hash != previous_hashes[url]:
-                         previous_hashes[url] = current_hash
-                         date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                         HISTORY.append(f"Change detected at {url} on {date_time_str}")
-
-                         # Attempt to log to database
-                         connection = get_db_connection()
-                         if connection:
-                             try:
-                                 cursor = connection.cursor()
-                                 insert_query = """
-                                 INSERT INTO scraped_data (url, content_hash, change_detected)
-                                 VALUES (%s, %s, %s)
-                                 """
-                                 cursor.execute(insert_query, (url, current_hash, date_time_str))
-                                 connection.commit()
-                                 logging.info(f"Change detected at {url} on {date_time_str} and logged to database.")
-                             except mysql.connector.Error as err:
-                                 logging.error(f"Error inserting data into database: {err}")
-                                 # Fallback to CSV
-                                 log_to_csv(storage_location, url, current_hash, date_time_str)
-                             finally:
-                                 cursor.close()
-                                 connection.close()
-                         else:
-                             # Fallback to CSV
-                             log_to_csv(storage_location, url, current_hash, date_time_str)
-
-                     # Update progress
-                     if progress:
-                         progress(1)
-                 except (
-                     NoSuchElementException,
-                     StaleElementReferenceException,
-                     TimeoutException,
-                     Exception,
-                 ) as e:
-                     logging.error(f"Error accessing {url}: {e}")
-                     if progress:
-                         progress(1)
-             time.sleep(scrape_interval * 60)  # Wait for the next scrape interval
-     finally:
-         driver.quit()
-         logging.info("ChromeDriver session ended.")
-
- # Function to start scraping
- def start_scraping(
-     storage_location: str,
-     urls: str,
-     scrape_interval: int,
-     content_type: str,
-     selector: str = None,
-     progress: gr.Progress = None
- ) -> str:
-     """
-     Starts the scraping process in a separate thread with progress indication.
-     """
-     global CURRENT_TASK, HISTORY, STOP_THREADS
-
-     if STOP_THREADS:
-         STOP_THREADS = False  # Reset the flag if previously stopped
-
-     url_list = [url.strip() for url in urls.split(",") if url.strip()]
-     CURRENT_TASK = f"Monitoring URLs: {', '.join(url_list)}"
-     HISTORY.append(f"Task started: {CURRENT_TASK}")
-     logging.info(f"Task started: {CURRENT_TASK}")
-
-     # Initialize database tables
-     initialize_database()
-
-     # Log initial observations
-     def log_initial_observations():
-         options = Options()
-         options.add_argument("--headless")
-         options.add_argument("--no-sandbox")
-         options.add_argument("--disable-dev-shm-usage")
-
-         driver = create_driver(options)
-         if driver is None:
-             return
-
-         for url in url_list:
-             if STOP_THREADS:
-                 break
-             try:
-                 initial_hash = get_initial_observation(driver, url, content_type, selector)
-                 if initial_hash:
-                     date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                     HISTORY.append(f"Initial observation at {url}: {initial_hash}")
-
-                     # Attempt to log to database
-                     connection = get_db_connection()
-                     if connection:
-                         try:
-                             cursor = connection.cursor()
-                             insert_query = """
-                             INSERT INTO scraped_data (url, content_hash, change_detected)
-                             VALUES (%s, %s, %s)
-                             """
-                             cursor.execute(insert_query, (url, initial_hash, date_time_str))
-                             connection.commit()
-                             logging.info(f"Initial observation logged for {url} in database.")
-                         except mysql.connector.Error as err:
-                             logging.error(f"Error inserting initial observation into database: {err}")
-                             # Fallback to CSV
-                             log_to_csv(storage_location, url, initial_hash, date_time_str)
-                         finally:
-                             cursor.close()
-                             connection.close()
-                     else:
-                         # Fallback to CSV
-                         log_to_csv(storage_location, url, initial_hash, date_time_str)
-             except Exception as e:
-                 HISTORY.append(f"Error accessing {url}: {e}")
-                 logging.error(f"Error accessing {url}: {e}")
-         driver.quit()
-
-     # Start logging initial observations
-     initial_thread = threading.Thread(target=log_initial_observations, daemon=True)
-     initial_thread.start()
-
-     # Start the monitoring thread with progress
-     monitor_thread = threading.Thread(
-         target=monitor_urls,
-         args=(storage_location, url_list, scrape_interval, content_type, selector, progress),
-         daemon=True,
-     )
-     monitor_thread.start()
-     logging.info("Started scraping thread.")
-     return f"Started scraping {', '.join(url_list)} every {scrape_interval} minutes."
-
723
- # Function to stop scraping
724
- def stop_scraping() -> str:
725
- """
726
- Stops all ongoing scraping threads.
727
- """
728
- global STOP_THREADS
729
- STOP_THREADS = True
730
- HISTORY.append("Scraping stopped by user.")
731
- logging.info("Scraping stop signal sent.")
732
- return "Scraping has been stopped."
733
-
734
- # Function to display CSV content from MySQL or CSV
735
- def display_csv(storage_location: str, url: str) -> str:
736
- """
737
- Fetches and returns the scraped data for a given URL from the MySQL database or CSV.
738
- """
739
- try:
740
- connection = get_db_connection()
741
- if connection:
742
- try:
743
- cursor = connection.cursor(dictionary=True)
744
- query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
745
- cursor.execute(query, (url,))
746
- results = cursor.fetchall()
747
-
748
- if not results:
749
- return "No data available for the selected URL."
750
-
751
- df = pd.DataFrame(results)
752
- cursor.close()
753
- connection.close()
754
- return df.to_string(index=False)
755
- except mysql.connector.Error as err:
756
- logging.error(f"Error fetching data from database: {err}")
757
- # Fallback to CSV
758
- else:
759
- logging.info("No database connection. Fetching data from CSV.")
760
-
761
- # Fallback to CSV
762
- hostname = urlparse(url).hostname
763
- csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
764
- if os.path.exists(csv_path):
765
- df = pd.read_csv(csv_path)
766
- return df.to_string(index=False)
767
- else:
768
- return "No data available."
769
-
770
- except Exception as e:
771
- logging.error(f"Error fetching data for {url}: {e}")
772
- return f"Error fetching data for {url}: {e}"
773
-
774
- # Function to generate RSS feed from MySQL or CSV data
775
- def generate_rss_feed(storage_location: str, url: str) -> str:
776
- """
777
- Generates an RSS feed for the latest changes detected on a given URL from the MySQL database or CSV.
778
- """
779
- try:
780
- connection = get_db_connection()
781
- rss_feed = ""
782
-
783
- if connection:
784
- try:
785
- cursor = connection.cursor(dictionary=True)
786
- query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC LIMIT 10"
787
- cursor.execute(query, (url,))
788
- results = cursor.fetchall()
789
-
790
- if not results:
791
- return "No changes detected to include in RSS feed."
792
-
793
- # Create the root RSS element
794
- rss = ET.Element("rss", version="2.0")
795
- channel = ET.SubElement(rss, "channel")
796
-
797
- # Add channel elements
798
- title = ET.SubElement(channel, "title")
799
- title.text = f"RSS Feed for {urlparse(url).hostname}"
800
-
801
- link = ET.SubElement(channel, "link")
802
- link.text = url
803
-
804
- description = ET.SubElement(channel, "description")
805
- description.text = "Recent changes detected on the website."
806
-
807
- # Add items to the feed
808
- for row in results:
809
- item = ET.SubElement(channel, "item")
810
-
811
- item_title = ET.SubElement(item, "title")
812
- item_title.text = f"Change detected at {row['url']}"
813
-
814
- item_link = ET.SubElement(item, "link")
815
- item_link.text = row["url"]
816
-
817
- item_description = ET.SubElement(item, "description")
818
- item_description.text = f"Content changed on {row['change_detected']}"
819
-
820
- pub_date = ET.SubElement(item, "pubDate")
821
- pub_date.text = datetime.datetime.strptime(
822
- str(row['change_detected']), "%Y-%m-%d %H:%M:%S"
823
- ).strftime("%a, %d %b %Y %H:%M:%S +0000")
824
-
825
- # Generate the XML string
826
- rss_feed = ET.tostring(rss, encoding="utf-8", method="xml").decode("utf-8")
827
- cursor.close()
828
- connection.close()
829
- return rss_feed
830
- except mysql.connector.Error as err:
831
- logging.error(f"Error fetching data from database: {err}")
832
- # Fallback to CSV
833
- else:
834
- logging.info("No database connection. Generating RSS feed from CSV.")
835
-
836
- # Fallback to CSV
837
- hostname = urlparse(url).hostname
838
- csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
839
- if os.path.exists(csv_path):
840
- df = pd.read_csv(csv_path).tail(10)
841
- if df.empty:
842
- return "No changes detected to include in RSS feed."
843
-
844
- # Create the root RSS element
845
- rss = ET.Element("rss", version="2.0")
846
- channel = ET.SubElement(rss, "channel")
847
-
848
- # Add channel elements
849
- title = ET.SubElement(channel, "title")
850
- title.text = f"RSS Feed for {hostname}"
851
-
852
- link = ET.SubElement(channel, "link")
853
- link.text = url
854
-
855
- description = ET.SubElement(channel, "description")
856
- description.text = "Recent changes detected on the website."
857
-
858
- # Add items to the feed
859
- for _, row in df.iterrows():
860
- item = ET.SubElement(channel, "item")
861
-
862
- item_title = ET.SubElement(item, "title")
863
- item_title.text = f"Change detected at {row['url']}"
864
-
865
- item_link = ET.SubElement(item, "link")
866
- item_link.text = row["url"]
867
-
868
- item_description = ET.SubElement(item, "description")
869
- item_description.text = f"Content changed on {row['date']} at {row['time']}"
870
-
871
- pub_date = ET.SubElement(item, "pubDate")
872
- pub_date.text = datetime.datetime.strptime(
873
- f"{row['date']} {row['time']}", "%Y-%m-%d %H:%M:%S"
874
- ).strftime("%a, %d %b %Y %H:%M:%S +0000")
875
-
876
- # Generate the XML string
877
- rss_feed = ET.tostring(rss, encoding="utf-8", method="xml").decode("utf-8")
878
- return rss_feed
879
- else:
880
- return "No data available."
881
-
882
- except Exception as e:
883
- logging.error(f"Error generating RSS feed for {url}: {e}")
884
- return f"Error generating RSS feed for {url}: {e}"
885
-
886
- # Function to parse user commands using spaCy
887
- def parse_command(message: str) -> tuple:
888
- """
889
- Parses the user message using spaCy to identify if it contains a command.
890
- Returns the command and its parameters if found, else (None, None).
891
- """
892
- doc = nlp(message.lower())
893
- command = None
894
- params = {}
895
-
896
- # Define command patterns
897
- if "filter" in message.lower():
898
- # Example: "Filter apples, oranges in column Description"
899
- match = re.search(r"filter\s+([\w\s,]+)\s+in\s+column\s+(\w+)", message, re.IGNORECASE)
900
- if match:
901
- words = [word.strip() for word in match.group(1).split(",")]
902
- column = match.group(2)
903
- command = "filter"
904
- params = {"words": words, "column": column}
905
-
906
- elif "sort" in message.lower():
907
- # Example: "Sort Price ascending"
908
- match = re.search(r"sort\s+(\w+)\s+(ascending|descending)", message, re.IGNORECASE)
909
- if match:
910
- column = match.group(1)
911
- order = match.group(2)
912
- command = "sort"
913
- params = {"column": column, "order": order}
914
-
915
- elif "export to csv as" in message.lower():
916
- # Example: "Export to CSV as filtered_data.csv"
917
- match = re.search(r"export\s+to\s+csv\s+as\s+([\w\-]+\.csv)", message, re.IGNORECASE)
918
- if match:
919
- filename = match.group(1)
920
- command = "export"
921
- params = {"filename": filename}
922
-
923
- elif "log action" in message.lower():
924
- # Example: "Log action Filtered data for specific fruits"
925
- match = re.search(r"log\s+action\s+(.+)", message, re.IGNORECASE)
926
- if match:
927
- action = match.group(1)
928
- command = "log"
929
- params = {"action": action}
930
-
931
- return command, params
932
-
933
- # Function to execute parsed commands
934
- def execute_command(command: str, params: dict) -> str:
935
- """
936
- Executes the corresponding function based on the command and parameters.
937
- """
938
- if command == "filter":
939
- words = params["words"]
940
- column = params["column"]
941
- return filter_data(column, words)
942
- elif command == "sort":
943
- column = params["column"]
944
- order = params["order"]
945
- return sort_data(column, order)
946
- elif command == "export":
947
- filename = params["filename"]
948
- return export_csv(filename)
949
- elif command == "log":
950
- action = params["action"]
951
- return log_action(action)
952
- else:
953
- return "Unknown command."
954
-
955
- # Data Manipulation Functions
956
- def filter_data(column: str, words: list) -> str:
957
- """
958
- Filters the scraped data to include only rows where the specified column contains the given words.
959
- Saves the filtered data to a new CSV file.
960
- """
961
- try:
962
- storage_location = DEFAULT_FILE_PATH
963
-
964
- connection = get_db_connection()
965
- if connection:
966
- try:
967
- cursor = connection.cursor(dictionary=True)
968
- # Fetch all data
969
- query = "SELECT * FROM scraped_data"
970
- cursor.execute(query)
971
- results = cursor.fetchall()
972
-
973
- if not results:
974
- return "No data available to filter."
975
-
976
- df = pd.DataFrame(results)
977
- # Create a regex pattern to match any of the words
978
- pattern = '|'.join(words)
979
- if column not in df.columns:
980
- return f"Column '{column}' does not exist in the data."
981
-
982
- filtered_df = df[df[column].astype(str).str.contains(pattern, case=False, na=False)]
983
-
984
- if filtered_df.empty:
985
- return f"No records found with words {words} in column '{column}'."
986
-
987
- # Save the filtered data to a new CSV
988
- timestamp = int(time.time())
989
- filtered_csv = os.path.join(storage_location, f"filtered_data_{timestamp}.csv")
990
- filtered_df.to_csv(filtered_csv, index=False)
991
- logging.info(f"Data filtered on column '{column}' for words {words}.")
992
- return f"Data filtered and saved to {filtered_csv}."
993
- except mysql.connector.Error as err:
994
- logging.error(f"Error fetching data from database: {err}")
995
- # Fallback to CSV
996
- else:
997
- logging.info("No database connection. Filtering data from CSV.")
998
-
999
- # Fallback to CSV
1000
- csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
1001
- if not csv_files:
1002
- return "No CSV files found to filter."
1003
-
1004
- # Assume the latest CSV is the target
1005
- latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
1006
- df = pd.read_csv(latest_csv)
1007
-
1008
- if column not in df.columns:
1009
- return f"Column '{column}' does not exist in the data."
1010
-
1011
- filtered_df = df[df[column].astype(str).str.contains('|'.join(words), case=False, na=False)]
1012
-
1013
- if filtered_df.empty:
1014
- return f"No records found with words {words} in column '{column}'."
1015
-
1016
- # Save the filtered data to a new CSV
1017
- timestamp = int(time.time())
1018
- filtered_csv = latest_csv.replace(".csv", f"_filtered_{timestamp}.csv")
1019
- filtered_df.to_csv(filtered_csv, index=False)
1020
- logging.info(f"Data filtered on column '{column}' for words {words}.")
1021
- return f"Data filtered and saved to {filtered_csv}."
1022
- except Exception as e:
1023
- logging.error(f"Error filtering data: {e}")
1024
- return f"Error filtering data: {e}"
1025
-
1026
- def sort_data(column: str, order: str) -> str:
1027
- """
1028
- Sorts the scraped data based on the specified column and order.
1029
- Saves the sorted data to a new CSV file.
1030
- """
1031
- try:
1032
- storage_location = DEFAULT_FILE_PATH
1033
-
1034
- connection = get_db_connection()
1035
- if connection:
1036
- try:
1037
- cursor = connection.cursor(dictionary=True)
1038
- # Fetch all data
1039
- query = "SELECT * FROM scraped_data"
1040
- cursor.execute(query)
1041
- results = cursor.fetchall()
1042
-
1043
- if not results:
1044
- return "No data available to sort."
1045
-
1046
- df = pd.DataFrame(results)
1047
- if column not in df.columns:
1048
- return f"Column '{column}' does not exist in the data."
1049
-
1050
- ascending = True if order.lower() == "ascending" else False
1051
- sorted_df = df.sort_values(by=column, ascending=ascending)
1052
-
1053
- # Save the sorted data to a new CSV
1054
- timestamp = int(time.time())
1055
- sorted_csv = os.path.join(storage_location, f"sorted_data_{column}_{order.lower()}_{timestamp}.csv")
1056
- sorted_df.to_csv(sorted_csv, index=False)
1057
- logging.info(f"Data sorted on column '{column}' in {order} order.")
1058
- return f"Data sorted and saved to {sorted_csv}."
1059
- except mysql.connector.Error as err:
1060
- logging.error(f"Error fetching data from database: {err}")
1061
- # Fallback to CSV
1062
- else:
1063
- logging.info("No database connection. Sorting data from CSV.")
1064
-
1065
- # Fallback to CSV
1066
- csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
1067
- if not csv_files:
1068
- return "No CSV files found to sort."
1069
-
1070
- # Assume the latest CSV is the target
1071
- latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
1072
- df = pd.read_csv(latest_csv)
1073
-
1074
- if column not in df.columns:
1075
- return f"Column '{column}' does not exist in the data."
1076
-
1077
- ascending = True if order.lower() == "ascending" else False
1078
- sorted_df = df.sort_values(by=column, ascending=ascending)
1079
-
1080
- # Save the sorted data to a new CSV
1081
- timestamp = int(time.time())
1082
- sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}_{timestamp}.csv")
1083
- sorted_df.to_csv(sorted_csv, index=False)
1084
- logging.info(f"Data sorted on column '{column}' in {order} order.")
1085
- return f"Data sorted and saved to {sorted_csv}."
1086
- except Exception as e:
1087
- logging.error(f"Error sorting data: {e}")
1088
- return f"Error sorting data: {e}"
1089
-
1090
- def export_csv(filename: str) -> str:
1091
- """
1092
- Exports the latest scraped data to a specified CSV filename.
1093
- """
1094
- try:
1095
- storage_location = DEFAULT_FILE_PATH
1096
-
1097
- connection = get_db_connection()
1098
- if connection:
1099
- try:
1100
- cursor = connection.cursor(dictionary=True)
1101
- # Fetch all data
1102
- query = "SELECT * FROM scraped_data"
1103
- cursor.execute(query)
1104
- results = cursor.fetchall()
1105
-
1106
- if not results:
1107
- return "No data available to export."
1108
-
1109
- df = pd.DataFrame(results)
1110
- export_path = os.path.join(storage_location, filename)
1111
- df.to_csv(export_path, index=False)
1112
- logging.info(f"Data exported to {export_path}.")
1113
- return f"Data exported to {export_path}."
1114
- except mysql.connector.Error as err:
1115
- logging.error(f"Error exporting data from database: {err}")
1116
- # Fallback to CSV
1117
- else:
1118
- logging.info("No database connection. Exporting data from CSV.")
1119
-
1120
- # Fallback to CSV
1121
- csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
1122
- if not csv_files:
1123
- return "No CSV files found to export."
1124
-
1125
- # Assume the latest CSV is the target
1126
- latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
1127
- df = pd.read_csv(latest_csv)
1128
- export_path = os.path.join(storage_location, filename)
1129
- df.to_csv(export_path, index=False)
1130
- logging.info(f"Data exported to {export_path}.")
1131
- return f"Data exported to {export_path}."
1132
- except Exception as e:
1133
- logging.error(f"Error exporting CSV: {e}")
1134
- return f"Error exporting CSV: {e}"
1135
-
1136
- def log_action(action: str) -> str:
1137
- """
1138
- Logs a custom action message to the MySQL database or CSV.
1139
- """
1140
- try:
1141
- connection = get_db_connection()
1142
- if connection:
1143
- try:
1144
- cursor = connection.cursor()
1145
- insert_query = """
1146
- INSERT INTO action_logs (action)
1147
- VALUES (%s)
1148
- """
1149
- cursor.execute(insert_query, (action,))
1150
- connection.commit()
1151
- logging.info(f"Action logged in database: {action}")
1152
- cursor.close()
1153
- connection.close()
1154
- return f"Action logged: {action}"
1155
- except mysql.connector.Error as err:
1156
- logging.error(f"Error logging action to database: {err}")
1157
- # Fallback to CSV
1158
- else:
1159
- logging.info("No database connection. Logging action to CSV.")
1160
-
1161
- # Fallback to CSV
1162
- storage_location = DEFAULT_FILE_PATH
1163
- try:
1164
- os.makedirs(storage_location, exist_ok=True)
1165
- csv_file_path = os.path.join(storage_location, "action_logs.csv")
1166
- file_exists = os.path.isfile(csv_file_path)
1167
-
1168
- with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
1169
- fieldnames = ["timestamp", "action"]
1170
- writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
1171
- if not file_exists:
1172
- writer.writeheader()
1173
- writer.writerow(
1174
- {
1175
- "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
1176
- "action": action,
1177
- }
1178
- )
1179
- logging.info(f"Action logged to CSV: {action}")
1180
- return f"Action logged: {action}"
1181
- except Exception as e:
1182
- logging.error(f"Error logging action to CSV: {e}")
1183
- return f"Error logging action: {e}"
1184
- except Exception as e:
1185
- logging.error(f"Error logging action: {e}")
1186
- return f"Error logging action: {e}"
1187
-
1188
- # Function to get the latest CSV file based on modification time
1189
- def get_latest_csv() -> str:
1190
- """
1191
- Retrieves the latest CSV file from the storage directory based on modification time.
1192
- """
1193
- try:
1194
- storage_location = "/home/users/app/scraped_data"
1195
- csv_files = [f for f in os.listdir(storage_location) if f.endswith(".csv")]
1196
- if not csv_files:
1197
- return None
1198
-
1199
- latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
1200
- return latest_csv
1201
- except Exception as e:
1202
- logging.error(f"Error retrieving latest CSV: {e}")
1203
- return None
1204
-
1205
- def respond(
1206
- message: str,
1207
- history: list,
1208
- system_message: str,
1209
- max_tokens: int,
1210
- temperature: float,
1211
- top_p: float,
1212
- ) -> str:
1213
- """
1214
- Generates a response using OpenLlamaForCausalLM.
1215
- """
1216
- try:
1217
- # Check if the message contains a command
1218
- command, params = parse_command(message)
1219
- if command:
1220
- # Execute the corresponding function
1221
- response = execute_command(command, params)
1222
- else:
1223
- # Generate a regular response using OpenLlama
1224
- prompt = (
1225
- f"System: {system_message}\n"
1226
- f"History: {history}\n"
1227
- f"User: {message}\n"
1228
- f"Assistant:"
1229
- )
1230
- response = openllama_pipeline(
1231
- prompt,
1232
- max_length=max_tokens,
1233
- temperature=temperature,
1234
- top_p=top_p,
1235
- )[0]["generated_text"]
1236
-
1237
-
1238
- # Extract the assistant's reply
1239
- response = response.split("Assistant:")[-1].strip()
1240
- return response
1241
- except Exception as e:
1242
- logging.error(f"Error generating response: {e}")
1243
- return "Error generating response."
1244
-
1245
- # Define the Gradio interface
1246
- def create_interface() -> gr.Blocks:
1247
- """
1248
- Defines and returns the Gradio interface for the application.
1249
- """
1250
- with gr.Blocks() as demo:
1251
- gr.Markdown("# All-in-One Scraper, Database, and RSS Feeder")
1252
-
1253
- with gr.Row():
1254
- with gr.Column():
1255
- # Scraping Controls
1256
- storage_location = gr.Textbox(
1257
- value=DEFAULT_FILE_PATH, label="Storage Location"
1258
- )
1259
- urls = gr.Textbox(
1260
- label="URLs (comma separated)",
1261
- placeholder="https://example.com, https://anotherexample.com",
1262
- )
1263
- scrape_interval = gr.Slider(
1264
- minimum=1,
1265
- maximum=60,
1266
- value=5,
1267
- step=1,
1268
- label="Scrape Interval (minutes)",
1269
- )
1270
- content_type = gr.Radio(
1271
- choices=["text", "media", "both"],
1272
- value="text",
1273
- label="Content Type",
1274
- )
1275
- selector = gr.Textbox(
1276
- label="CSS Selector for Media (Optional)",
1277
- placeholder="e.g., img.main-image",
1278
- )
1279
- start_button = gr.Button("Start Scraping")
1280
- stop_button = gr.Button("Stop Scraping")
1281
- status_output = gr.Textbox(
1282
- label="Status Output", interactive=False, lines=2
1283
- )
1284
-
1285
- with gr.Column():
1286
- # Chat Interface
1287
- chat_history = gr.Chatbot(label="Chat History", type='messages')
1288
- with gr.Row():
1289
- message = gr.Textbox(label="Message", placeholder="Type your message here...")
1290
- system_message = gr.Textbox(
1291
- value="You are a helpful assistant.", label="System message"
1292
- )
1293
- max_tokens = gr.Slider(
1294
- minimum=1,
1295
- maximum=2048,
1296
- value=512,
1297
- step=1,
1298
- label="Max new tokens",
1299
- )
1300
- temperature = gr.Slider(
1301
- minimum=0.1,
1302
- maximum=4.0,
1303
- value=0.7,
1304
- step=0.1,
1305
- label="Temperature",
1306
- )
1307
- top_p = gr.Slider(
1308
- minimum=0.1,
1309
- maximum=1.0,
1310
- value=0.95,
1311
- step=0.05,
1312
- label="Top-p (nucleus sampling)",
1313
- )
1314
- response_box = gr.Textbox(label="Response", interactive=False, lines=2)
1315
-
1316
- with gr.Row():
1317
- with gr.Column():
1318
- # CSV Display Controls
1319
- selected_url_csv = gr.Textbox(
1320
- label="Select URL for CSV Content",
1321
- placeholder="https://example.com",
1322
- )
1323
- csv_button = gr.Button("Display CSV Content")
1324
- csv_content_output = gr.Textbox(
1325
- label="CSV Content Output", interactive=False, lines=10
1326
- )
1327
-
1328
- with gr.Column():
1329
- # RSS Feed Generation Controls
1330
- selected_url_rss = gr.Textbox(
1331
- label="Select URL for RSS Feed",
1332
- placeholder="https://example.com",
1333
- )
1334
- rss_button = gr.Button("Generate RSS Feed")
1335
- rss_output = gr.Textbox(
1336
- label="RSS Feed Output", interactive=False, lines=20
1337
- )
1338
-
1339
- # Historical Data View
1340
- with gr.Row():
1341
- with gr.Column():
1342
- historical_view_url = gr.Textbox(
1343
- label="Select URL for Historical Data",
1344
- placeholder="https://example.com",
1345
- )
1346
- historical_button = gr.Button("View Historical Data")
1347
- historical_output = gr.Dataframe(
1348
- headers=["ID", "URL", "Content Hash", "Change Detected"],
1349
- label="Historical Data",
1350
- interactive=False
1351
- )
1352
-
1353
- # Connect buttons to their respective functions
1354
- start_button.click(
1355
- fn=start_scraping,
1356
- inputs=[
1357
- storage_location,
1358
- urls,
1359
- scrape_interval,
1360
- content_type,
1361
- selector,
1362
- ],
1363
- outputs=status_output,
1364
- )
1365
-
1366
- stop_button.click(fn=stop_scraping, outputs=status_output)
1367
-
1368
- csv_button.click(
1369
- fn=display_csv,
1370
- inputs=[storage_location, selected_url_csv],
1371
- outputs=csv_content_output,
1372
- )
1373
-
1374
- rss_button.click(
1375
- fn=generate_rss_feed,
1376
- inputs=[storage_location, selected_url_rss],
1377
- outputs=rss_output,
1378
- )
1379
-
1380
- historical_button.click(
1381
- fn=display_historical_data,
1382
- inputs=[storage_location, historical_view_url],
1383
- outputs=historical_output,
1384
- )
1385
-
1386
- # Connect message submission to the chat interface
1387
- def update_chat(message_input, history, system_msg, max_toks, temp, top_p_val):
1388
- if not message_input.strip():
1389
- return history, "Please enter a message."
1390
-
1391
- response = respond(
1392
- message_input,
1393
- history,
1394
- system_msg,
1395
- max_toks,
1396
- temp,
1397
- top_p_val,
1398
- )
1399
- history.append({"role": "user", "content": message_input})
- history.append({"role": "assistant", "content": response})
1400
- return history, response
1401
-
1402
- message.submit(
1403
- update_chat,
1404
- inputs=[
1405
- message,
1406
- chat_history,
1407
- system_message,
1408
- max_tokens,
1409
- temperature,
1410
- top_p,
1411
- ],
1412
- outputs=[chat_history, response_box],
1413
- )
1414
-
1415
- return demo
1416
- # Function to display historical data
1417
- def display_historical_data(storage_location: str, url: str):
1418
- """
1419
- Retrieves and displays historical scraping data for a given URL.
1420
- """
1421
- try:
1422
- connection = get_db_connection()
1423
- if connection:
1424
- try:
1425
- cursor = connection.cursor(dictionary=True)
1426
- query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
1427
- cursor.execute(query, (url,))
1428
- results = cursor.fetchall()
1429
-
1430
- if not results:
1431
- return pd.DataFrame()
1432
-
1433
- df = pd.DataFrame(results)
1434
- cursor.close()
1435
- connection.close()
1436
- return df
1437
- except mysql.connector.Error as err:
1438
- logging.error(f"Error fetching historical data from database: {err}")
1439
- # Fallback to CSV
1440
- else:
1441
- logging.info("No database connection. Fetching historical data from CSV.")
1442
-
1443
- # Fallback to CSV
1444
- hostname = urlparse(url).hostname
1445
- csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
1446
- if os.path.exists(csv_path):
1447
- df = pd.read_csv(csv_path)
1448
- return df
1449
- else:
1450
- return pd.DataFrame()
1451
- except Exception as e:
1452
- logging.error(f"Error fetching historical data for {url}: {e}")
1453
- return pd.DataFrame()
1454
-
1455
- def load_model():
1456
- """
1457
- Loads the openLlama model and tokenizer once and returns the pipeline.
1458
- """
1459
- try:
1460
- model_name = "openlm-research/open_llama_3b_v2"
1461
- tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=False)
1462
- model = AutoModelForCausalLM.from_pretrained(model_name)
1463
-
1464
- # This should be inside the try block
1465
- max_supported_length = 2048
1466
-
1467
- openllama_pipeline = pipeline(
1468
- "text-generation",
1469
- model=model,
1470
- tokenizer=tokenizer,
1471
- truncation=True,
1472
- max_length=max_supported_length,
1473
- temperature=0.7,
1474
- top_p=0.95,
1475
- device=0 if torch.cuda.is_available() else -1,
1476
- )
1477
- logging.info("Model loaded successfully.")
1478
- return openllama_pipeline # Return the pipeline
1479
- except Exception as e:
1480
- logging.error(f"Error loading OpenLLaMA model: {e}")
1481
- return None
1482
-
1483
- def load_model(model_name: str = "openlm-research/open_llama_3b_v2"):
1484
- """
1485
- Loads the specified model and tokenizer.
1486
- """
1487
- try:
1488
- tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=False)
1489
- model = AutoModelForCausalLM.from_pretrained(model_name)
1490
- # This should be inside the try block
1491
- max_supported_length = 2048 # Get this from the model config
1492
- openllama_pipeline = pipeline(
1493
- "text-generation",
1494
- model=model,
1495
- tokenizer=tokenizer,
1496
- truncation=True,
1497
- max_length=max_supported_length,
1498
- temperature=0.7,
1499
- top_p=0.95,
1500
- device=0 if torch.cuda.is_available() else -1,
1501
- )
1502
- logging.info(f"{model_name} loaded successfully.")
1503
- return openllama_pipeline
1504
- except Exception as e:
1505
- logging.error(f"Error loading {model_name} model: {e}")
1506
- return None
1507
-
1508
- # Automated Testing using unittest
1509
- class TestApp(unittest.TestCase):
1510
- def test_parse_command_filter(self):
1511
- command = "Filter apples, oranges in column Description"
1512
- parsed_command = parse_command(command)
1513
- self.assertEqual(parsed_command[0], "filter")
1514
- self.assertListEqual(parsed_command[1]["words"], ["apples", "oranges"])
1515
- self.assertEqual(parsed_command[1]["column"], "Description")
1516
-
1517
- def test_parse_command_sort(self):
1518
- command = "Sort Price ascending"
1519
- parsed_command = parse_command(command)
1520
- self.assertEqual(parsed_command[0], "sort")
1521
- self.assertEqual(parsed_command[1]["column"], "Price")
1522
- self.assertEqual(parsed_command[1]["order"], "ascending")
1523
-
1524
- def test_parse_command_export(self):
1525
- command = "Export to CSV as filtered_data.csv"
1526
- parsed_command = parse_command(command)
1527
- self.assertEqual(parsed_command[0], "export")
1528
- self.assertEqual(parsed_command[1]["filename"], "filtered_data.csv")
1529
-
1530
- def test_parse_command_log(self):
1531
- command = "Log action Filtered data for specific fruits"
1532
- parsed_command = parse_command(command)
1533
- self.assertEqual(parsed_command[0], "log")
1534
- self.assertEqual(parsed_command[1]["action"], "Filtered data for specific fruits")
1535
-
1536
- def test_database_connection(self):
1537
- connection = get_db_connection()
1538
- # Connection may be None if not configured; adjust the test accordingly
1539
- if connection:
1540
- self.assertTrue(connection.is_connected())
1541
- connection.close()
1542
- else:
1543
- self.assertIsNone(connection)
1544
-
1545
- def main():
1546
- # Initialize and run the application
1547
- logging.info("Starting the application...")
1548
- model = load_model()
1549
- if model:
1550
- logging.info("Application started successfully.")
1551
- print("Main function executed")
1552
- print("Creating interface...")
1553
- demo = create_interface()
1554
- print("Launching interface...")
1555
- demo.launch(server_name="0.0.0.0", server_port=7860)
1556
- else:
1557
- logging.error("Failed to start the application.")
1558
-
1559
- # Main execution
1560
- if __name__ == "__main__":
1561
- # Initialize database
1562
- initialize_database()
1563
-
1564
- # Create and launch Gradio interface
1565
- demo = create_interface()
1566
- demo.launch()
1567
-
1568
- # Run automated tests
1569
- unittest.main(argv=[''], exit=False)