Spaces:
Sleeping
Sleeping
arabellastrange
committed on
Commit
•
bcae708
1
Parent(s):
f58ccf2
replaced chromedriver/selenium with zenrows
Browse files
- app.py +2 -2
- requirements.txt +3 -2
- web_search.py +32 -29
app.py
CHANGED
@@ -110,8 +110,8 @@ if __name__ == '__main__':
|
|
110 |
# libnss3=2:3.26.2-1.1+deb9u1 \
|
111 |
# libgconf-2-4=3.2.6-4+b1 \
|
112 |
# libfontconfig1=2.11.0-6.7+b1
|
113 |
-
check_call(['apt-get', 'install', '-y', 'libglib2.0-0 libnss3 libgconf-2-4 libfontconfig1'],
|
114 |
-
|
115 |
|
116 |
logger.info("Launching Gradio ChatInterface for searchbot...")
|
117 |
|
|
|
110 |
# libnss3=2:3.26.2-1.1+deb9u1 \
|
111 |
# libgconf-2-4=3.2.6-4+b1 \
|
112 |
# libfontconfig1=2.11.0-6.7+b1
|
113 |
+
# check_call(['apt-get', 'install', '-y', 'libglib2.0-0 libnss3 libgconf-2-4 libfontconfig1'],
|
114 |
+
# stdout=open(os.devnull, 'wb'), stderr=STDOUT)
|
115 |
|
116 |
logger.info("Launching Gradio ChatInterface for searchbot...")
|
117 |
|
requirements.txt
CHANGED
@@ -8,8 +8,9 @@ llama-index-embeddings-openai
|
|
8 |
llama-index-llms-openai
|
9 |
# needed for simpledirectoryreader to work
|
10 |
llama-index-readers-file
|
11 |
-
selenium==4.22.0
|
12 |
unstructured
|
13 |
requests
|
14 |
-
chromium
|
|
|
15 |
|
|
|
8 |
llama-index-llms-openai
|
9 |
# needed for simpledirectoryreader to work
|
10 |
llama-index-readers-file
|
11 |
+
# selenium==4.22.0
|
12 |
unstructured
|
13 |
requests
|
14 |
+
# chromium
|
15 |
+
zenrows
|
16 |
|
web_search.py
CHANGED
@@ -2,7 +2,6 @@ import copy
|
|
2 |
import json
|
3 |
import logging
|
4 |
import os
|
5 |
-
import stat
|
6 |
import time
|
7 |
import traceback
|
8 |
import urllib.parse as en
|
@@ -10,11 +9,8 @@ import warnings
|
|
10 |
from itertools import zip_longest
|
11 |
|
12 |
import requests
|
13 |
-
import selenium.common.exceptions
|
14 |
-
from selenium import webdriver
|
15 |
-
from selenium.webdriver.chrome.options import Options
|
16 |
-
from selenium.webdriver.chrome.service import Service as ChromeService
|
17 |
from unstructured.partition.html import partition_html
|
|
|
18 |
|
19 |
from llmsearch import site_stats
|
20 |
# this import style works in pycharm
|
@@ -27,6 +23,7 @@ from llmsearch import utilityV2 as ut
|
|
27 |
# from llmsearch import utilityV2 as ut
|
28 |
|
29 |
logger = logging.getLogger("agent_logger")
|
|
|
30 |
|
31 |
|
32 |
# todo drop blocked pages > see og llmsearch code
|
@@ -68,39 +65,45 @@ def process_url(url, timeout):
|
|
68 |
try:
|
69 |
with warnings.catch_warnings():
|
70 |
warnings.simplefilter("ignore")
|
71 |
-
options = Options()
|
72 |
-
options.page_load_strategy = "eager"
|
73 |
-
options.add_argument("--headless")
|
74 |
-
options.add_argument("--no-sandbox")
|
75 |
-
options.add_argument("--disable-dev-shm-usage")
|
76 |
-
|
77 |
-
options.add_argument("start-maximized")
|
78 |
-
options.add_argument("disable-infobars")
|
79 |
-
options.add_argument("--disable-extensions")
|
80 |
-
options.add_argument("--disable-gpu")
|
81 |
-
options.add_argument("--disable-dev-shm-usage")
|
82 |
result = ""
|
83 |
# make driver exec
|
84 |
-
os.chmod('chromedriver-linux64/chromedriver', stat.S_IEXEC)
|
85 |
try:
|
86 |
-
driver = webdriver.Chrome(service=ChromeService(executable_path='chromedriver-linux64/chromedriver'),
|
87 |
-
|
88 |
-
logger.info(f"*****setting page load timeout {timeout}")
|
89 |
-
driver.set_page_load_timeout(timeout)
|
90 |
-
driver.get(url)
|
91 |
-
response = driver.page_source
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
96 |
traceback.print_exc()
|
97 |
-
logger.info(f"webdriver failed to load")
|
98 |
return "", url
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
except Exception:
|
100 |
traceback.print_exc()
|
101 |
logger.info(f"{site} err")
|
102 |
pass
|
103 |
-
logger.info(f"Processed {site}: {len(response)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
|
104 |
return result, url
|
105 |
|
106 |
|
|
|
2 |
import json
|
3 |
import logging
|
4 |
import os
|
|
|
5 |
import time
|
6 |
import traceback
|
7 |
import urllib.parse as en
|
|
|
9 |
from itertools import zip_longest
|
10 |
|
11 |
import requests
|
|
|
|
|
|
|
|
|
12 |
from unstructured.partition.html import partition_html
|
13 |
+
from zenrows import ZenRowsClient
|
14 |
|
15 |
from llmsearch import site_stats
|
16 |
# this import style works in pycharm
|
|
|
23 |
# from llmsearch import utilityV2 as ut
|
24 |
|
25 |
logger = logging.getLogger("agent_logger")
|
26 |
+
logger = logging.getLogger("agent_logger")
|
27 |
|
28 |
|
29 |
# todo drop blocked pages > see og llmsearch code
|
|
|
65 |
try:
|
66 |
with warnings.catch_warnings():
|
67 |
warnings.simplefilter("ignore")
|
68 |
+
# options = Options()
|
69 |
+
# options.page_load_strategy = "eager"
|
70 |
+
# options.add_argument("--headless")
|
71 |
+
# options.add_argument("--no-sandbox")
|
72 |
+
# options.add_argument("--disable-dev-shm-usage")
|
73 |
+
#
|
74 |
+
# options.add_argument("start-maximized")
|
75 |
+
# options.add_argument("disable-infobars")
|
76 |
+
# options.add_argument("--disable-extensions")
|
77 |
+
# options.add_argument("--disable-gpu")
|
78 |
+
# options.add_argument("--disable-dev-shm-usage")
|
79 |
result = ""
|
80 |
# make driver exec
|
81 |
+
# os.chmod('chromedriver-linux64/chromedriver', stat.S_IEXEC)
|
82 |
try:
|
83 |
+
# driver = webdriver.Chrome(service=ChromeService(executable_path='chromedriver-linux64/chromedriver'),
|
84 |
+
# options=options)
|
85 |
+
# logger.info(f"*****setting page load timeout {timeout}")
|
86 |
+
# driver.set_page_load_timeout(timeout)
|
87 |
+
# driver.get(url)
|
88 |
+
# response = driver.page_source
|
89 |
+
client = ZenRowsClient(os.getenv('zenrows_api_key'))
|
90 |
+
response = client.get(url)
|
91 |
+
# result = response_text_extract(url=url, response=response)
|
92 |
+
result = response.text
|
93 |
+
except Exception:
|
94 |
traceback.print_exc()
|
|
|
95 |
return "", url
|
96 |
+
# except selenium.common.exceptions.TimeoutException:
|
97 |
+
# return "", url
|
98 |
+
# except selenium.common.exceptions.WebDriverException:
|
99 |
+
# traceback.print_exc()
|
100 |
+
# logger.info(f"webdriver failed to load")
|
101 |
+
# return "", url
|
102 |
except Exception:
|
103 |
traceback.print_exc()
|
104 |
logger.info(f"{site} err")
|
105 |
pass
|
106 |
+
logger.info(f"Processed {site}: {len(response.text)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
|
107 |
return result, url
|
108 |
|
109 |
|