arabellastrange committed on
Commit
bcae708
1 Parent(s): f58ccf2

replaced chromedriver/selenium with zenrows

Browse files
Files changed (3) hide show
  1. app.py +2 -2
  2. requirements.txt +3 -2
  3. web_search.py +32 -29
app.py CHANGED
@@ -110,8 +110,8 @@ if __name__ == '__main__':
110
  # libnss3=2:3.26.2-1.1+deb9u1 \
111
  # libgconf-2-4=3.2.6-4+b1 \
112
  # libfontconfig1=2.11.0-6.7+b1
113
- check_call(['apt-get', 'install', '-y', 'libglib2.0-0 libnss3 libgconf-2-4 libfontconfig1'],
114
- stdout=open(os.devnull, 'wb'), stderr=STDOUT)
115
 
116
  logger.info("Launching Gradio ChatInterface for searchbot...")
117
 
 
110
  # libnss3=2:3.26.2-1.1+deb9u1 \
111
  # libgconf-2-4=3.2.6-4+b1 \
112
  # libfontconfig1=2.11.0-6.7+b1
113
+ # check_call(['apt-get', 'install', '-y', 'libglib2.0-0 libnss3 libgconf-2-4 libfontconfig1'],
114
+ # stdout=open(os.devnull, 'wb'), stderr=STDOUT)
115
 
116
  logger.info("Launching Gradio ChatInterface for searchbot...")
117
 
requirements.txt CHANGED
@@ -8,8 +8,9 @@ llama-index-embeddings-openai
8
  llama-index-llms-openai
9
  # needed for simpledirectoryreader to work
10
  llama-index-readers-file
11
- selenium==4.22.0
12
  unstructured
13
  requests
14
- chromium
 
15
 
 
8
  llama-index-llms-openai
9
  # needed for simpledirectoryreader to work
10
  llama-index-readers-file
11
+ # selenium==4.22.0
12
  unstructured
13
  requests
14
+ # chromium
15
+ zenrows
16
 
web_search.py CHANGED
@@ -2,7 +2,6 @@ import copy
2
  import json
3
  import logging
4
  import os
5
- import stat
6
  import time
7
  import traceback
8
  import urllib.parse as en
@@ -10,11 +9,8 @@ import warnings
10
  from itertools import zip_longest
11
 
12
  import requests
13
- import selenium.common.exceptions
14
- from selenium import webdriver
15
- from selenium.webdriver.chrome.options import Options
16
- from selenium.webdriver.chrome.service import Service as ChromeService
17
  from unstructured.partition.html import partition_html
 
18
 
19
  from llmsearch import site_stats
20
  # this import style works in pycharm
@@ -27,6 +23,7 @@ from llmsearch import utilityV2 as ut
27
  # from llmsearch import utilityV2 as ut
28
 
29
  logger = logging.getLogger("agent_logger")
 
30
 
31
 
32
  # todo drop blocked pages > see og llmsearch code
@@ -68,39 +65,45 @@ def process_url(url, timeout):
68
  try:
69
  with warnings.catch_warnings():
70
  warnings.simplefilter("ignore")
71
- options = Options()
72
- options.page_load_strategy = "eager"
73
- options.add_argument("--headless")
74
- options.add_argument("--no-sandbox")
75
- options.add_argument("--disable-dev-shm-usage")
76
-
77
- options.add_argument("start-maximized")
78
- options.add_argument("disable-infobars")
79
- options.add_argument("--disable-extensions")
80
- options.add_argument("--disable-gpu")
81
- options.add_argument("--disable-dev-shm-usage")
82
  result = ""
83
  # make driver exec
84
- os.chmod('chromedriver-linux64/chromedriver', stat.S_IEXEC)
85
  try:
86
- driver = webdriver.Chrome(service=ChromeService(executable_path='chromedriver-linux64/chromedriver'),
87
- options=options)
88
- logger.info(f"*****setting page load timeout {timeout}")
89
- driver.set_page_load_timeout(timeout)
90
- driver.get(url)
91
- response = driver.page_source
92
- result = response_text_extract(url=url, response=response)
93
- except selenium.common.exceptions.TimeoutException:
94
- return "", url
95
- except selenium.common.exceptions.WebDriverException:
 
96
  traceback.print_exc()
97
- logger.info(f"webdriver failed to load")
98
  return "", url
 
 
 
 
 
 
99
  except Exception:
100
  traceback.print_exc()
101
  logger.info(f"{site} err")
102
  pass
103
- logger.info(f"Processed {site}: {len(response)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
104
  return result, url
105
 
106
 
 
2
  import json
3
  import logging
4
  import os
 
5
  import time
6
  import traceback
7
  import urllib.parse as en
 
9
  from itertools import zip_longest
10
 
11
  import requests
 
 
 
 
12
  from unstructured.partition.html import partition_html
13
+ from zenrows import ZenRowsClient
14
 
15
  from llmsearch import site_stats
16
  # this import style works in pycharm
 
23
  # from llmsearch import utilityV2 as ut
24
 
25
  logger = logging.getLogger("agent_logger")
26
+ logger = logging.getLogger("agent_logger")
27
 
28
 
29
  # todo drop blocked pages > see og llmsearch code
 
65
  try:
66
  with warnings.catch_warnings():
67
  warnings.simplefilter("ignore")
68
+ # options = Options()
69
+ # options.page_load_strategy = "eager"
70
+ # options.add_argument("--headless")
71
+ # options.add_argument("--no-sandbox")
72
+ # options.add_argument("--disable-dev-shm-usage")
73
+ #
74
+ # options.add_argument("start-maximized")
75
+ # options.add_argument("disable-infobars")
76
+ # options.add_argument("--disable-extensions")
77
+ # options.add_argument("--disable-gpu")
78
+ # options.add_argument("--disable-dev-shm-usage")
79
  result = ""
80
  # make driver exec
81
+ # os.chmod('chromedriver-linux64/chromedriver', stat.S_IEXEC)
82
  try:
83
+ # driver = webdriver.Chrome(service=ChromeService(executable_path='chromedriver-linux64/chromedriver'),
84
+ # options=options)
85
+ # logger.info(f"*****setting page load timeout {timeout}")
86
+ # driver.set_page_load_timeout(timeout)
87
+ # driver.get(url)
88
+ # response = driver.page_source
89
+ client = ZenRowsClient(os.getenv('zenrows_api_key'))
90
+ response = client.get(url)
91
+ # result = response_text_extract(url=url, response=response)
92
+ result = response.text
93
+ except Exception:
94
  traceback.print_exc()
 
95
  return "", url
96
+ # except selenium.common.exceptions.TimeoutException:
97
+ # return "", url
98
+ # except selenium.common.exceptions.WebDriverException:
99
+ # traceback.print_exc()
100
+ # logger.info(f"webdriver failed to load")
101
+ # return "", url
102
  except Exception:
103
  traceback.print_exc()
104
  logger.info(f"{site} err")
105
  pass
106
+ logger.info(f"Processed {site}: {len(response.text)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
107
  return result, url
108
 
109