Ashhar committed
Commit c972785
Parent: 1d4400d

Support browser simulation in Google scraping

When SIMULATE_BROWSER_SEARCH=true, the scraper sends browser-like headers through a requests.Session and parses results with the CSS selectors Google serves to full browsers.

Files changed (5)
  1. .gitignore +1 -1
  2. app.py +5 -18
  3. soup_dump.html +0 -0
  4. tools/webScraper.py +90 -37
  5. utils.py +4 -0
.gitignore CHANGED
@@ -4,4 +4,4 @@ __pycache__/
 .gitattributes
 gradio_cached_examples/
 app_*.py
-soup_dump.html
+soup_dump*.html
app.py CHANGED
@@ -67,7 +67,6 @@ def __countTokens(text):
 st.set_page_config(
     page_title="Mini Perplexity",
     page_icon=C.AI_ICON,
-    # menu_items={"About": None}
 )
 
 
@@ -482,20 +481,8 @@ if prompt := (
     except Exception as e:
         U.pprint(e)
 
-    if "counter" not in st.session_state:
-        st.session_state.counter = 1
-
-    st.session_state.counter += 1
-
-    import streamlit.components.v1 as components
-    components.html(
-        f"<p>{st.session_state.counter}</p>"
-        """
-        <script>
-        console.log("===== script running =====")
-        const input = window.parent.document.querySelector('.stChatInput');
-        console.log({input});
-        </script>
-        """,
-        height=0
-    )
+    # if st.button("Rerun"):
+    #     # __resetButtonState()
+    #     st.session_state.chatHistory = []
+    #     st.session_state.messages = []
+    #     st.rerun()
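The removed block was debug scaffolding (a rerun counter plus a `components.html` script probing `.stChatInput`); what remains is a commented-out reset button. For reference, a standalone sketch of that button as it would behave if uncommented, using the session-state keys from app.py:

```python
import streamlit as st

# Standalone sketch of the commented-out block above: clear the chat
# state and force an immediate rerun of the script.
if st.button("Rerun"):
    st.session_state.chatHistory = []
    st.session_state.messages = []
    st.rerun()
```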
soup_dump.html CHANGED
The diff for this file is too large to render. See raw diff
 
tools/webScraper.py CHANGED
@@ -1,48 +1,101 @@
+import os
 from urllib.parse import parse_qs, urlparse
 from bs4 import BeautifulSoup
 import requests
+from typing import TypedDict
+import utils as U
+
+SIMULATE_BROWSER = os.environ.get("SIMULATE_BROWSER_SEARCH") == "true"
+
+
+class SelectorsDict(TypedDict):
+    answer: str
+    answer_desc: str
+    answer_citation: str
+    search_results: str
+    search_results_desc: str
+
+
+SELECTORS: SelectorsDict
+
+if SIMULATE_BROWSER:
+    SELECTORS = {
+        "answer": ".IZ6rdc",
+        "answer_desc": ".LGOjhe",
+        "answer_citation": ".kX21rb.ZYHQ7e",
+        "search_results": ".Ww4FFb",
+        "search_results_desc": ".VwiC3b.yXK7lf",
+    }
+else:
+    SELECTORS = {
+        "answer_desc": "div.PqksIc",
+        "answer_citation": "sub.gMUaMb.r0bn4c.rQMQod",
+        "search_results": "div.egMi0.kCrYT",
+        "search_results_desc": "div.BNeawe.s3v9rd.AP7Wnd .BNeawe.s3v9rd.AP7Wnd:last-child",
+    }
 
 
 def scrapeGoogleSearch(query):
+    U.pprint(f"{SIMULATE_BROWSER=}")
     finalResponse = []
 
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Referer": "https://www.google.com/",
+        "DNT": "1",  # Do Not Track request header
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1"
+    }
+
     searchUrl = f"https://www.google.com/search?q={query}"
-    response = requests.get(searchUrl)
-    if response.status_code == 200:
-        soup = BeautifulSoup(response.text, 'html.parser')
-        with open('soup_dump.html', 'w', encoding='utf-8') as file:
-            file.write(soup.prettify())
-
-        results = soup.find('body')
-        mainDiv = soup.find('div', attrs={'id': 'main'})
-        answerDiv = (
-            mainDiv.select_one('div.PqksIc')
-            or mainDiv.select_one('div.BNeawe.iBp4i')
-        )
-        if answerDiv:
-            citationDateDiv = answerDiv.select_one('sub.gMUaMb.r0bn4c.rQMQod')
-            citationDate = citationDateDiv.text if citationDateDiv else ""
-            answerText = answerDiv.text.replace(citationDate, '').strip()
-            citationText = f"Citation Date: {citationDate}" if citationDate else ""
-            finalResponse.append(f"Verified Answer:\n{answerText}\n{citationText}\n\n\n")
-
-        results = mainDiv.select('div.egMi0.kCrYT')
-        resultsDesc = mainDiv.select('div.BNeawe.s3v9rd.AP7Wnd .BNeawe.s3v9rd.AP7Wnd:last-child')
-
-        if results:
-            finalResponse.append("Search Results:\n")
-
-        for (i, result) in enumerate(results[:10]):
-            title = result.find('h3').text
-            link = result.find('a')['href']
-            parsedUrl = urlparse(link)
-            urlParams = parse_qs(parsedUrl.query)
-            link = urlParams.get('q', [None])[0]
-            desc = resultsDesc[i].text
-            finalResponse.append(f"Title: {title}")
-            finalResponse.append(f"Description: {desc}")
-            finalResponse.append(f"Link: {link}\n")
-    else:
-        print("Failed to retrieve search results.")
+
+    # Use a session to maintain cookies
+    with requests.Session() as session:
+        if SIMULATE_BROWSER:
+            session.headers.update(headers)
+        response = session.get(searchUrl)
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.text, "html.parser")
+            with open("soup_dump.html", "w", encoding="utf-8") as file:
+                file.write(soup.prettify())
+
+            results = soup.find("body")
+            mainDiv = soup.find("div", attrs={"id": "main"})
+            answerText = ""
+            if SELECTORS.get("answer"):
+                mainAnswerDiv = mainDiv.select_one(SELECTORS["answer"])
+                if mainAnswerDiv:
+                    mainAnswer = mainAnswerDiv.text.strip()
+                    answerText = f"**{mainAnswer}**. "
+
+            answerDescDiv = mainDiv.select_one(SELECTORS["answer_desc"])
+            if answerDescDiv:
+                citationDateDiv = answerDescDiv.select_one(SELECTORS["answer_citation"])
+                citationDate = citationDateDiv.text if citationDateDiv else ""
+                answerText += answerDescDiv.text.replace(citationDate, "").strip()
+                citationText = f"Citation Date: {citationDate}" if citationDate else ""
+                finalResponse.append(f"Verified Answer:\n{answerText}\n{citationText}\n\n\n")
+
+            results = mainDiv.select(SELECTORS["search_results"])
+            resultsDesc = mainDiv.select(SELECTORS["search_results_desc"])
+
+            if results:
+                finalResponse.append("Search Results:\n")
+
+            for (i, result) in enumerate(results[:10]):
+                title = result.find("h3").text
+                link = result.find("a")["href"]
+                if not SIMULATE_BROWSER:
+                    parsedUrl = urlparse(link)
+                    urlParams = parse_qs(parsedUrl.query)
+                    link = urlParams.get("q", [None])[0]
+                desc = resultsDesc[i].text
+                finalResponse.append(f"Title: {title}")
+                finalResponse.append(f"Description: {desc}")
+                finalResponse.append(f"URL: {link}\n")
+        else:
+            print("Failed to retrieve search results.")
 
     return "\n".join(finalResponse)
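To exercise the new code path locally, a minimal sketch (assuming the repository root is the working directory so that `tools.webScraper` and `utils` are importable; the query string is arbitrary):

```python
import os

# SIMULATE_BROWSER is read from the environment at import time in
# tools/webScraper.py, so set the flag before importing the module.
os.environ["SIMULATE_BROWSER_SEARCH"] = "true"

from tools.webScraper import scrapeGoogleSearch

# Returns a single text blob: an optional "Verified Answer:" section,
# then up to ten "Title / Description / URL" entries.
print(scrapeGoogleSearch("current time in london"))
```

`scrapeGoogleSearch` also dumps the fetched page to `soup_dump.html` (now covered by the `soup_dump*.html` gitignore pattern), which makes it easy to check whether Google returned the browser or non-browser markup.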
utils.py CHANGED
@@ -15,6 +15,10 @@ def applyCommonStyles():
         font-family: 'Raleway';
     }
 
+    .stButton p {
+        font-size: 0.9rem;
+    }
+
     @keyframes blinker {
         0% {
             opacity: 1;