# client/tools/webScraper.py
# Author: Ashhar — commit c972785: "support browser simulation in google scraping"
import os
from typing import TypedDict
from urllib.parse import parse_qs, quote_plus, urlparse

import requests
from bs4 import BeautifulSoup

import utils as U
# Feature flag: when the SIMULATE_BROWSER_SEARCH env var is exactly "true",
# requests send browser-like headers and a different selector set is used
# (presumably because Google then serves richer markup — TODO confirm).
SIMULATE_BROWSER = os.environ.get("SIMULATE_BROWSER_SEARCH") == "true"
class SelectorsDict(TypedDict):
    """CSS selectors for extracting parts of a Google search results page.

    NOTE(review): the non-browser SELECTORS literal below omits the
    ``answer`` key even though this TypedDict is total — consider
    ``typing.NotRequired`` for ``answer``; verify with a type checker.
    """

    # Featured answer box (only targeted in browser-simulated markup).
    answer: str
    # Description text under the answer box.
    answer_desc: str
    # Citation/date element inside the answer description.
    answer_citation: str
    # One search-result container per hit.
    search_results: str
    # Description snippet for each search result.
    search_results_desc: str
# Selector set chosen once at import time based on the SIMULATE_BROWSER flag.
# The two branches target the two different HTML variants Google returns.
SELECTORS: SelectorsDict
if SIMULATE_BROWSER:
    SELECTORS = {
        "answer": ".IZ6rdc",
        "answer_desc": ".LGOjhe",
        "answer_citation": ".kX21rb.ZYHQ7e",
        "search_results": ".Ww4FFb",
        "search_results_desc": ".VwiC3b.yXK7lf",
    }
else:
    # NOTE(review): no "answer" key here; scrapeGoogleSearch guards with
    # SELECTORS.get("answer"), so the answer box is simply skipped in this
    # mode — but this violates the total SelectorsDict contract (see class).
    SELECTORS = {
        "answer_desc": "div.PqksIc",
        "answer_citation": "sub.gMUaMb.r0bn4c.rQMQod",
        "search_results": "div.egMi0.kCrYT",
        "search_results_desc": "div.BNeawe.s3v9rd.AP7Wnd .BNeawe.s3v9rd.AP7Wnd:last-child",
    }
def scrapeGoogleSearch(query):
    """Scrape a Google search results page for *query* and return a text summary.

    The returned string contains (when found on the page) a "Verified Answer"
    section taken from Google's answer box, followed by up to 10 search
    results as Title / Description / URL triples. Returns an empty string
    when the page cannot be retrieved or has no recognizable structure.

    Side effects: logs via U.pprint and dumps the fetched page to
    ``soup_dump.html`` in the current directory (debug aid).
    """
    U.pprint(f"{SIMULATE_BROWSER=}")
    finalResponse = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",  # Do Not Track request header
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    # Fix: URL-encode the query — raw interpolation breaks on spaces, '&', '#', etc.
    searchUrl = f"https://www.google.com/search?q={quote_plus(query)}"
    # Use a session to maintain cookies across the request.
    with requests.Session() as session:
        if SIMULATE_BROWSER:
            session.headers.update(headers)
        # Fix: bound the request — without a timeout a stalled socket hangs forever.
        response = session.get(searchUrl, timeout=15)
        if response.status_code != 200:
            print("Failed to retrieve search results.")
            return ""
        soup = BeautifulSoup(response.text, "html.parser")
        # Debug aid: dump the fetched page so selectors can be inspected offline.
        with open("soup_dump.html", "w", encoding="utf-8") as file:
            file.write(soup.prettify())
        mainDiv = soup.find("div", attrs={"id": "main"})
        if mainDiv is None:
            # Fix: previously an AttributeError when the page had no #main div.
            print("Failed to retrieve search results.")
            return ""
        answerText = ""
        # The "answer" selector only exists in browser-simulated mode.
        if SELECTORS.get("answer"):
            mainAnswerDiv = mainDiv.select_one(SELECTORS["answer"])
            if mainAnswerDiv:
                mainAnswer = mainAnswerDiv.text.strip()
                answerText = f"**{mainAnswer}**. "
        answerDescDiv = mainDiv.select_one(SELECTORS["answer_desc"])
        if answerDescDiv:
            citationDateDiv = answerDescDiv.select_one(SELECTORS["answer_citation"])
            citationDate = citationDateDiv.text if citationDateDiv else ""
            # Strip the citation date out of the description text before appending.
            answerText += answerDescDiv.text.replace(citationDate, "").strip()
            citationText = f"Citation Date: {citationDate}" if citationDate else ""
            finalResponse.append(f"Verified Answer:\n{answerText}\n{citationText}\n\n\n")
        results = mainDiv.select(SELECTORS["search_results"])
        resultsDesc = mainDiv.select(SELECTORS["search_results_desc"])
        if results:
            finalResponse.append("Search Results:\n")
            for i, result in enumerate(results[:10]):
                titleTag = result.find("h3")
                linkTag = result.find("a")
                # Fix: skip malformed result blocks instead of raising
                # AttributeError/KeyError on missing <h3> or <a href>.
                if titleTag is None or linkTag is None or not linkTag.get("href"):
                    continue
                title = titleTag.text
                link = linkTag["href"]
                if not SIMULATE_BROWSER:
                    # Non-browser results wrap the target as /url?q=<real-url>&...
                    parsedUrl = urlparse(link)
                    urlParams = parse_qs(parsedUrl.query)
                    link = urlParams.get("q", [None])[0]
                # Fix: descriptions can be fewer than results — avoid IndexError.
                desc = resultsDesc[i].text if i < len(resultsDesc) else ""
                finalResponse.append(f"Title: {title}")
                finalResponse.append(f"Description: {desc}")
                finalResponse.append(f"URL: {link}\n")
    return "\n".join(finalResponse)