"""Scrape a Google search results page into a plain-text digest.

Two scraping modes are supported, chosen via the SIMULATE_BROWSER_SEARCH
environment variable: with browser-like headers Google serves modern markup
(one set of CSS selectors); without them it serves the legacy markup
(a different selector set).
"""
import os
from typing import TypedDict
from urllib.parse import parse_qs, quote_plus, urlparse

import requests
from bs4 import BeautifulSoup

import utils as U

# When true, send full browser-like headers; Google then returns the modern
# markup, which requires a different set of CSS selectors (see below).
SIMULATE_BROWSER = os.environ.get("SIMULATE_BROWSER_SEARCH") == "true"


class SelectorsDict(TypedDict, total=False):
    """CSS selectors used to pull pieces out of the results page.

    total=False because the legacy (non-browser) selector set has no
    "answer" entry — callers probe it with ``SELECTORS.get("answer")``.
    """

    answer: str               # featured answer-box headline
    answer_desc: str          # answer-box description text
    answer_citation: str      # citation/date element inside the description
    search_results: str       # one organic result container
    search_results_desc: str  # description text of an organic result


SELECTORS: SelectorsDict
if SIMULATE_BROWSER:
    SELECTORS = {
        "answer": ".IZ6rdc",
        "answer_desc": ".LGOjhe",
        "answer_citation": ".kX21rb.ZYHQ7e",
        "search_results": ".Ww4FFb",
        "search_results_desc": ".VwiC3b.yXK7lf",
    }
else:
    SELECTORS = {
        "answer_desc": "div.PqksIc",
        "answer_citation": "sub.gMUaMb.r0bn4c.rQMQod",
        "search_results": "div.egMi0.kCrYT",
        "search_results_desc": "div.BNeawe.s3v9rd.AP7Wnd .BNeawe.s3v9rd.AP7Wnd:last-child",
    }


def scrapeGoogleSearch(query):
    """Fetch a Google search page for *query* and return a text digest.

    Extracts (a) Google's featured "answer box" when one is present and
    (b) up to ten organic results as Title / Description / URL triples.

    Args:
        query: Free-text search terms; URL-encoded before the request.

    Returns:
        A newline-joined string with an optional "Verified Answer:" section
        followed by a "Search Results:" section. Empty string when the page
        could not be retrieved (non-200 response).
    """
    U.pprint(f"{SIMULATE_BROWSER=}")
    finalResponse = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",  # Do Not Track Request Header
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    # quote_plus: raw queries containing spaces/&/# would otherwise corrupt
    # the request URL.
    searchUrl = f"https://www.google.com/search?q={quote_plus(query)}"

    # Use a session to maintain cookies (Google sets consent cookies on
    # redirect).
    with requests.Session() as session:
        if SIMULATE_BROWSER:
            session.headers.update(headers)
        # timeout: never hang the caller on a stalled connection.
        response = session.get(searchUrl, timeout=15)
        if response.status_code != 200:
            print("Failed to retrieve search results.")
            return "\n".join(finalResponse)

        soup = BeautifulSoup(response.text, "html.parser")
        # Debug aid: dump the prettified page so selectors can be inspected
        # when Google changes its markup.
        with open("soup_dump.html", "w", encoding="utf-8") as file:
            file.write(soup.prettify())

        mainDiv = soup.find("div", attrs={"id": "main"})
        if mainDiv is None:
            # Layout changed (or a consent page was served); search the whole
            # document instead of crashing on a None lookup.
            mainDiv = soup

        # --- Featured answer box (only the browser selector set has one) ---
        answerText = ""
        if SELECTORS.get("answer"):
            mainAnswerDiv = mainDiv.select_one(SELECTORS["answer"])
            if mainAnswerDiv:
                mainAnswer = mainAnswerDiv.text.strip()
                answerText = f"**{mainAnswer}**. \n"
        answerDescDiv = mainDiv.select_one(SELECTORS["answer_desc"])
        if answerDescDiv:
            citationDateDiv = answerDescDiv.select_one(SELECTORS["answer_citation"])
            citationDate = citationDateDiv.text if citationDateDiv else ""
            # Strip the citation date out of the description body; it is
            # re-emitted on its own line below.
            answerText += answerDescDiv.text.replace(citationDate, "").strip()
            citationText = f"Citation Date: {citationDate}" if citationDate else ""
            finalResponse.append(f"Verified Answer:\n{answerText}\n{citationText}\n\n\n")

        # --- Organic results ---
        results = mainDiv.select(SELECTORS["search_results"])
        resultsDesc = mainDiv.select(SELECTORS["search_results_desc"])
        if results:
            finalResponse.append("Search Results:\n")
            for i, result in enumerate(results[:10]):
                titleTag = result.find("h3")
                anchorTag = result.find("a")
                if titleTag is None or anchorTag is None:
                    # Ads/widgets can match the result selector without
                    # carrying a title or link; skip them.
                    continue
                title = titleTag.text
                link = anchorTag.get("href")
                if not SIMULATE_BROWSER and link:
                    # Legacy markup wraps targets as /url?q=<real-url>&...;
                    # unwrap to the real destination.
                    parsedUrl = urlparse(link)
                    urlParams = parse_qs(parsedUrl.query)
                    link = urlParams.get("q", [None])[0]
                # Descriptions are selected independently of results, so the
                # two lists can drift out of sync — guard the index.
                desc = resultsDesc[i].text if i < len(resultsDesc) else ""
                finalResponse.append(f"Title: {title}")
                finalResponse.append(f"Description: {desc}")
                finalResponse.append(f"URL: {link}\n")
    return "\n".join(finalResponse)