import os
from typing import TypedDict
from urllib.parse import parse_qs, quote_plus, urlparse

import requests
from bs4 import BeautifulSoup

# When true, send browser-like headers so Google serves its full desktop markup.
SIMULATE_BROWSER = os.environ.get("SIMULATE_BROWSER_SEARCH") == "true"


class SelectorsDict(TypedDict, total=False):
    # total=False because "answer" is only present in browser-simulation mode.
    answer: str
    answer_desc: str
    answer_citation: str
    search_results: str
    search_results_desc: str


SELECTORS: SelectorsDict

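# The selector values below are Google's obfuscated class names, captured from
# live result pages; they are brittle and may break whenever Google updates its
# markup. The non-browser set omits "answer" because the lightweight markup
# served to non-browser clients has no featured-answer box.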
if SIMULATE_BROWSER:
    SELECTORS = {
        "answer": ".IZ6rdc",
        "answer_desc": ".LGOjhe",
        "answer_citation": ".kX21rb.ZYHQ7e",
        "search_results": ".Ww4FFb",
        "search_results_desc": ".VwiC3b.yXK7lf",
    }
else:
    SELECTORS = {
        "answer_desc": "div.PqksIc",
        "answer_citation": "sub.gMUaMb.r0bn4c.rQMQod",
        "search_results": "div.egMi0.kCrYT",
        "search_results_desc": "div.BNeawe.s3v9rd.AP7Wnd .BNeawe.s3v9rd.AP7Wnd:last-child",
    }


def scrapeGoogleSearch(query):
    """Scrape a Google results page for `query` and return a plain-text summary."""
    finalResponse = []

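    # Desktop-Chrome-like headers; only applied when SIMULATE_BROWSER is true.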
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",  # Do Not Track Request Header
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }

    searchUrl = f"https://www.google.com/search?q={query}"

    # Use a session to maintain cookies
    with requests.Session() as session:
        if SIMULATE_BROWSER:
            session.headers.update(headers)
        response = session.get(searchUrl, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
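            # Dump the prettified page so failing selectors can be debugged offline.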
            with open("soup_dump.html", "w", encoding="utf-8") as file:
                file.write(soup.prettify())

            mainDiv = soup.find("div", attrs={"id": "main"})
            if mainDiv is None:
                print("Failed to locate the main results container.")
                return ""
            answerText = ""
            if SELECTORS.get("answer"):
                mainAnswerDiv = mainDiv.select_one(SELECTORS["answer"])
                if mainAnswerDiv:
                    mainAnswer = mainAnswerDiv.text.strip()
                    answerText = f"**{mainAnswer}**. "

            answerDescDiv = mainDiv.select_one(SELECTORS["answer_desc"])
            if answerDescDiv:
                citationDateDiv = answerDescDiv.select_one(SELECTORS["answer_citation"])
                citationDate = citationDateDiv.text if citationDateDiv else ""
                answerText += answerDescDiv.text.replace(citationDate, "").strip()
                citationText = f"Citation Date: {citationDate}" if citationDate else ""
                finalResponse.append(f"Verified Answer:\n{answerText}\n{citationText}\n\n\n")

            results = mainDiv.select(SELECTORS["search_results"])
            resultsDesc = mainDiv.select(SELECTORS["search_results_desc"])
            # Ensure resultsDesc has the same length as results
            resultsDesc += [None] * (len(results) - len(resultsDesc))

            if results:
                finalResponse.append("Search Results:\n")

            for (i, result) in enumerate(results[:10]):
                title = result.find("h3").text
                link = result.find("a")["href"]
                if not SIMULATE_BROWSER:
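                    # Non-browser markup wraps each result link as /url?q=<target>&...;
                    # pull the real destination out of the "q" query parameter.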
                    parsedUrl = urlparse(link)
                    urlParams = parse_qs(parsedUrl.query)
                    link = urlParams.get("q", [None])[0]
                desc = resultsDesc[i].text if resultsDesc[i] else ""
                finalResponse.append(f"Title: {title}")
                finalResponse.append(f"Description: {desc}")
                finalResponse.append(f"URL: {link}\n")
        else:
            print(f"Failed to retrieve search results (HTTP {response.status_code}).")

    return "\n".join(finalResponse)