# import requests # from bs4 import BeautifulSoup # def scrape_website(url: str) -> dict: # try: # # Send a GET request to the URL # response = requests.get(url) # response.raise_for_status() # Raise an exception for bad status codes # # Parse the HTML content # soup = BeautifulSoup(response.content, 'html.parser') # # Extract text content # texts = soup.stripped_strings # content = ' '.join(texts) # # Limit the content to 4000 characters # content = content[:8000] # # Return the result as a dictionary # return { # "source": url, # "content": content # } # except requests.RequestException as e: # # Handle any requests-related errors # return { # "source": url, # "content": f"Error scraping website: {str(e)}" # } # # Example usage: # # result = scrape_website("https://example.com") # # print(result) # import requests # from bs4 import BeautifulSoup # from urllib.parse import urljoin, urlparse # import time # import random # from requests.exceptions import RequestException # from fake_useragent import UserAgent # class AdvancedWebScraper: # def __init__(self, max_retries=3, backoff_factor=0.3, timeout=10): # self.max_retries = max_retries # self.backoff_factor = backoff_factor # self.timeout = timeout # self.session = requests.Session() # self.ua = UserAgent() # def get_random_user_agent(self): # return self.ua.random # def scrape_website(self, url: str) -> dict: # headers = {'User-Agent': self.get_random_user_agent()} # for attempt in range(self.max_retries): # try: # response = self.session.get(url, headers=headers, timeout=self.timeout) # response.raise_for_status() # soup = BeautifulSoup(response.content, 'html.parser') # # Remove script and style elements # for script in soup(["script", "style"]): # script.decompose() # # Get text content # text = soup.get_text(separator=' ', strip=True) # # Basic content cleaning # lines = (line.strip() for line in text.splitlines()) # chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) # text = ' '.join(chunk for chunk in chunks if chunk) # # Limit content length # content = text[:8000] # # Extract title # title = soup.title.string if soup.title else "No title found" # # Extract meta description # meta_desc = soup.find('meta', attrs={'name': 'description'}) # description = meta_desc['content'] if meta_desc else "No description found" # # Extract links # links = [urljoin(url, a.get('href')) for a in soup.find_all('a', href=True)] # return { # "source": url, # "title": title, # "description": description, # "content": content, # "Potentially useful links": links[:10] # Limit to first 10 links # } # except RequestException as e: # if attempt == self.max_retries - 1: # return { # "source": url, # "error": f"Failed to scrape website after {self.max_retries} attempts: {str(e)}" # } # else: # time.sleep(self.backoff_factor * (2 ** attempt)) # continue # Example usage: # scraper = AdvancedWebScraper() # result = scraper.scrape_website("https://example.com") # print(result) import os from termcolor import colored from langchain_community.document_loaders import AsyncChromiumLoader from langchain_community.document_transformers import BeautifulSoupTransformer from langchain_community.document_loaders import PyPDFLoader from langchain_core.messages import AIMessage from fake_useragent import UserAgent ua = UserAgent() os.environ["USER_AGENT"] = ua.random def scraper(url: str) -> dict: print(colored(f"\n\n RAG tool failed, starting basic scraping with URL: {url}\n\n", "green")) try: print(colored(f"\n\nStarting HTML scraper with URL: {url}\n\n", "green")) loader = AsyncChromiumLoader([url]) html = loader.load() # Transform bs_transformer = BeautifulSoupTransformer() docs_transformed = bs_transformer.transform_documents(html, tags_to_extract=["p"]) print({"source":url, "content": AIMessage(docs_transformed[0].page_content)}) return {"source":url, "content": AIMessage(docs_transformed[0].page_content)} except Exception as e: try: print(colored(f"\n\nStarting PDF scraper with URL: {url}\n\n", "green")) loader = PyPDFLoader(url) pages = loader.load_and_split() # print({"source":url, "content":AIMessage(pages)}) return {"source":url, "content":AIMessage(pages)} except Exception as e: return {"source": url, "content": AIMessage("Unsupported document type, supported types are 'html' and 'pdf'.")} if __name__ == "__main__": scraper("https://python.langchain.com/v0.1/docs/modules/data_connection/document_loaders/pdf/")