# import requests
# from bs4 import BeautifulSoup
#
# def scrape_website(url: str) -> dict:
#     try:
#         # Send a GET request to the URL
#         response = requests.get(url)
#         response.raise_for_status()  # Raise an exception for bad status codes
#         # Parse the HTML content
#         soup = BeautifulSoup(response.content, 'html.parser')
#         # Extract text content
#         texts = soup.stripped_strings
#         content = ' '.join(texts)
#         # Limit the content to 8000 characters
#         content = content[:8000]
#         # Return the result as a dictionary
#         return {
#             "source": url,
#             "content": content
#         }
#     except requests.RequestException as e:
#         # Handle any requests-related errors
#         return {
#             "source": url,
#             "content": f"Error scraping website: {str(e)}"
#         }
#
# # Example usage:
# # result = scrape_website("https://example.com")
# # print(result)
# import requests
# from bs4 import BeautifulSoup
# from urllib.parse import urljoin, urlparse
# import time
# import random
# from requests.exceptions import RequestException
# from fake_useragent import UserAgent
#
# class AdvancedWebScraper:
#     def __init__(self, max_retries=3, backoff_factor=0.3, timeout=10):
#         self.max_retries = max_retries
#         self.backoff_factor = backoff_factor
#         self.timeout = timeout
#         self.session = requests.Session()
#         self.ua = UserAgent()
#
#     def get_random_user_agent(self):
#         return self.ua.random
#
#     def scrape_website(self, url: str) -> dict:
#         headers = {'User-Agent': self.get_random_user_agent()}
#         for attempt in range(self.max_retries):
#             try:
#                 response = self.session.get(url, headers=headers, timeout=self.timeout)
#                 response.raise_for_status()
#                 soup = BeautifulSoup(response.content, 'html.parser')
#                 # Remove script and style elements
#                 for script in soup(["script", "style"]):
#                     script.decompose()
#                 # Get text content
#                 text = soup.get_text(separator=' ', strip=True)
#                 # Basic content cleaning
#                 lines = (line.strip() for line in text.splitlines())
#                 chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
#                 text = ' '.join(chunk for chunk in chunks if chunk)
#                 # Limit content length
#                 content = text[:8000]
#                 # Extract title
#                 title = soup.title.string if soup.title else "No title found"
#                 # Extract meta description
#                 meta_desc = soup.find('meta', attrs={'name': 'description'})
#                 description = meta_desc.get('content', "No description found") if meta_desc else "No description found"
#                 # Extract links
#                 links = [urljoin(url, a.get('href')) for a in soup.find_all('a', href=True)]
#                 return {
#                     "source": url,
#                     "title": title,
#                     "description": description,
#                     "content": content,
#                     "Potentially useful links": links[:10]  # Limit to first 10 links
#                 }
#             except RequestException as e:
#                 if attempt == self.max_retries - 1:
#                     return {
#                         "source": url,
#                         "error": f"Failed to scrape website after {self.max_retries} attempts: {str(e)}"
#                     }
#                 else:
#                     # Exponential backoff before retrying
#                     time.sleep(self.backoff_factor * (2 ** attempt))
#                     continue
#
# # Example usage:
# # scraper = AdvancedWebScraper()
# # result = scraper.scrape_website("https://example.com")
# # print(result)
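
# ---------------------------------------------------------------------------
# Active implementation below: try a headless-Chromium HTML scrape first, then
# fall back to PDF parsing. The commented-out requests/BeautifulSoup versions
# above are earlier prototypes kept for reference.
# ---------------------------------------------------------------------------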
import os
from termcolor import colored
from langchain_community.document_loaders import AsyncChromiumLoader, PyPDFLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_core.messages import AIMessage
from fake_useragent import UserAgent

ua = UserAgent()
os.environ["USER_AGENT"] = ua.random


def scraper(url: str) -> dict:
    print(colored(f"\n\nRAG tool failed, starting basic scraping with URL: {url}\n\n", "green"))
    try:
        # First attempt: render the page in headless Chromium and keep only <p> text
        print(colored(f"\n\nStarting HTML scraper with URL: {url}\n\n", "green"))
        loader = AsyncChromiumLoader([url])
        html = loader.load()
        # Transform the raw HTML into plain paragraph text
        bs_transformer = BeautifulSoupTransformer()
        docs_transformed = bs_transformer.transform_documents(html, tags_to_extract=["p"])
        print({"source": url, "content": AIMessage(docs_transformed[0].page_content)})
        return {"source": url, "content": AIMessage(docs_transformed[0].page_content)}
    except Exception:
        try:
            # Fallback: treat the URL as a PDF and extract its pages
            print(colored(f"\n\nStarting PDF scraper with URL: {url}\n\n", "green"))
            loader = PyPDFLoader(url)
            pages = loader.load_and_split()
            # AIMessage expects string content, so join the page texts
            return {"source": url, "content": AIMessage("\n".join(page.page_content for page in pages))}
        except Exception:
            return {"source": url, "content": AIMessage("Unsupported document type, supported types are 'html' and 'pdf'.")}


if __name__ == "__main__":
    scraper("https://python.langchain.com/v0.1/docs/modules/data_connection/document_loaders/pdf/")